| Trees | Indices | Help |
|
|---|
|
|
1 """Chart representation for the PCFG parser.
2
3 Since the PCFG parser is just a CKY parser with probabilities added
4 in, the chart implementation merely extends the CKY chart and overrides
5 the crucial step of applying rules so that it can do its probabilistic
6 stuff.
7
8 Note that all probabilities are log probabilities.
9
10 """
11 """
12 ============================== License ========================================
13 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
14
15 This file is part of The Jazz Parser.
16
17 The Jazz Parser is free software: you can redistribute it and/or modify
18 it under the terms of the GNU General Public License as published by
19 the Free Software Foundation, either version 3 of the License, or
20 (at your option) any later version.
21
22 The Jazz Parser is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 GNU General Public License for more details.
26
27 You should have received a copy of the GNU General Public License
28 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
29
30 ============================ End license ======================================
31
32 """
33 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
34
35 from jazzparser.parsers.cky.chart import Chart, SignHashSet
36 from jazzparser.data import DerivationTrace
37 from jazzparser.utils.strings import fmt_prob
38 from jazzparser.utils.nltk.probability import logprob
39
40 from nltk.probability import add_logs, _NINF
41
42 from jazzparser import settings
43 import logging
44
45 # Get the logger from the logging system
46 logger = logging.getLogger("main_logger")
47
49 """
50 For this chart's internal data structure, we use a modified
51 implementation of the HashSet which adds handling of probabilities.
52
53 All probabilities are logged,
54
55 """
57 self.threshold = kwargs.pop('threshold', settings.PCFG_PARSER.DEFAULT_THRESHOLD)
58 self.maxsize = kwargs.pop('maxsize', settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE)
59 super(ProbabilisticSignHashSet, self).__init__(*args, **kwargs)
60 self._beamed = False
61
63 # Sum the probabilities of the two signs
64 existing_value.probability = add_logs(new_value.probability, existing_value.probability)
65 # Do the same for the inside probs
66 existing_value.inside_probability = add_logs(
67 new_value.inside_probability,
68 existing_value.inside_probability)
69 # Continue to do whatever the formalism wants with the signs
70 super(ProbabilisticSignHashSet, self)._add_existing_value(existing_value, new_value)
71
74
76 """
77 There's no point in applying a beam if nothing's been added to
78 the set and it's already been applied. Note that something's
79 been added.
80
81 """
82 self._beamed = False
83 return super(ProbabilisticSignHashSet, self).append(*args, **kwargs)
84
88
90 """
91 Applies a beam, using the already given threshold, to the set,
92 pruning out any signs with a probability lower than the
93 given ratio of the most probable sign.
94 """
95 if not self._beamed:
96 max = self._max_probability()
97 cutoff = max + logprob(self.threshold)
98 to_remove = [sign for sign in self.values() if sign.probability < cutoff]
99 for sign in to_remove:
100 self.remove(sign)
101 logger.debug("Beam removed %d signs (max %s, min %s)" % \
102 (len(to_remove),max, cutoff))
103 # Beam is now applied: check the remaining size
104 if self.maxsize != 0:
105 if len(self) > self.maxsize:
106 logger.debug("Hard beam removed %d signs" % (self.maxsize-len(self)))
107 # Too many signs: apply a hard cutoff
108 ordered = list(sorted(self.values(), key=lambda s:s.probability))
109 for sign in ordered[self.maxsize:]:
110 self.remove(sign)
111 # Don't apply the beam again until something changes
112 self._beamed = True
113
115 """
116 Returns the signs in the set ranked by probability (highest
117 first).
118
119 """
120 return list(reversed(sorted(self.values(), key=lambda s:s.probability)))
121
123 """
124 Overrides the CKY chart to add probabilistic stuff.
125
126 Signs in the input should have an attribute 'probability'.
127 The results of rule application will also have such an attribute.
128
129 """
130 HASH_SET_IMPL = ProbabilisticSignHashSet
131
133 self.model = kwargs.pop('model', None)
134 if self.model is None:
135 raise ValueError, "PcfgChart must be instantiated with a "\
136 "model it can use to assign probabilities."
137 # The threshold probability below which we throw signs away
138 kwargs['hash_set_kwargs'] = {
139 'threshold' : kwargs.pop('threshold', None),
140 'maxsize' : kwargs.pop('maxarc', None),
141 }
142 super(PcfgChart, self).__init__(*args, **kwargs)
143 # For convenience
144 self.catrep = self.grammar.formalism.PcfgParser.category_representation
145
147 """
148 Full parses ranked by probability.
149 Returns a list.
150
151 """
152 return list(reversed(sorted(self.parses, key=lambda s:s.probability)))
153 ranked_parses = property(_get_ranked_parses)
154
156 # Apply the rule using the super method
157 def _get_res_mod(model):
158 # Closure to get model and catrep
159 def _res_mod(result, sign):
160 # Function to add the probability to each result from the input
161 # Use the model to get the probabilities
162 inside_prob = logprob(model.inside_probability(
163 'unary', result, sign)) + \
164 sign.inside_probability
165 outside_prob = logprob(model.outside_probability(result))
166 result.inside_probability = inside_prob
167 result.probability = outside_prob + inside_probability
168 _result_modifier = _get_res_mod(self.model)
169
170 # Now just use the superclass' method with this to modify to results
171 signs_added = super(PcfgChart, self).apply_unary_rule(rule, start, end, result_modifier=_result_modifier)
172
173 if beam and signs_added:
174 # Apply a beam to the arc
175 self.apply_beam((start,end))
176 return signs_added
177
179 """
180 Used by L{_apply_binary_rule} and L{_apply_binary_rule_semantics} to
181 compute the expansion probabilitiy.
182
183 This is a separate function because both of the above do the same
184 to compute the probabilities, so I don't want to repeat the code.
185
186 Returns a tuple of the probability and the inside probability.
187
188 """
189 parent = result
190 left, right = sign_pair
191 expansion = 'right'
192 # Get the probabilities from the model
193 subtree_prob = logprob(self.model.inside_probability(
194 expansion, parent, left, right))
195 outside_prob = logprob(self.model.outside_probability(parent))
196 # Multiply in the daughters' inside probs to get the inside prob
197 inside_prob = subtree_prob + left.inside_probability + \
198 right.inside_probability
199 return (inside_prob+outside_prob, inside_prob)
200
202 # Call the superclass method to do the application
203 results = super(PcfgChart, self)._apply_binary_rule(rule, sign_pair)
204
205 # Add probabilities to the results
206 for result in results:
207 result.probability, result.inside_probability = \
208 self._binary_expansion_probability(sign_pair, result)
209 return results
210
212 # Call the superclass method to do the application
213 results = super(PcfgChart, self)._apply_binary_rule_semantics(rule, sign_pair, category)
214
215 # Add probabilities to the results
216 for result in results:
217 result.probability, result.inside_probability = \
218 self._binary_expansion_probability(sign_pair, result)
219 return results
220
222 # Call the super method to apply the rules
223 signs_added = super(PcfgChart, self).apply_binary_rules(start, middle, end)
224
225 if beam and signs_added:
226 # Apply a beam to the results
227 self.apply_beam((start, end))
228 return signs_added
229
231 # Call the super method to apply the rule
232 signs_added = super(PcfgChart, self).apply_binary_rule(rule, start, middle, end)
233
234 if beam and signs_added:
235 # Apply the beam to the arc that might have got results
236 self.apply_beam((start, end))
237 return signs_added
238
240 """
241 Applies a beam to every arc in the chart. If arc is given, it
242 should be a tuple of (start,end): applies a beam only to the
243 arc starting at start and ending at end.
244
245 """
246 if arc is not None:
247 start,end = arc
248 # Never beam the longest span: there's no point, as it won't be
249 # used in any other productions
250 if not (start == 0 and end == self.size):
251 # Apply to a specific arc
252 logger.debug("Beaming (%s,%s)" % arc)
253 self._table[start][end-start-1]._apply_beam()
254 else:
255 # Apply to whole chart
256 for i,ends in enumerate(self._table):
257 for j,arcs in enumerate(ends):
258 # Exclude the longest span
259 if not (i==0 and i+j+1==self.size):
260 arcs._apply_beam()
261
264
266 # Inherit docs from Chart
267 from .inspector import PcfgChartInspectorThread
268 inspector = PcfgChartInspectorThread(self, input_strs=input)
269 self.inspector = inspector
270 if block:
271 inspector.run()
272 else:
273 inspector.start()
274
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Nov 26 16:05:03 2012 | http://epydoc.sourceforge.net |