jazzparser.parsers.pcfg.chart

1 """Chart representation for the PCFG parser. 2 3 Since the PCFG parser is just a CKY parser with probabilities added 4 in, the chart implementation merely extends the CKY chart and overrides 5 the crucial step of applying rules so that it can do its probabilistic 6 stuff. 7 8 Note that all probabilities are log probabilities. 9 10 """ 11 """ 12 ============================== License ======================================== 13 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 14 15 This file is part of The Jazz Parser. 16 17 The Jazz Parser is free software: you can redistribute it and/or modify 18 it under the terms of the GNU General Public License as published by 19 the Free Software Foundation, either version 3 of the License, or 20 (at your option) any later version. 21 22 The Jazz Parser is distributed in the hope that it will be useful, 23 but WITHOUT ANY WARRANTY; without even the implied warranty of 24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 GNU General Public License for more details. 26 27 You should have received a copy of the GNU General Public License 28 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 29 30 ============================ End license ====================================== 31 32 """ 33 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 34 35 from jazzparser.parsers.cky.chart import Chart, SignHashSet 36 from jazzparser.data import DerivationTrace 37 from jazzparser.utils.strings import fmt_prob 38 from jazzparser.utils.nltk.probability import logprob 39 40 from nltk.probability import add_logs, _NINF 41 42 from jazzparser import settings 43 import logging 44 45 # Get the logger from the logging system 46 logger = logging.getLogger("main_logger") 47

48 -class ProbabilisticSignHashSet(SignHashSet):

49 """ 50 For this chart's internal data structure, we use a modified 51 implementation of the HashSet which adds handling of probabilities. 52 53 All probabilities are logged, 54 55 """

56 - def __init__(self, *args, **kwargs):

57 self.threshold = kwargs.pop('threshold', settings.PCFG_PARSER.DEFAULT_THRESHOLD) 58 self.maxsize = kwargs.pop('maxsize', settings.PCFG_PARSER.DEFAULT_MAX_ARC_SIZE) 59 super(ProbabilisticSignHashSet, self).__init__(*args, **kwargs) 60 self._beamed = False

61

62 - def _add_existing_value(self, existing_value, new_value):

63 # Sum the probabilities of the two signs 64 existing_value.probability = add_logs(new_value.probability, existing_value.probability) 65 # Do the same for the inside probs 66 existing_value.inside_probability = add_logs( 67 new_value.inside_probability, 68 existing_value.inside_probability) 69 # Continue to do whatever the formalism wants with the signs 70 super(ProbabilisticSignHashSet, self)._add_existing_value(existing_value, new_value)

71

72 - def _max_probability(self):

73 return max([val.probability for val in self.values()]+[_NINF])

74

75 - def append(self, *args, **kwargs):

76 """ 77 There's no point in applying a beam if nothing's been added to 78 the set and it's already been applied. Note that something's 79 been added. 80 81 """ 82 self._beamed = False 83 return super(ProbabilisticSignHashSet, self).append(*args, **kwargs)

84

85 - def remove(self, *args, **kwargs):

86 self._beamed = False 87 super(ProbabilisticSignHashSet, self).remove(*args, **kwargs)

88

89 - def _apply_beam(self):

90 """ 91 Applies a beam, using the already given threshold, to the set, 92 pruning out any signs with a probability lower than the 93 given ratio of the most probable sign. 94 """ 95 if not self._beamed: 96 max = self._max_probability() 97 cutoff = max + logprob(self.threshold) 98 to_remove = [sign for sign in self.values() if sign.probability < cutoff] 99 for sign in to_remove: 100 self.remove(sign) 101 logger.debug("Beam removed %d signs (max %s, min %s)" % \ 102 (len(to_remove),max, cutoff)) 103 # Beam is now applied: check the remaining size 104 if self.maxsize != 0: 105 if len(self) > self.maxsize: 106 logger.debug("Hard beam removed %d signs" % (self.maxsize-len(self))) 107 # Too many signs: apply a hard cutoff 108 ordered = list(sorted(self.values(), key=lambda s:s.probability)) 109 for sign in ordered[self.maxsize:]: 110 self.remove(sign) 111 # Don't apply the beam again until something changes 112 self._beamed = True

113

114 - def ranked(self):

115 """ 116 Returns the signs in the set ranked by probability (highest 117 first). 118 119 """ 120 return list(reversed(sorted(self.values(), key=lambda s:s.probability)))

121

122 -class PcfgChart(Chart):

123 """ 124 Overrides the CKY chart to add probabilistic stuff. 125 126 Signs in the input should have an attribute 'probability'. 127 The results of rule application will also have such an attribute. 128 129 """ 130 HASH_SET_IMPL = ProbabilisticSignHashSet 131

132 - def __init__(self, *args, **kwargs):

133 self.model = kwargs.pop('model', None) 134 if self.model is None: 135 raise ValueError, "PcfgChart must be instantiated with a "\ 136 "model it can use to assign probabilities." 137 # The threshold probability below which we throw signs away 138 kwargs['hash_set_kwargs'] = { 139 'threshold' : kwargs.pop('threshold', None), 140 'maxsize' : kwargs.pop('maxarc', None), 141 } 142 super(PcfgChart, self).__init__(*args, **kwargs) 143 # For convenience 144 self.catrep = self.grammar.formalism.PcfgParser.category_representation

145

146 - def _get_ranked_parses(self):

147 """ 148 Full parses ranked by probability. 149 Returns a list. 150 151 """ 152 return list(reversed(sorted(self.parses, key=lambda s:s.probability)))

153 ranked_parses = property(_get_ranked_parses) 154

155 - def apply_unary_rule(self, rule, start, end, beam=True):

156 # Apply the rule using the super method 157 def _get_res_mod(model): 158 # Closure to get model and catrep 159 def _res_mod(result, sign): 160 # Function to add the probability to each result from the input 161 # Use the model to get the probabilities 162 inside_prob = logprob(model.inside_probability( 163 'unary', result, sign)) + \ 164 sign.inside_probability 165 outside_prob = logprob(model.outside_probability(result)) 166 result.inside_probability = inside_prob 167 result.probability = outside_prob + inside_probability

168 _result_modifier = _get_res_mod(self.model) 169 170 # Now just use the superclass' method with this to modify to results 171 signs_added = super(PcfgChart, self).apply_unary_rule(rule, start, end, result_modifier=_result_modifier) 172 173 if beam and signs_added: 174 # Apply a beam to the arc 175 self.apply_beam((start,end)) 176 return signs_added 177

178 - def _binary_expansion_probability(self, sign_pair, result):

179 """ 180 Used by L{_apply_binary_rule} and L{_apply_binary_rule_semantics} to 181 compute the expansion probabilitiy. 182 183 This is a separate function because both of the above do the same 184 to compute the probabilities, so I don't want to repeat the code. 185 186 Returns a tuple of the probability and the inside probability. 187 188 """ 189 parent = result 190 left, right = sign_pair 191 expansion = 'right' 192 # Get the probabilities from the model 193 subtree_prob = logprob(self.model.inside_probability( 194 expansion, parent, left, right)) 195 outside_prob = logprob(self.model.outside_probability(parent)) 196 # Multiply in the daughters' inside probs to get the inside prob 197 inside_prob = subtree_prob + left.inside_probability + \ 198 right.inside_probability 199 return (inside_prob+outside_prob, inside_prob)

200

201 - def _apply_binary_rule(self, rule, sign_pair):

202 # Call the superclass method to do the application 203 results = super(PcfgChart, self)._apply_binary_rule(rule, sign_pair) 204 205 # Add probabilities to the results 206 for result in results: 207 result.probability, result.inside_probability = \ 208 self._binary_expansion_probability(sign_pair, result) 209 return results

210

211 - def _apply_binary_rule_semantics(self, rule, sign_pair, category):

212 # Call the superclass method to do the application 213 results = super(PcfgChart, self)._apply_binary_rule_semantics(rule, sign_pair, category) 214 215 # Add probabilities to the results 216 for result in results: 217 result.probability, result.inside_probability = \ 218 self._binary_expansion_probability(sign_pair, result) 219 return results

220

221 - def apply_binary_rules(self, start, middle, end, beam=True):

222 # Call the super method to apply the rules 223 signs_added = super(PcfgChart, self).apply_binary_rules(start, middle, end) 224 225 if beam and signs_added: 226 # Apply a beam to the results 227 self.apply_beam((start, end)) 228 return signs_added

229

230 - def apply_binary_rule(self, rule, start, middle, end, beam=True):

231 # Call the super method to apply the rule 232 signs_added = super(PcfgChart, self).apply_binary_rule(rule, start, middle, end) 233 234 if beam and signs_added: 235 # Apply the beam to the arc that might have got results 236 self.apply_beam((start, end)) 237 return signs_added

238

239 - def apply_beam(self, arc=None):

240 """ 241 Applies a beam to every arc in the chart. If arc is given, it 242 should be a tuple of (start,end): applies a beam only to the 243 arc starting at start and ending at end. 244 245 """ 246 if arc is not None: 247 start,end = arc 248 # Never beam the longest span: there's no point, as it won't be 249 # used in any other productions 250 if not (start == 0 and end == self.size): 251 # Apply to a specific arc 252 logger.debug("Beaming (%s,%s)" % arc) 253 self._table[start][end-start-1]._apply_beam() 254 else: 255 # Apply to whole chart 256 for i,ends in enumerate(self._table): 257 for j,arcs in enumerate(ends): 258 # Exclude the longest span 259 if not (i==0 and i+j+1==self.size): 260 arcs._apply_beam()

261

262 - def _sign_string(self, sign):

263 return "%s (%s)" % (sign, fmt_prob(2**sign.probability))

264

265 - def launch_inspector(self, input=None, block=False):

266 # Inherit docs from Chart 267 from .inspector import PcfgChartInspectorThread 268 inspector = PcfgChartInspectorThread(self, input_strs=input) 269 self.inspector = inspector 270 if block: 271 inspector.run() 272 else: 273 inspector.start()

274

Source Code for Module jazzparser.parsers.pcfg.chart