Package jazzparser :: Package parsers :: Package cky :: Module parser
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.parsers.cky.parser

  1  """CKY parser implementation. 
  2   
  3  This is the basic parser component of the jazz chord interpretation system. 
  4  This does all of the basic CKY parsing routine, but is unaware of the  
  5  formalism, rule set, etc. 
  6   
  7  """ 
  8  """ 
  9  ============================== License ======================================== 
 10   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 11    
 12   This file is part of The Jazz Parser. 
 13    
 14   The Jazz Parser is free software: you can redistribute it and/or modify 
 15   it under the terms of the GNU General Public License as published by 
 16   the Free Software Foundation, either version 3 of the License, or 
 17   (at your option) any later version. 
 18    
 19   The Jazz Parser is distributed in the hope that it will be useful, 
 20   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 22   GNU General Public License for more details. 
 23    
 24   You should have received a copy of the GNU General Public License 
 25   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 26   
 27  ============================ End license ====================================== 
 28   
 29  """ 
 30  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"  
 31   
 32  from jazzparser.grammar import Grammar 
 33  from chart import Chart 
 34  from jazzparser.utils.base import filter_latex, ExecutionTimer 
 35  from jazzparser.utils.options import ModuleOption, new_file_option 
 36  from jazzparser.utils.strings import str_to_bool 
 37  from jazzparser.parsers.base.parser import Parser 
 38  from .tools import ChartTool, InteractiveChartTool 
 39   
 40  import sys, re 
 41   
 42  from jazzparser import settings 
 43   
44 -class CkyParser(Parser):
45 """ 46 CkyParser is the central class for the jazz chord sequence 47 recogniser parsing mechanism. 48 It constitutes the "algorithm" module of the system. 49 It begins with a set of signs assigned to the input by the 50 tagger and parses to produce a chart, from which the resultant 51 signs can be extracted. 52 53 """ 54 shell_tools = [ 55 ChartTool(), 56 InteractiveChartTool(), 57 ] 58 PARSER_OPTIONS = Parser.PARSER_OPTIONS + [ 59 ModuleOption('max_iter', filter=int, 60 help_text="Maximum number of parser iterations to perform "\ 61 "before giving up. If 0 or unspecified, continues "\ 62 "until parse is complete.", 63 usage="max_iter=X, where X is an integer.", 64 default=0, 65 ), 66 ModuleOption('min_iter', filter=int, 67 help_text="Usually, the parser will stop as soon as it finds a "\ 68 "full parse. Use min_iter to make it continue parsing until "\ 69 "it has done min_iter iterations or the tagger has ceased to "\ 70 "return any categories. Use -1 to keep going until the tagger "\ 71 "gives no more categories.", 72 usage="min_iter=X, where X is an integer.", 73 default=0, 74 ), 75 ModuleOption('parses', filter=int, 76 help_text="Number of parses to require before we terminate. "\ 77 "Default is 1: the parser will terminate as soon as it finds "\ 78 "at least one full parse (unless another option, like "\ 79 "min_iter, prevents it", 80 usage="parses=X, where X is an integer", 81 default=1, 82 ), 83 ModuleOption('timeout', filter=int, 84 help_text="Maximum time allowed for the main parse loop, in "\ 85 "minutes. If this is exceded, the backoff will kick "\ 86 "in, if one is specified. Otherwise, no results will be "\ 87 "returned. The parser will not stop as soon as the timeout "\ 88 "expires, but after finishing processing the current input "\ 89 "word. 0 (default) imposes no timeout.", 90 usage="timeout=X, where X is an integer number of seconds.", 91 default=0, 92 ), 93 ModuleOption('inspect', filter=str_to_bool, 94 help_text="If true, the graphical chart inspector will be "\ 95 "displayed during parsing.", 96 usage="inspect=X, where X is a boolean value.", 97 default=False 98 ), 99 ModuleOption('inspect_persist', filter=str_to_bool, 100 help_text="Makes the chart inspector window persist after parsing "\ 101 "is completed. By default, it will be killed", 102 usage="inspect_persist=X, where X is a boolean value.", 103 default=False 104 ), 105 ModuleOption('dump_chart', filter=new_file_option, 106 help_text="A file to dump the chart state to during parsing. "\ 107 "The first dump will be when the chart is created and "\ 108 "new dumps will be made throughout the parse.", 109 usage="dump_chart=X, where X is a filename." 110 ), 111 ModuleOption('derivations', filter=str_to_bool, 112 help_text="Store derivation traces along with the results", 113 usage="derivations=X, where X is a boolean value", 114 default=None, 115 ), 116 ] 117
118 - def _create_chart(self, *args, **kwargs):
119 self.chart = Chart(self.grammar, *args, **kwargs) 120 return self.chart
121
122 - def _add_signs(self, offset=0, prob_adder=None):
123 """ 124 Adds new signs to the chart from the supertagger, using the given 125 offset when requesting them from the tagger. 126 127 @rtype: list of tuples 128 @return: all the signs that were actually added. Each is represented 129 by a tuple (start_node, end_node, sign) 130 131 """ 132 signs = self.tagger.get_signs(offset) 133 words = self.tagger.get_string_input() 134 if signs is None or len(signs) == 0: 135 return [] 136 # Add each new sign to the chart 137 added = [] 138 for (start,end,signtup) in signs: 139 word_list = words[start:end] 140 word = " ".join(w for w in word_list) 141 # Add the probabilities as an attribute to the signs 142 cat,tag,prob = signtup 143 if prob_adder is not None: 144 prob_adder(start, end, signtup, word_list) 145 # Add the signs to the chart 146 newadd = self.chart.add_word_signs([signtup[0]], start, word, end_node=end) 147 # Keep a record of those that got added 148 if newadd: 149 added.append((start,end,signtup)) 150 return added
151
152 - def parse(self, derivations=False, summaries=False, inspect=False):
153 """ 154 Run the parser on the input, using the specified tagger. Runs 155 the CKY parsing algorithm to do chart parsing. For details of 156 chart parsing, see Chart class. 157 158 If the parser was given a maximum number of iterations, the 159 routine will return as usual after this number is completed, 160 even if no parses have been found. 161 162 @type derivations: bool 163 @param derivations: store derivation traces, which 164 can subsequently be used to trace all the derivations that 165 led to any given sign in the chart. Overridden by the module 166 option if it's given 167 @type summaries: int/bool 168 @param summaries: output chart summary information to stderr during 169 parsing to track progress. Set to 2 to output some info, 170 but not the full chart. 171 @type inspect: bool 172 @param inspect: launch a graphical chart inspector during the 173 parse to display interactive chart information. 174 175 @return: a list of signs that span the full input. 176 """ 177 if 'derivations' in self.options and self.options['derivations'] is not None: 178 derivations = self.options['derivations'] 179 180 # Time excecution if we're showing any summaries 181 time = bool(summaries) 182 # Find out from the tagger how long the input it read in was 183 input_length = self.tagger.input_length 184 # Create and initialise a chart for parsing 185 # Don't initialise the chart with signs - we'll add signs gradually instead 186 chart = self._create_chart( 187 [[]]*input_length, 188 derivations=derivations) 189 190 # Launch a chart inspector if requested 191 if self.options['inspect'] or inspect: 192 # Get a string form of the input to display 193 input_strs = self.tagger.get_string_input() 194 chart.launch_inspector(input=input_strs) 195 # Start dumping the chart if requested 196 if self.options['dump_chart']: 197 # Make the first dump of the empty chart 198 from .chart import dump_chart 199 dump_chart(chart, self.options['dump_chart']) 200 # Stop after a given number of iterations 201 if self.options['max_iter'] == 0: 202 max_iter = None 203 else: 204 max_iter = self.options['max_iter'] 205 206 if self.options['min_iter'] == -1: 207 # Special case: never stop until we've got all the categories 208 min_iter = None 209 else: 210 min_iter = self.options['min_iter'] 211 212 required_parses = self.options['parses'] 213 214 timeout = 60*self.options['timeout'] 215 check_timeout = timeout>0 216 # Make sure the timed out flag is unset to start with 217 self.timed_out = False 218 219 # This is where progress output will go 220 # Note that it's not the same as logger, which is the main system logger 221 prog_logger = self.logger 222 223 if check_timeout: 224 prog_logger.info("Due to timeout after %d mins" % self.options['timeout']) 225 226 ################################################## 227 ### Here is the parser itself. 228 # Keep track of how long since we started for timing out 229 timeout_timer = ExecutionTimer(clock=True) 230 231 signs_taken = [0]*input_length 232 233 offset = 0 234 last_lexicals = [0]*(input_length) 235 try: 236 # Keep adding signs until none left, or we get a full parse, 237 # or we complete the maximum iterations allowed 238 # Keep going if min_iter is None (special value meaning don't stop 239 # when we get a parse 240 while (min_iter is None or (offset < min_iter) \ 241 or len(chart.parses) < required_parses): 242 if max_iter is not None and offset >= max_iter: 243 # Exceded maximum number of iterations: give up 244 prog_logger.info("Reached maximum number of iterations: "\ 245 "continuing to backoff/fail") 246 break 247 prog_logger.info(">>> Parsing iteration: %d" % (offset+1)) 248 # Get new signs from the tagger 249 added = self._add_signs(offset=offset) 250 # Note whether we added anything new 251 if added: 252 # Apply unary rules to these new signs 253 added_spans = set([(start,end) for (start,end,sign) in added]) 254 for (start,end) in added_spans: 255 chart.apply_unary_rules(start,end) 256 else: 257 # No new signs added by the tagger: no point in continuing 258 prog_logger.info("No new signs added: ending parse") 259 break 260 261 ##### Main parser loop: produce all possible results 262 # Set end point to each node 263 for end in range(1,input_length+1): 264 if time: 265 # Start a timer 266 timer = ExecutionTimer() 267 chart.apply_unary_rules(end-1, end) 268 269 # Set start point to each node before the end, in reverse order 270 for start in range(end-2,-1,-1): 271 for middle in range(start+1,end): 272 chart.apply_binary_rules(start, middle, end) 273 274 # Check whether the timeout has expired and don't process 275 # any more if it has 276 if check_timeout: 277 # Check whether the timeout has passed 278 if int(timeout_timer.get_time()) > timeout: 279 # Move on to post-parse stuff 280 raise ParserTimeout 281 282 # Check for new unary rule applications 283 chart.apply_unary_rules(start, end) 284 285 if summaries: 286 prog_logger.info("Completed parsing up to node %d / %d (%.2f secs)" % (end,input_length, timer.get_time())) 287 if summaries != 2: 288 prog_logger.info(chart.summary) 289 if self.options['dump_chart']: 290 # Dump an update of the chart to the file 291 dump_chart(chart, self.options['dump_chart']) 292 293 if summaries: 294 prog_logger.info("Completed parsing to end of sequence") 295 if summaries != 2: 296 prog_logger.info(chart.summary) 297 298 offset += 1 299 except ParserTimeout: 300 # The given timeout elapsed: just continue with no parses 301 prog_logger.info("Parse timeout (%d mins) expired: continuing "\ 302 "to backoff/fail" % self.options['timeout']) 303 # Set the timed_out flag so we can check later whether we timed out 304 self.timed_out = True 305 except KeyboardInterrupt: 306 # We pass the interrupt on to a higher level, but first kill 307 # the inspector window, so it doesn't hang around and mess up 308 self.chart.kill_inspector() 309 raise 310 311 parses = chart.parses 312 if len(parses) == 0 and self.backoff is not None: 313 prog_logger.info("Using backoff model") 314 backoff_results = self.run_backoff() 315 if len(backoff_results) > 0: 316 for res in backoff_results: 317 # Put the semantics result into a sign, with a dummy 318 # syntactic category 319 sign = self.grammar.formalism.Syntax.Sign( 320 self.grammar.formalism.Syntax.DummyCategory(), 321 res) 322 # If the semantics has a probability, put this on the sign 323 if hasattr(res, "probability"): 324 sign.probability = res.probability 325 parses.append(sign) 326 elif len(parses): 327 prog_logger.info("Parse finished with %d results" % len(parses)) 328 else: 329 prog_logger.info("Parse finished with no results") 330 331 # Close the inspector window if one was opened 332 if not self.options['inspect_persist']: 333 self.chart.kill_inspector() 334 335 return parses
336
337 -class DirectedCkyParser(Parser):
338 """ 339 DirectedCkyParser is a special version of the CKY parser that tries 340 to produce a parse according to a pre-built derivation tree. 341 342 Why? 343 Canonical trees are stored implicitly in the Jazz corpus. We can 344 build the explicit structure of the trees, in accordance with the 345 implicit manual annotations, but this will not contain any signs 346 on internal nodes. The structure does not produce a parse in itself 347 or even verify that the sequence can be parsed with that structure. 348 349 The purpose of the DirectedCkyParser is to take a description of 350 this annotated structure and actually perform the parse, packing 351 the chart with only those signs that the derivation structure 352 produces. 353 354 The parser should be used with a tagger that assigns only those 355 signs that were annotated. Use the PretaggedTagger to do this. 356 357 """ 358 PARSER_OPTIONS = Parser.PARSER_OPTIONS + [ 359 ModuleOption('derivations', filter=bool, 360 help_text="Store derivation traces along with the results", 361 usage="derivations=X, where X is 'True' or 'False'.", 362 default=None, 363 ), 364 ] 365
366 - def __init__(self, grammar, tagger, derivation_tree=None, *args, **kwargs):
367 if derivation_tree is None: 368 raise ValueError, "DirectedCkyParser must be instantiated "\ 369 "with a derivation tree in kwarg 'derivation_tree'." 370 self.derivation_tree = derivation_tree 371 super(DirectedCkyParser, self).__init__(grammar,tagger,*args,**kwargs)
372
373 - def _create_chart(self, *args, **kwargs):
374 self.chart = Chart(self.grammar, *args, **kwargs) 375 return self.chart
376
377 - def parse(self, derivations=False, summaries=False):
378 """ 379 Run the parser on the input, using the specified tagger. Runs 380 the CKY parsing algorithm to do chart parsing. For details of 381 chart parsing, see Chart class. 382 """ 383 if 'derivations' in self.options and self.options['derivations'] is not None: 384 derivations = self.options['derivations'] 385 386 # Find out from the tagger how long the input it read in was 387 input_length = self.tagger.input_length 388 # Create and initialise a chart for parsing 389 # Don't initialise the chart with signs - we'll add signs gradually instead 390 chart = self._create_chart( 391 signs=[[]]*input_length, 392 derivations=derivations) 393 394 ################################################## 395 ### Here is the parser itself 396 397 # Only get signs from the tagger once: we expect to get them all first time 398 # Add all the lexical signs to the chart 399 for word in range(input_length): 400 new_cat_pairs = self.tagger.get_signs_for_word(word) 401 new_cats = [cat for (cat, tag, prob) in new_cat_pairs] 402 chart.add_word_signs(new_cats, word, self.tagger.get_word(word)) 403 404 ##### Main parser loop: produce only the signs that we're directed to produce 405 # Get a mapping from the tree's short rule names to the rule instances 406 rule_mapping = self.grammar.formalism.PcfgParser.rule_short_names 407 # Perform the parse bottom up by a depth-first left-to-right 408 # recursion on the derivation tree. Recursively parse children 409 # of each node, before applying rules for the node itself. 410 def _fill_chart(start, tree_node): 411 """ 412 Recursively fills the chart using the subtree rooted by 413 tree_node, using start as the leftmost node of the chart. 414 Returns the resulting rightmost node covered by this 415 span. 416 """ 417 if hasattr(tree_node, 'children') and len(tree_node.children) > 0: 418 if len(tree_node.children) > 2: 419 raise DirectedParseError, "invalid derivation tree. "\ 420 "Nodes may have up to 2 children. This node has "\ 421 "%d: %s" % (len(tree_node.children), tree_node) 422 ### An internal node 423 # First recurse to the sub-parses 424 sub_end = start 425 middle = None 426 for child in tree_node.children: 427 sub_end = _fill_chart(sub_end, child) 428 if middle is None: 429 # Store the first node after the start as the middle node 430 middle = sub_end 431 # We now know where this span ends. 432 end = sub_end 433 # Apply the rule associated with the node 434 try: 435 rule_details = rule_mapping[tree_node.rule] 436 except KeyError: 437 raise DirectedParseError, "tree node %s specifies a "\ 438 "rule '%s' which is not defined for this "\ 439 "formalism. Are you using the right formalism "\ 440 "for your data?" % (tree_node, tree_node.rule) 441 rule_cls = self.grammar.formalism.rules[rule_details[0]] 442 # Instantiate the rule 443 rule_kwargs = { 444 'grammar' : self.grammar, 445 'modalities' : self.grammar.modality_tree, 446 } 447 rule_kwargs.update(rule_details[1]) 448 rule = rule_cls(**rule_kwargs) 449 # Try applying the rule to the arguments we've generated 450 # Check we have the right number of children 451 if len(tree_node.children) != rule.arity: 452 raise DirectedParseError, "a node was encountered "\ 453 "that does not have the right number of children "\ 454 "for its rule. %s must have %d children." % \ 455 (tree_node.rule, rule.arity) 456 # Apply the rule to its one or two arguments 457 if rule.arity == 1: 458 added = chart.apply_unary_rule(rule, start, end) 459 debug_inputs = "%s, [%s]" % (rule, 460 ", ".join(["%s" % s for s in chart.get_signs(start, end)]) 461 ) 462 elif rule.arity == 2: 463 added = chart.apply_binary_rule(rule, start, middle, end) 464 debug_inputs = "%s, [%s] and [%s]" % (rule, 465 ", ".join(["%s" % s for s in chart.get_signs(start, middle)]), 466 ", ".join(["%s" % s for s in chart.get_signs(middle, end)]) 467 ) 468 # If nothing was added to the chart, the rule must have failed 469 if not added: 470 # No point in continuing, since stuff further up the 471 # tree will inevitably fail 472 raise DirectedParseError, "failed to apply rule %s. "\ 473 "Giving up on parse. "\ 474 "Tree: %s. Inputs: %s." % \ 475 (tree_node.rule, tree_node, debug_inputs) 476 elif hasattr(tree_node, 'chord'): 477 ### Leaf node 478 # We assume this lines up with the correct position in 479 # the tags that the tagger has given us. 480 # This arc is a leaf, so only has a span of 1. 481 end = start + 1 482 else: 483 # Tree does not conform to correct interface 484 raise DirectedParseError, "derivation tree for directed "\ 485 "parse should be made up of internal trees with "\ 486 "children and leaves with a chord attribute. This "\ 487 "node is neither: %s" % tree_node 488 return end
489 rightmost = _fill_chart(0, self.derivation_tree) 490 491 return chart.parses
492
493 -class DirectedParseError(Exception):
494 pass
495
496 -class ParserTimeout(Exception):
497 pass
498