1 """CKY parser implementation.
2
3 This is the basic parser component of the jazz chord interpretation system.
4 This does all of the basic CKY parsing routine, but is unaware of the
5 formalism, rule set, etc.
6
7 """
8 """
9 ============================== License ========================================
10 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
11
12 This file is part of The Jazz Parser.
13
14 The Jazz Parser is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 The Jazz Parser is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
26
27 ============================ End license ======================================
28
29 """
30 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
31
32 from jazzparser.grammar import Grammar
33 from chart import Chart
34 from jazzparser.utils.base import filter_latex, ExecutionTimer
35 from jazzparser.utils.options import ModuleOption, new_file_option
36 from jazzparser.utils.strings import str_to_bool
37 from jazzparser.parsers.base.parser import Parser
38 from .tools import ChartTool, InteractiveChartTool
39
40 import sys, re
41
42 from jazzparser import settings
43
45 """
46 CkyParser is the central class for the jazz chord sequence
47 recogniser parsing mechanism.
48 It constitutes the "algorithm" module of the system.
49 It begins with a set of signs assigned to the input by the
50 tagger and parses to produce a chart, from which the resultant
51 signs can be extracted.
52
53 """
54 shell_tools = [
55 ChartTool(),
56 InteractiveChartTool(),
57 ]
58 PARSER_OPTIONS = Parser.PARSER_OPTIONS + [
59 ModuleOption('max_iter', filter=int,
60 help_text="Maximum number of parser iterations to perform "\
61 "before giving up. If 0 or unspecified, continues "\
62 "until parse is complete.",
63 usage="max_iter=X, where X is an integer.",
64 default=0,
65 ),
66 ModuleOption('min_iter', filter=int,
67 help_text="Usually, the parser will stop as soon as it finds a "\
68 "full parse. Use min_iter to make it continue parsing until "\
69 "it has done min_iter iterations or the tagger has ceased to "\
70 "return any categories. Use -1 to keep going until the tagger "\
71 "gives no more categories.",
72 usage="min_iter=X, where X is an integer.",
73 default=0,
74 ),
75 ModuleOption('parses', filter=int,
76 help_text="Number of parses to require before we terminate. "\
77 "Default is 1: the parser will terminate as soon as it finds "\
78 "at least one full parse (unless another option, like "\
79 "min_iter, prevents it",
80 usage="parses=X, where X is an integer",
81 default=1,
82 ),
83 ModuleOption('timeout', filter=int,
84 help_text="Maximum time allowed for the main parse loop, in "\
85 "minutes. If this is exceded, the backoff will kick "\
86 "in, if one is specified. Otherwise, no results will be "\
87 "returned. The parser will not stop as soon as the timeout "\
88 "expires, but after finishing processing the current input "\
89 "word. 0 (default) imposes no timeout.",
90 usage="timeout=X, where X is an integer number of seconds.",
91 default=0,
92 ),
93 ModuleOption('inspect', filter=str_to_bool,
94 help_text="If true, the graphical chart inspector will be "\
95 "displayed during parsing.",
96 usage="inspect=X, where X is a boolean value.",
97 default=False
98 ),
99 ModuleOption('inspect_persist', filter=str_to_bool,
100 help_text="Makes the chart inspector window persist after parsing "\
101 "is completed. By default, it will be killed",
102 usage="inspect_persist=X, where X is a boolean value.",
103 default=False
104 ),
105 ModuleOption('dump_chart', filter=new_file_option,
106 help_text="A file to dump the chart state to during parsing. "\
107 "The first dump will be when the chart is created and "\
108 "new dumps will be made throughout the parse.",
109 usage="dump_chart=X, where X is a filename."
110 ),
111 ModuleOption('derivations', filter=str_to_bool,
112 help_text="Store derivation traces along with the results",
113 usage="derivations=X, where X is a boolean value",
114 default=None,
115 ),
116 ]
117
121
123 """
124 Adds new signs to the chart from the supertagger, using the given
125 offset when requesting them from the tagger.
126
127 @rtype: list of tuples
128 @return: all the signs that were actually added. Each is represented
129 by a tuple (start_node, end_node, sign)
130
131 """
132 signs = self.tagger.get_signs(offset)
133 words = self.tagger.get_string_input()
134 if signs is None or len(signs) == 0:
135 return []
136
137 added = []
138 for (start,end,signtup) in signs:
139 word_list = words[start:end]
140 word = " ".join(w for w in word_list)
141
142 cat,tag,prob = signtup
143 if prob_adder is not None:
144 prob_adder(start, end, signtup, word_list)
145
146 newadd = self.chart.add_word_signs([signtup[0]], start, word, end_node=end)
147
148 if newadd:
149 added.append((start,end,signtup))
150 return added
151
152 - def parse(self, derivations=False, summaries=False, inspect=False):
153 """
154 Run the parser on the input, using the specified tagger. Runs
155 the CKY parsing algorithm to do chart parsing. For details of
156 chart parsing, see Chart class.
157
158 If the parser was given a maximum number of iterations, the
159 routine will return as usual after this number is completed,
160 even if no parses have been found.
161
162 @type derivations: bool
163 @param derivations: store derivation traces, which
164 can subsequently be used to trace all the derivations that
165 led to any given sign in the chart. Overridden by the module
166 option if it's given
167 @type summaries: int/bool
168 @param summaries: output chart summary information to stderr during
169 parsing to track progress. Set to 2 to output some info,
170 but not the full chart.
171 @type inspect: bool
172 @param inspect: launch a graphical chart inspector during the
173 parse to display interactive chart information.
174
175 @return: a list of signs that span the full input.
176 """
177 if 'derivations' in self.options and self.options['derivations'] is not None:
178 derivations = self.options['derivations']
179
180
181 time = bool(summaries)
182
183 input_length = self.tagger.input_length
184
185
186 chart = self._create_chart(
187 [[]]*input_length,
188 derivations=derivations)
189
190
191 if self.options['inspect'] or inspect:
192
193 input_strs = self.tagger.get_string_input()
194 chart.launch_inspector(input=input_strs)
195
196 if self.options['dump_chart']:
197
198 from .chart import dump_chart
199 dump_chart(chart, self.options['dump_chart'])
200
201 if self.options['max_iter'] == 0:
202 max_iter = None
203 else:
204 max_iter = self.options['max_iter']
205
206 if self.options['min_iter'] == -1:
207
208 min_iter = None
209 else:
210 min_iter = self.options['min_iter']
211
212 required_parses = self.options['parses']
213
214 timeout = 60*self.options['timeout']
215 check_timeout = timeout>0
216
217 self.timed_out = False
218
219
220
221 prog_logger = self.logger
222
223 if check_timeout:
224 prog_logger.info("Due to timeout after %d mins" % self.options['timeout'])
225
226
227
228
229 timeout_timer = ExecutionTimer(clock=True)
230
231 signs_taken = [0]*input_length
232
233 offset = 0
234 last_lexicals = [0]*(input_length)
235 try:
236
237
238
239
240 while (min_iter is None or (offset < min_iter) \
241 or len(chart.parses) < required_parses):
242 if max_iter is not None and offset >= max_iter:
243
244 prog_logger.info("Reached maximum number of iterations: "\
245 "continuing to backoff/fail")
246 break
247 prog_logger.info(">>> Parsing iteration: %d" % (offset+1))
248
249 added = self._add_signs(offset=offset)
250
251 if added:
252
253 added_spans = set([(start,end) for (start,end,sign) in added])
254 for (start,end) in added_spans:
255 chart.apply_unary_rules(start,end)
256 else:
257
258 prog_logger.info("No new signs added: ending parse")
259 break
260
261
262
263 for end in range(1,input_length+1):
264 if time:
265
266 timer = ExecutionTimer()
267 chart.apply_unary_rules(end-1, end)
268
269
270 for start in range(end-2,-1,-1):
271 for middle in range(start+1,end):
272 chart.apply_binary_rules(start, middle, end)
273
274
275
276 if check_timeout:
277
278 if int(timeout_timer.get_time()) > timeout:
279
280 raise ParserTimeout
281
282
283 chart.apply_unary_rules(start, end)
284
285 if summaries:
286 prog_logger.info("Completed parsing up to node %d / %d (%.2f secs)" % (end,input_length, timer.get_time()))
287 if summaries != 2:
288 prog_logger.info(chart.summary)
289 if self.options['dump_chart']:
290
291 dump_chart(chart, self.options['dump_chart'])
292
293 if summaries:
294 prog_logger.info("Completed parsing to end of sequence")
295 if summaries != 2:
296 prog_logger.info(chart.summary)
297
298 offset += 1
299 except ParserTimeout:
300
301 prog_logger.info("Parse timeout (%d mins) expired: continuing "\
302 "to backoff/fail" % self.options['timeout'])
303
304 self.timed_out = True
305 except KeyboardInterrupt:
306
307
308 self.chart.kill_inspector()
309 raise
310
311 parses = chart.parses
312 if len(parses) == 0 and self.backoff is not None:
313 prog_logger.info("Using backoff model")
314 backoff_results = self.run_backoff()
315 if len(backoff_results) > 0:
316 for res in backoff_results:
317
318
319 sign = self.grammar.formalism.Syntax.Sign(
320 self.grammar.formalism.Syntax.DummyCategory(),
321 res)
322
323 if hasattr(res, "probability"):
324 sign.probability = res.probability
325 parses.append(sign)
326 elif len(parses):
327 prog_logger.info("Parse finished with %d results" % len(parses))
328 else:
329 prog_logger.info("Parse finished with no results")
330
331
332 if not self.options['inspect_persist']:
333 self.chart.kill_inspector()
334
335 return parses
336
338 """
339 DirectedCkyParser is a special version of the CKY parser that tries
340 to produce a parse according to a pre-built derivation tree.
341
342 Why?
343 Canonical trees are stored implicitly in the Jazz corpus. We can
344 build the explicit structure of the trees, in accordance with the
345 implicit manual annotations, but this will not contain any signs
346 on internal nodes. The structure does not produce a parse in itself
347 or even verify that the sequence can be parsed with that structure.
348
349 The purpose of the DirectedCkyParser is to take a description of
350 this annotated structure and actually perform the parse, packing
351 the chart with only those signs that the derivation structure
352 produces.
353
354 The parser should be used with a tagger that assigns only those
355 signs that were annotated. Use the PretaggedTagger to do this.
356
357 """
358 PARSER_OPTIONS = Parser.PARSER_OPTIONS + [
359 ModuleOption('derivations', filter=bool,
360 help_text="Store derivation traces along with the results",
361 usage="derivations=X, where X is 'True' or 'False'.",
362 default=None,
363 ),
364 ]
365
366 - def __init__(self, grammar, tagger, derivation_tree=None, *args, **kwargs):
367 if derivation_tree is None:
368 raise ValueError, "DirectedCkyParser must be instantiated "\
369 "with a derivation tree in kwarg 'derivation_tree'."
370 self.derivation_tree = derivation_tree
371 super(DirectedCkyParser, self).__init__(grammar,tagger,*args,**kwargs)
372
376
377 - def parse(self, derivations=False, summaries=False):
378 """
379 Run the parser on the input, using the specified tagger. Runs
380 the CKY parsing algorithm to do chart parsing. For details of
381 chart parsing, see Chart class.
382 """
383 if 'derivations' in self.options and self.options['derivations'] is not None:
384 derivations = self.options['derivations']
385
386
387 input_length = self.tagger.input_length
388
389
390 chart = self._create_chart(
391 signs=[[]]*input_length,
392 derivations=derivations)
393
394
395
396
397
398
399 for word in range(input_length):
400 new_cat_pairs = self.tagger.get_signs_for_word(word)
401 new_cats = [cat for (cat, tag, prob) in new_cat_pairs]
402 chart.add_word_signs(new_cats, word, self.tagger.get_word(word))
403
404
405
406 rule_mapping = self.grammar.formalism.PcfgParser.rule_short_names
407
408
409
410 def _fill_chart(start, tree_node):
411 """
412 Recursively fills the chart using the subtree rooted by
413 tree_node, using start as the leftmost node of the chart.
414 Returns the resulting rightmost node covered by this
415 span.
416 """
417 if hasattr(tree_node, 'children') and len(tree_node.children) > 0:
418 if len(tree_node.children) > 2:
419 raise DirectedParseError, "invalid derivation tree. "\
420 "Nodes may have up to 2 children. This node has "\
421 "%d: %s" % (len(tree_node.children), tree_node)
422
423
424 sub_end = start
425 middle = None
426 for child in tree_node.children:
427 sub_end = _fill_chart(sub_end, child)
428 if middle is None:
429
430 middle = sub_end
431
432 end = sub_end
433
434 try:
435 rule_details = rule_mapping[tree_node.rule]
436 except KeyError:
437 raise DirectedParseError, "tree node %s specifies a "\
438 "rule '%s' which is not defined for this "\
439 "formalism. Are you using the right formalism "\
440 "for your data?" % (tree_node, tree_node.rule)
441 rule_cls = self.grammar.formalism.rules[rule_details[0]]
442
443 rule_kwargs = {
444 'grammar' : self.grammar,
445 'modalities' : self.grammar.modality_tree,
446 }
447 rule_kwargs.update(rule_details[1])
448 rule = rule_cls(**rule_kwargs)
449
450
451 if len(tree_node.children) != rule.arity:
452 raise DirectedParseError, "a node was encountered "\
453 "that does not have the right number of children "\
454 "for its rule. %s must have %d children." % \
455 (tree_node.rule, rule.arity)
456
457 if rule.arity == 1:
458 added = chart.apply_unary_rule(rule, start, end)
459 debug_inputs = "%s, [%s]" % (rule,
460 ", ".join(["%s" % s for s in chart.get_signs(start, end)])
461 )
462 elif rule.arity == 2:
463 added = chart.apply_binary_rule(rule, start, middle, end)
464 debug_inputs = "%s, [%s] and [%s]" % (rule,
465 ", ".join(["%s" % s for s in chart.get_signs(start, middle)]),
466 ", ".join(["%s" % s for s in chart.get_signs(middle, end)])
467 )
468
469 if not added:
470
471
472 raise DirectedParseError, "failed to apply rule %s. "\
473 "Giving up on parse. "\
474 "Tree: %s. Inputs: %s." % \
475 (tree_node.rule, tree_node, debug_inputs)
476 elif hasattr(tree_node, 'chord'):
477
478
479
480
481 end = start + 1
482 else:
483
484 raise DirectedParseError, "derivation tree for directed "\
485 "parse should be made up of internal trees with "\
486 "children and leaves with a chord attribute. This "\
487 "node is neither: %s" % tree_node
488 return end
489 rightmost = _fill_chart(0, self.derivation_tree)
490
491 return chart.parses
492
495
498