1 """Chart representation for CKY chart parsing.
2
3 Classes and utility methods for CKY chart parsing for the Jazz Parser.
4 This provides most of the main functionality of the CKY parser,
5 apart from the main parse loop and the tagger interface.
6
7 """
8 """
9 ============================== License ========================================
10 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
11
12 This file is part of The Jazz Parser.
13
14 The Jazz Parser is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 The Jazz Parser is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
26
27 ============================ End license ======================================
28
29 """
30 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
31
32
33 from jazzparser.data import DerivationTrace, Fraction, HashSet
34 import logging
35
36
37 logger = logging.getLogger("main_logger")
38
39
41 """
42 Based on the basic hash set implementation. Provides special
43 behaviour for adding already existing signs, so that, e.g., the
44 derivation traces get added when a new sign is an alternative
45 derivation of an already existing one.
46
47 SignHashSet additionally stores and maintains an index of the signs
48 by category. It is useful during parsing to get just the distinct
49 categories, since all the signs with the same category will be
50 subject to the same rule applications.
51
52 """
53 - def __init__(self, formalism, derivation_traces=False):
58
60 """
61 Overrides append() to maintain the index of signs grouped
62 by category.
63
64 See L{jazzparser.data.HashSet} for main doc.
65
66 """
67 added = super(SignHashSet, self).append(new_entry)
68 if added:
69
70 if new_entry.category in self._signs_by_category:
71 self._signs_by_category[new_entry.category].append(new_entry)
72 else:
73 self._signs_by_category[new_entry.category] = [new_entry]
74 return added
75
77 """
78 Overrides remove() to maintain the index of signs grouped
79 by category.
80
81 See L{jazzparser.data.HashSet} for main doc.
82
83 """
84 super(SignHashSet, self).remove(entry)
85
86 if entry.category in self._signs_by_category and \
87 entry in self._signs_by_category[entry.category]:
88 self._signs_by_category[entry.category].remove(entry)
89
90 if len(self._signs_by_category[entry.category]) == 0:
91 del self._signs_by_category[entry.category]
92
101
103 """
104 @return: a list containing one of each of the distinct syntactic
105 categories found among the signs in this set.
106
107 """
108 return self._signs_by_category.keys()
109
111 """
112 @rtype: list of lists of L{Sign<jazzparser.formalisms.base.SignBase>}s
113 @return: all the signs in the set, grouped into lists of
114 signs sharing the same syntactic category.
115
116 """
117 return self._signs_by_category.values()
118
120 """
121 @rtype: list of L{Sign<jazzparser.formalisms.base.SignBase>}s
122 @return: all the signs in the set that have a category equal
123 to that given.
124
125 """
126 if category in self._signs_by_category:
127 return self._signs_by_category[category]
128 else:
129 return []
130
132 """
133 Represents a chart for use in CKY chart parsing.
134 A Chart stores a table of signs between node pairs.
135 It keeps a list of all the rules that can be applied.
136
137 There are no edges (i,i) and all edges are directed forward
138 (i.e. (i,j) where j>i). Internally, the table only
139 stores edges from each i to each j>i.
140
141 In the interface, however, edges are always referred to
142 by the nodes they go from and to.
143
144 Functions are contained in Chart for applying unary and binary rules.
145
146 You may instantiate a chart with no signs. You must still provide a
147 signs list, which will define the size of the chart, so you should
148 fill it with empty lists.
149
150 By default, chart.parses will only return atomic results, since
151 only these represent full parses. If you want all signs that span
152 the whole input, use chart.get_signs(0, end). If you decide that
153 complex results represent real parses, instantiate the chart with
154 allow_complex=True.
155
156 """
157 HASH_SET_IMPL = SignHashSet
158
159 - def __init__(self, grammar, signs, derivations=False, hash_set_kwargs={}, allow_complex=False):
160 self.derivations = derivations
161 self.grammar = grammar
162 self.allow_complex = allow_complex
163
164 self._all_brules_applied = {}
165
166 self.inspector = None
167
168
169 self._table = []
170 for x in range(len(signs)):
171
172 self._table.append([])
173 for y in range(x,len(signs)):
174
175
176 self._table[x].append(
177 self.HASH_SET_IMPL(grammar.formalism,
178 derivation_traces=derivations,
179 **hash_set_kwargs))
180 for i,sign_list in enumerate(signs):
181 if len(sign_list):
182 self.add_word_signs(sign_list,i)
183
185 return len(self._table)
186
187 size = property(__len__)
188
196 results = filter(_is_atomic, results)
197 return results
198 parses = property(_get_parses)
199
201 """
202 Gets a list of the signs in the chart between nodes start
203 and end.
204 """
205 if end <= start:
206 logger.warning("Tried to get signs from %d to %d" % (start,end))
207 return None
208 return self._table[start][end-start-1].values()
209
211 """
212 Like L{get_signs}, but return a list of lists of signs, such
213 that every sign in a sublist has the same syntactic category.
214
215 """
216 if end <= start:
217 logger.warning("Tried to get signs from %d to %d" % (start,end))
218 return None
219 return self._table[start][end-start-1].get_signs_grouped_by_category()
220
222 """
223 Returns the sign between start and end with the given index.
224 """
225 signs = self.get_signs(start, end)
226 if signs is None or index >= len(signs):
227 return None
228 else:
229 return signs[index]
230
232 """
233 Gets a list of pairs (first,second) such that
234 first starts at start and ends at middle and second
235 starts at middle and ends at end.
236 """
237 pairs = []
238 firsts = self.get_signs(start, middle)
239 seconds = self.get_signs(middle, end)
240 for first in firsts:
241 for second in seconds:
242 pairs.append((first,second))
243 return pairs
244
246 """
247 Like L{get_sign_pairs}, but instead of returning pairs of signs,
248 returns pairs of sign groups, where all the signs in the group
249 have the same syntactic category.
250
251 """
252 firsts = self.get_grouped_signs(start, middle)
253 seconds = self.get_grouped_signs(middle, end)
254 return [(first,second) for first in firsts for second in seconds]
255
257 """
258 Adds a single-word categories in list "signs" to the chart for the word
259 starting at node C{start_node}. This may span more than one node, in
260 which case C{end_node} should be given as well. By default, C{end_node}
261 will be assumed to be C{start_node}+1.
262
263 """
264
265
266 if end_node is None:
267 span_end = 0
268 else:
269 span_end = end_node-start_node-1
270 assert span_end >= 0
271
272 if self.derivations:
273 for sign in signs:
274 sign.derivation_trace = DerivationTrace(sign, word=word)
275 return self._table[start_node][span_end].extend(signs)
276
278 """
279 Adds to the chart all signs resulting from possible
280 applications of unary rules to existing signs between
281 nodes start and end.
282
283 Additional args/kwargs get passed on to L{apply_unary_rule}.
284
285 @return: True if signs were added as a result of rule application,
286 False otherwise
287 """
288 signs_added = False
289
290 unary_rules = self.grammar.unary_rules
291
292 for rule in unary_rules:
293 added = self.apply_unary_rule(rule, start, end, *args, **kwargs)
294 if added:
295 signs_added = True
296 return signs_added
297
299 """
300 Applies a given unary rule to particular arcs and adds the
301 results to the chart.
302
303 @type result_modifier: 2-arg function
304 @param result_modifier: function to be applied to each result,
305 taking the result sign as the first argument and the input
306 sign as the second.
307
308 """
309 signs_added = False
310 input_signs = self.get_signs(start, end)
311
312 for sign in input_signs:
313
314 if not sign.check_rule_applied(rule):
315
316 results = rule.apply_rule([sign])
317
318 if results is not None:
319
320 if self.derivations:
321 for result in results:
322 result.derivation_trace = DerivationTrace(result, rule, [sign.derivation_trace])
323
324 if result_modifier is not None:
325 result_modifier(result, sign)
326
327 added = self._table[start][end-start-1].extend(results)
328
329 if added:
330 signs_added = True
331
332 sign.note_rule_applied(rule)
333 return signs_added
334
336 """
337 Internal method to apply a given binary rule to a given pair
338 of signs. Note that the supported interface method is
339 apply_binary_rule(), which applies a single rule to all sign
340 pairs between given nodes.
341
342 This is used internally by L{apply_binary_rule} and
343 L{apply_binary_rules}.
344
345 """
346 if sign_pair[0].check_rule_applied(rule, sign_pair[1]):
347
348
349
350 return []
351
352 results = rule.apply_rule(sign_pair)
353
354 sign_pair[0].note_rule_applied(rule, sign_pair[1])
355 if results is not None:
356
357 if self.derivations:
358 for result in results:
359 result.derivation_trace = DerivationTrace(result, rule, [sign.derivation_trace for sign in sign_pair])
360 return results
361 else:
362 return []
363
365 """
366 Like _apply_binary_rule, but uses the C{apply_rule_semantics()}
367 of the rule instead of C{apply_rule()} and returns a list of signs
368 built by copying the category and combining it in a sign with the
369 semantics of the result.
370
371 """
372
373 results = rule.apply_rule_semantics(sign_pair)
374 if results is not None:
375
376 signs = [self.grammar.formalism.Syntax.Sign(
377 category.copy(), result)
378 for result in results]
379
380 if self.derivations:
381 for sign in signs:
382 sign.derivation_trace = DerivationTrace(sign, rule, [s.derivation_trace for s in sign_pair])
383 return signs
384 else:
385 return []
386
388 """
389 Add to the chart all signs resulting from possible
390 applications of binary rules to pairs of signs between
391 node pairs (start,middle) and (middle,end),
392 producing entries in (start,end).
393
394 @return: True if signs were added as a result of rule application,
395 False otherwise
396 """
397 signs_added = False
398
399 all_pair_results = []
400 binary_rules = self.grammar.binary_rules
401 input_pairs = self.get_grouped_sign_pairs(start, middle, end)
402
403 for first_set,second_set in input_pairs:
404
405 for rule in binary_rules:
406
407
408
409
410 results = rule.apply_rule((first_set[0], second_set[0]))
411 if results is not None:
412 if len(results) == 1:
413
414
415
416
417
418 result_cat = results[0].category
419 for first_sign in first_set:
420 for second_sign in second_set:
421
422 pair_results = self._apply_binary_rule_semantics(
423 rule,
424 (first_sign,second_sign),
425 result_cat)
426 all_pair_results.extend(pair_results)
427 else:
428
429
430
431 for first_sign in first_set:
432 for second_sign in second_set:
433
434 pair_results = self._apply_binary_rule(rule, (first_sign,second_sign))
435 all_pair_results.extend(pair_results)
436 if len(all_pair_results) > 0:
437
438 added = self._table[start][end-start-1].extend(all_pair_results)
439 if added:
440 signs_added = True
441 return signs_added
442
444 """
445 Apply a given binary rule to particular arcs in the chart.
446
447 Note that this method is not used by apply_binary_rules for
448 efficiency reasons, but apply_binary_rules simply does the same
449 thing for all possible binary rules.
450
451 """
452 all_pair_results = []
453 input_pairs = self.get_grouped_sign_pairs(start, middle, end)
454 signs_added = False
455 for first_set,second_set in input_pairs:
456
457
458 results = rule.apply_rule((first_set[0], second_set[0]))
459 if results is not None:
460
461 for first_sign in first_set:
462 for second_sign in second_set:
463 pair_results = self._apply_binary_rule(rule, (first_sign,second_sign))
464 all_pair_results.extend(pair_results)
465 if len(all_pair_results) > 0:
466
467 added = self._table[start][end-start-1].extend(all_pair_results)
468 if added:
469 signs_added = True
470 return signs_added
471
474
476 output = ""
477 table_size = len(self)
478
479 if rows is None:
480 rows = range(table_size)
481
482 if cols is None:
483 cols = range(1,table_size+1)
484
485 for x in rows:
486
487 output += "\nEdges starting at %d\n" % x
488
489 for y in cols:
490
491 if y > x and y <= table_size:
492 col = y - x - 1
493 output += " (%d,%d): %s\n" % \
494 (x,y,", ".join(["<%d> %s" % (i,self._sign_string(sign)) for i,sign in enumerate(self._table[x][col].values())]))
495 return output
496
499
501 """
502 Returns a multi-line string that presents a brief summary of the chart
503 as it currently stands. This does not show all of the signs in the
504 chart, but just a summary of how many signs there are in each cell.
505 """
506 summary = "\nF\\T"
507 for col in range(1,len(self)+1):
508 summary += "\t%d" % col
509 summary += "\n\n"
510 for row in range(len(self)):
511 summary += "%d\t" % row + \
512 "-\t"*row + \
513 "\t".join(["%s" % len(cell) for cell in self._table[row]]) + \
514 "\n"
515 return summary
516 summary = property(_get_summary)
517
519 """
520 Starts up a graphical chart inspector to inspect this chart.
521 The inspector will run in a separate thread.
522 Subclasses of Chart should override this if they have their own
523 version of the chart inspector and instantiate that instead.
524
525 @type input: list of strings
526 @param input: a string representation of the input to pass to
527 the inspector so it can display it.
528
529 """
530
531 self.kill_inspector()
532 from .inspector import ChartInspectorThread
533 inspector = ChartInspectorThread(self, input_strs=input)
534 self.inspector = inspector
535 if block:
536 inspector.run()
537 else:
538 inspector.start()
539
541 """
542 Kills a currently-running inspector for this chart if one exists.
543
544 """
545 if self.inspector is not None:
546 self.inspector.window.hide()
547 self.inspector.window.destroy()
548 self.inspector = None
549
550
551 -def list_union(main_list, new_entries, derivations=False):
552 """
553 Adds all the entries in the list new_entries to the list
554 main_list, checking first that each entry does not already
555 exist in main_list and only adding it if it doesn't.
556
557 @return: True if members were added to the main list, False if main_list is
558 unchanged.
559 """
560 added = False
561 for new_entry in new_entries:
562 if not new_entry in main_list:
563 main_list.append(new_entry)
564 added = True
565 elif derivations:
566 main_list[main_list.index(new_entry)].\
567 derivation_trace.add_rules_from_trace(new_entry.derivation_trace)
568
569 return added
570
572 """
573 Raised if something goes wrong while building or processing a chart.
574 """
575 pass
576
578 """
579 Dump a pickled representation of the chart to a file.
580
581 """
582 import cPickle as pickle
583 data = pickle.dumps(chart)
584 file = open(filename, 'w')
585 file.write(data)
586 file.close()
587
589 """
590 Read in a chart file that was dumped using L{dump_chart}.
591
592 @rtype: L{Chart}
593 @return: the chart instance
594
595 """
596 import cPickle as pickle
597 cfile = open(filename, 'r')
598 data = cfile.read()
599 cfile.close()
600 return pickle.loads(data)
601