jazzparser.parsers.cky.chart

1 """Chart representation for CKY chart parsing. 2 3 Classes and utility methods for CKY chart parsing for the Jazz Parser. 4 This provides most of the main functionality of the CKY parser, 5 apart from the main parse loop and the tagger interface. 6 7 """ 8 """ 9 ============================== License ======================================== 10 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 11 12 This file is part of The Jazz Parser. 13 14 The Jazz Parser is free software: you can redistribute it and/or modify 15 it under the terms of the GNU General Public License as published by 16 the Free Software Foundation, either version 3 of the License, or 17 (at your option) any later version. 18 19 The Jazz Parser is distributed in the hope that it will be useful, 20 but WITHOUT ANY WARRANTY; without even the implied warranty of 21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 GNU General Public License for more details. 23 24 You should have received a copy of the GNU General Public License 25 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>. 26 27 ============================ End license ====================================== 28 29 """ 30 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 31 32 33 from jazzparser.data import DerivationTrace, Fraction, HashSet 34 import logging 35 36 # Get the logger from the logging system 37 logger = logging.getLogger("main_logger") 38 39

40 -class SignHashSet(HashSet):

41 """ 42 Based on the basic hash set implementation. Provides special 43 behaviour for adding already existing signs, so that, e.g., the 44 derivation traces get added when a new sign is an alternative 45 derivation of an already existing one. 46 47 SignHashSet additionally stores and maintains an index of the signs 48 by category. It is useful during parsing to get just the distinct 49 categories, since all the signs with the same category will be 50 subject to the same rule applications. 51 52 """

53 - def __init__(self, formalism, derivation_traces=False):

54 super(SignHashSet, self).__init__() 55 self.formalism = formalism 56 self._signs_by_category = {} 57 self.derivation_traces = derivation_traces

58

59 - def append(self, new_entry):

60 """ 61 Overrides append() to maintain the index of signs grouped 62 by category. 63 64 See L{jazzparser.data.HashSet} for main doc. 65 66 """ 67 added = super(SignHashSet, self).append(new_entry) 68 if added: 69 # The new entry was added to the set: index it by category 70 if new_entry.category in self._signs_by_category: 71 self._signs_by_category[new_entry.category].append(new_entry) 72 else: 73 self._signs_by_category[new_entry.category] = [new_entry] 74 return added

75

76 - def remove(self, entry):

77 """ 78 Overrides remove() to maintain the index of signs grouped 79 by category. 80 81 See L{jazzparser.data.HashSet} for main doc. 82 83 """ 84 super(SignHashSet, self).remove(entry) 85 # Also remove it from the category index if it's there 86 if entry.category in self._signs_by_category and \ 87 entry in self._signs_by_category[entry.category]: 88 self._signs_by_category[entry.category].remove(entry) 89 # Remove the key as well if the list is now empty 90 if len(self._signs_by_category[entry.category]) == 0: 91 del self._signs_by_category[entry.category]

92

93 - def _add_existing_value(self, existing_value, new_value):

94 # Add the new derivation trace if necessary 95 if self.derivation_traces: 96 existing_value.\ 97 derivation_trace.add_rules_from_trace(new_value.derivation_trace) 98 # Hand this over to the formalism-specific routine for merging 99 # equal signs 100 self.formalism.Syntax.merge_equal_signs(existing_value, new_value)

101

102 - def get_distinct_categories(self):

103 """ 104 @return: a list containing one of each of the distinct syntactic 105 categories found among the signs in this set. 106 107 """ 108 return self._signs_by_category.keys()

109

110 - def get_signs_grouped_by_category(self):

111 """ 112 @rtype: list of lists of L{Sign<jazzparser.formalisms.base.SignBase>}s 113 @return: all the signs in the set, grouped into lists of 114 signs sharing the same syntactic category. 115 116 """ 117 return self._signs_by_category.values()

118

119 - def get_signs_by_category(self, category):

120 """ 121 @rtype: list of L{Sign<jazzparser.formalisms.base.SignBase>}s 122 @return: all the signs in the set that have a category equal 123 to that given. 124 125 """ 126 if category in self._signs_by_category: 127 return self._signs_by_category[category] 128 else: 129 return []

130

131 -class Chart(object):

132 """ 133 Represents a chart for use in CKY chart parsing. 134 A Chart stores a table of signs between node pairs. 135 It keeps a list of all the rules that can be applied. 136 137 There are no edges (i,i) and all edges are directed forward 138 (i.e. (i,j) where j>i). Internally, the table only 139 stores edges from each i to each j>i. 140 141 In the interface, however, edges are always referred to 142 by the nodes they go from and to. 143 144 Functions are contained in Chart for applying unary and binary rules. 145 146 You may instantiate a chart with no signs. You must still provide a 147 signs list, which will define the size of the chart, so you should 148 fill it with empty lists. 149 150 By default, chart.parses will only return atomic results, since 151 only these represent full parses. If you want all signs that span 152 the whole input, use chart.get_signs(0, end). If you decide that 153 complex results represent real parses, instantiate the chart with 154 allow_complex=True. 155 156 """ 157 HASH_SET_IMPL = SignHashSet 158

159 - def __init__(self, grammar, signs, derivations=False, hash_set_kwargs={}, allow_complex=False):

160 self.derivations = derivations 161 self.grammar = grammar 162 self.allow_complex = allow_complex 163 # For efficiency 164 self._all_brules_applied = {} 165 166 self.inspector = None 167 168 # Prepare the chart 169 self._table = [] 170 for x in range(len(signs)): 171 # Row for each node 172 self._table.append([]) 173 for y in range(x,len(signs)): 174 # Cell (column) for each node 175 # Cells are currently empty hash tables: will later put categories in here 176 self._table[x].append( 177 self.HASH_SET_IMPL(grammar.formalism, 178 derivation_traces=derivations, 179 **hash_set_kwargs)) 180 for i,sign_list in enumerate(signs): 181 if len(sign_list): 182 self.add_word_signs(sign_list,i)

183

184 - def __len__(self):

185 return len(self._table)

186 187 size = property(__len__) 188

189 - def _get_parses(self):

190 results = self._table[0][self.size-1].values() 191 if not self.allow_complex: 192 # Only return atomic categories: complex categories do not 193 # represent a full parse 194 def _is_atomic(sign): 195 return self.grammar.formalism.Syntax.is_atomic_category(sign.category)

196 results = filter(_is_atomic, results) 197 return results

198 parses = property(_get_parses) 199

200 - def get_signs(self, start, end):

201 """ 202 Gets a list of the signs in the chart between nodes start 203 and end. 204 """ 205 if end <= start: 206 logger.warning("Tried to get signs from %d to %d" % (start,end)) 207 return None 208 return self._table[start][end-start-1].values()

209

210 - def get_grouped_signs(self, start, end):

211 """ 212 Like L{get_signs}, but return a list of lists of signs, such 213 that every sign in a sublist has the same syntactic category. 214 215 """ 216 if end <= start: 217 logger.warning("Tried to get signs from %d to %d" % (start,end)) 218 return None 219 return self._table[start][end-start-1].get_signs_grouped_by_category()

220

221 - def get_sign(self, start, end, index):

222 """ 223 Returns the sign between start and end with the given index. 224 """ 225 signs = self.get_signs(start, end) 226 if signs is None or index >= len(signs): 227 return None 228 else: 229 return signs[index]

230

231 - def get_sign_pairs(self, start, middle, end):

232 """ 233 Gets a list of pairs (first,second) such that 234 first starts at start and ends at middle and second 235 starts at middle and ends at end. 236 """ 237 pairs = [] 238 firsts = self.get_signs(start, middle) 239 seconds = self.get_signs(middle, end) 240 for first in firsts: 241 for second in seconds: 242 pairs.append((first,second)) 243 return pairs

244

245 - def get_grouped_sign_pairs(self, start, middle, end):

246 """ 247 Like L{get_sign_pairs}, but instead of returning pairs of signs, 248 returns pairs of sign groups, where all the signs in the group 249 have the same syntactic category. 250 251 """ 252 firsts = self.get_grouped_signs(start, middle) 253 seconds = self.get_grouped_signs(middle, end) 254 return [(first,second) for first in firsts for second in seconds]

255

256 - def add_word_signs(self, signs, start_node, word, end_node=None):

257 """ 258 Adds a single-word categories in list "signs" to the chart for the word 259 starting at node C{start_node}. This may span more than one node, in 260 which case C{end_node} should be given as well. By default, C{end_node} 261 will be assumed to be C{start_node}+1. 262 263 """ 264 # Span is stored in the table internally as 265 # (start_node, end_node-start_node-1) 266 if end_node is None: 267 span_end = 0 268 else: 269 span_end = end_node-start_node-1 270 assert span_end >= 0 271 272 if self.derivations: 273 for sign in signs: 274 sign.derivation_trace = DerivationTrace(sign, word=word) 275 return self._table[start_node][span_end].extend(signs)

276

277 - def apply_unary_rules(self, start, end, *args, **kwargs):

278 """ 279 Adds to the chart all signs resulting from possible 280 applications of unary rules to existing signs between 281 nodes start and end. 282 283 Additional args/kwargs get passed on to L{apply_unary_rule}. 284 285 @return: True if signs were added as a result of rule application, 286 False otherwise 287 """ 288 signs_added = False 289 290 unary_rules = self.grammar.unary_rules 291 # Try applying each rule in turn 292 for rule in unary_rules: 293 added = self.apply_unary_rule(rule, start, end, *args, **kwargs) 294 if added: 295 signs_added = True 296 return signs_added

297

298 - def apply_unary_rule(self, rule, start, end, result_modifier=None):

299 """ 300 Applies a given unary rule to particular arcs and adds the 301 results to the chart. 302 303 @type result_modifier: 2-arg function 304 @param result_modifier: function to be applied to each result, 305 taking the result sign as the first argument and the input 306 sign as the second. 307 308 """ 309 signs_added = False 310 input_signs = self.get_signs(start, end) 311 # Apply to each existing sign 312 for sign in input_signs: 313 # Don't try applying unary rules more than once (they'll have the same results) 314 if not sign.check_rule_applied(rule): 315 # Get the possible results of applying the rule 316 results = rule.apply_rule([sign]) 317 # Check the rule was able to apply 318 if results is not None: 319 # If storing derivation traces, add them now 320 if self.derivations: 321 for result in results: 322 result.derivation_trace = DerivationTrace(result, rule, [sign.derivation_trace]) 323 # Apply a result modifier if one was given 324 if result_modifier is not None: 325 result_modifier(result, sign) 326 # Store the results in the table 327 added = self._table[start][end-start-1].extend(results) 328 # If that added anything, return True at the end 329 if added: 330 signs_added = True 331 # Note that the rule has now been applied 332 sign.note_rule_applied(rule) 333 return signs_added

334

335 - def _apply_binary_rule(self, rule, sign_pair):

336 """ 337 Internal method to apply a given binary rule to a given pair 338 of signs. Note that the supported interface method is 339 apply_binary_rule(), which applies a single rule to all sign 340 pairs between given nodes. 341 342 This is used internally by L{apply_binary_rule} and 343 L{apply_binary_rules}. 344 345 """ 346 if sign_pair[0].check_rule_applied(rule, sign_pair[1]): 347 # This sign pair has been combined by this binary rule with this input before. 348 # No need to do it again. If the application is possible, the 349 # result will be in the chart 350 return [] 351 # Get the possible results of applying the rule 352 results = rule.apply_rule(sign_pair) 353 # Note for future attempts that we've already done this 354 sign_pair[0].note_rule_applied(rule, sign_pair[1]) 355 if results is not None: 356 # If storing derivation traces, add them now 357 if self.derivations: 358 for result in results: 359 result.derivation_trace = DerivationTrace(result, rule, [sign.derivation_trace for sign in sign_pair]) 360 return results 361 else: 362 return []

363

364 - def _apply_binary_rule_semantics(self, rule, sign_pair, category):

365 """ 366 Like _apply_binary_rule, but uses the C{apply_rule_semantics()} 367 of the rule instead of C{apply_rule()} and returns a list of signs 368 built by copying the category and combining it in a sign with the 369 semantics of the result. 370 371 """ 372 # Get the possible results of applying the rule 373 results = rule.apply_rule_semantics(sign_pair) 374 if results is not None: 375 # Build signs from these and the category given 376 signs = [self.grammar.formalism.Syntax.Sign( 377 category.copy(), result) 378 for result in results] 379 # If storing derivation traces, add them now 380 if self.derivations: 381 for sign in signs: 382 sign.derivation_trace = DerivationTrace(sign, rule, [s.derivation_trace for s in sign_pair]) 383 return signs 384 else: 385 return []

386

387 - def apply_binary_rules(self, start, middle, end):

388 """ 389 Add to the chart all signs resulting from possible 390 applications of binary rules to pairs of signs between 391 node pairs (start,middle) and (middle,end), 392 producing entries in (start,end). 393 394 @return: True if signs were added as a result of rule application, 395 False otherwise 396 """ 397 signs_added = False 398 399 all_pair_results = [] 400 binary_rules = self.grammar.binary_rules 401 input_pairs = self.get_grouped_sign_pairs(start, middle, end) 402 # Apply to each pair of existing signs 403 for first_set,second_set in input_pairs: 404 # Apply each binary rule 405 for rule in binary_rules: 406 # Try applying the rule to the first sign in each of 407 # the groups. If this doesn't work, we can skip all the 408 # rest of the signs in the groups, since they all have 409 # the same syntactic category. 410 results = rule.apply_rule((first_set[0], second_set[0])) 411 if results is not None: 412 if len(results) == 1: 413 # There's only one syntactic result (this is the most 414 # common thing to happen). 415 # We only need to do the semantic part of all the other 416 # rule applications, because the category will be the 417 # same as this. 418 result_cat = results[0].category 419 for first_sign in first_set: 420 for second_sign in second_set: 421 # Apply the rule 422 pair_results = self._apply_binary_rule_semantics( 423 rule, 424 (first_sign,second_sign), 425 result_cat) 426 all_pair_results.extend(pair_results) 427 else: 428 # Rule application succeeded, so we need to do it for 429 # all the signs in the groups to get all the 430 # different semantics. 431 for first_sign in first_set: 432 for second_sign in second_set: 433 # Apply the rule 434 pair_results = self._apply_binary_rule(rule, (first_sign,second_sign)) 435 all_pair_results.extend(pair_results) 436 if len(all_pair_results) > 0: 437 # Add the resulting signs to the chart 438 added = self._table[start][end-start-1].extend(all_pair_results) 439 if added: 440 signs_added = True 441 return signs_added

442

443 - def apply_binary_rule(self, rule, start, middle, end):

444 """ 445 Apply a given binary rule to particular arcs in the chart. 446 447 Note that this method is not used by apply_binary_rules for 448 efficiency reasons, but apply_binary_rules simply does the same 449 thing for all possible binary rules. 450 451 """ 452 all_pair_results = [] 453 input_pairs = self.get_grouped_sign_pairs(start, middle, end) 454 signs_added = False 455 for first_set,second_set in input_pairs: 456 # Try applying the rule to the first of each set. If this 457 # fails, it will also fail for the rest. 458 results = rule.apply_rule((first_set[0], second_set[0])) 459 if results is not None: 460 # Apply the rule to all the pairs in the cross product 461 for first_sign in first_set: 462 for second_sign in second_set: 463 pair_results = self._apply_binary_rule(rule, (first_sign,second_sign)) 464 all_pair_results.extend(pair_results) 465 if len(all_pair_results) > 0: 466 # Store the results in the table 467 added = self._table[start][end-start-1].extend(all_pair_results) 468 if added: 469 signs_added = True 470 return signs_added

471

472 - def __str__(self):

473 return self.to_string()

474

475 - def to_string(self, rows=None, cols=None):

476 output = "" 477 table_size = len(self) 478 # Allow individual rows to be selected by the list rows 479 if rows is None: 480 rows = range(table_size) 481 # Allow individual cols to be selected by the list cols 482 if cols is None: 483 cols = range(1,table_size+1) 484 # Go through each row 485 for x in rows: 486 # Print a grid for each 487 output += "\nEdges starting at %d\n" % x 488 489 for y in cols: 490 # Check the row has a cell in column y 491 if y > x and y <= table_size: 492 col = y - x - 1 493 output += " (%d,%d): %s\n" % \ 494 (x,y,", ".join(["<%d> %s" % (i,self._sign_string(sign)) for i,sign in enumerate(self._table[x][col].values())])) 495 return output

496

497 - def _sign_string(self, sign):

498 return "%s" % sign

499

500 - def _get_summary(self):

501 """ 502 Returns a multi-line string that presents a brief summary of the chart 503 as it currently stands. This does not show all of the signs in the 504 chart, but just a summary of how many signs there are in each cell. 505 """ 506 summary = "\nF\\T" 507 for col in range(1,len(self)+1): 508 summary += "\t%d" % col 509 summary += "\n\n" 510 for row in range(len(self)): 511 summary += "%d\t" % row + \ 512 "-\t"*row + \ 513 "\t".join(["%s" % len(cell) for cell in self._table[row]]) + \ 514 "\n" 515 return summary

516 summary = property(_get_summary) 517

518 - def launch_inspector(self, input=None, block=False):

519 """ 520 Starts up a graphical chart inspector to inspect this chart. 521 The inspector will run in a separate thread. 522 Subclasses of Chart should override this if they have their own 523 version of the chart inspector and instantiate that instead. 524 525 @type input: list of strings 526 @param input: a string representation of the input to pass to 527 the inspector so it can display it. 528 529 """ 530 # Get rid of any existing inspector for this chart 531 self.kill_inspector() 532 from .inspector import ChartInspectorThread 533 inspector = ChartInspectorThread(self, input_strs=input) 534 self.inspector = inspector 535 if block: 536 inspector.run() 537 else: 538 inspector.start()

539

540 - def kill_inspector(self):

541 """ 542 Kills a currently-running inspector for this chart if one exists. 543 544 """ 545 if self.inspector is not None: 546 self.inspector.window.hide() 547 self.inspector.window.destroy() 548 self.inspector = None

549 550

551 -def list_union(main_list, new_entries, derivations=False):

552 """ 553 Adds all the entries in the list new_entries to the list 554 main_list, checking first that each entry does not already 555 exist in main_list and only adding it if it doesn't. 556 557 @return: True if members were added to the main list, False if main_list is 558 unchanged. 559 """ 560 added = False 561 for new_entry in new_entries: 562 if not new_entry in main_list: 563 main_list.append(new_entry) 564 added = True 565 elif derivations: 566 main_list[main_list.index(new_entry)].\ 567 derivation_trace.add_rules_from_trace(new_entry.derivation_trace) 568 569 return added

570

571 -class ChartError(Exception):

572 """ 573 Raised if something goes wrong while building or processing a chart. 574 """ 575 pass

576

577 -def dump_chart(chart, filename):

578 """ 579 Dump a pickled representation of the chart to a file. 580 581 """ 582 import cPickle as pickle 583 data = pickle.dumps(chart) 584 file = open(filename, 'w') 585 file.write(data) 586 file.close()

587

588 -def load_chart(filename):

589 """ 590 Read in a chart file that was dumped using L{dump_chart}. 591 592 @rtype: L{Chart} 593 @return: the chart instance 594 595 """ 596 import cPickle as pickle 597 cfile = open(filename, 'r') 598 data = cfile.read() 599 cfile.close() 600 return pickle.loads(data)

601

Source Code for Module jazzparser.parsers.cky.chart