Package jazzparser :: Package data :: Module trees
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.data.trees

  1  """ 
  2  Building derivation trees from annotated chord sequences. 
  3   
  4  Trees are minimally represented in the database. Each chord may define  
  5  certain pieces of additional tree information which influence the  
  6  way its tree structure is built. 
  7   
  8  A default tree structure is defined by a left-to-right parsing algorithm, 
  9  which applies rules as it may in a particular order. The algorithm does  
 10  not parse properly, but uses only the slash direction information to  
 11  decide what rule to apply. 
 12  Essentially this results in trees in which slash categories are always  
 13  composed as soon as possible and then applied. Consecutive atomic  
 14  categories must unambiguously be combined by continuation. 
 15  Coordination will never be used by the default trees. A minimal amount  
 16  of information (the end of each coordinated constituent) can be  
 17  specified on the chords to prompt the tree to use coordination. The  
 18  first constituent is always as long as possible (which is a reasonable  
 19  arbitrary canonical tree). 
 20   
 21  Note that this should not operate directly on sequences read from the  
 22  database, but on their mirrors (see jazzparser.data.db_mirrors), since 
 23  this allows it potentially to be used independently of the database. 
 24   
 25  """ 
 26  """ 
 27  ============================== License ======================================== 
 28   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 29    
 30   This file is part of The Jazz Parser. 
 31    
 32   The Jazz Parser is free software: you can redistribute it and/or modify 
 33   it under the terms of the GNU General Public License as published by 
 34   the Free Software Foundation, either version 3 of the License, or 
 35   (at your option) any later version. 
 36    
 37   The Jazz Parser is distributed in the hope that it will be useful, 
 38   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 39   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 40   GNU General Public License for more details. 
 41    
 42   You should have received a copy of the GNU General Public License 
 43   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 44   
 45  ============================ End license ====================================== 
 46   
 47  """ 
 48  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 
 49   
 50  import jazzparser.settings as jazzsettings 
 51  from jazzparser.grammar import get_grammar 
 52  from jazzparser.utils.strings import strs 
 53   
54 -class GeneralizedCategory(object):
55 """ 56 Partially represents a CCG category in a simple way. This only 57 contains the information that's important to inferring the tree 58 structure. It's a heavily abstracted form of CCG. 59 60 """ 61 tree = None 62
63 - def is_complex(self):
64 return isinstance(self, SlashCategory)
65 complex = property(is_complex) 66
67 - def __repr__(self):
68 return str(self)
69
70 -class SlashCategory(GeneralizedCategory):
71 - def __init__(self, result, forward, argument):
72 self.result = result 73 self.forward = forward 74 self.argument = argument
75
76 - def __str__(self):
77 return "%s%s%s" % (self.result, self.forward and "/" or "\\", self.argument)
78
79 - def copy(self):
80 return SlashCategory( 81 self.result.copy(), 82 self.forward, 83 self.argument.copy())
84
85 - def __eq__(self, other):
86 return type(self) == type(other) and \ 87 self.result == other.result and \ 88 self.argument == other.argument and \ 89 self.forward == other.forward
90
91 -class AtomicCategory(GeneralizedCategory):
92 - def __init__(self):
93 pass
94
95 - def __str__(self):
96 return "X"
97
98 - def copy(self):
99 return AtomicCategory()
100
101 - def __eq__(self, other):
102 return type(self) == type(other)
103
104 -class UnknownCategory(GeneralizedCategory):
105 """ 106 The categories of some chords are unknown and not specified in the 107 annotation. We still need to parse the rest of the sequence, so 108 we just leave this in the tree as a marker of the unknown category. 109 110 """
111 - def __str__(self):
112 return "?"
113
114 - def copy(self):
115 return UnknownCategory()
116
117 - def __eq__(self, other):
118 # Can't be equal if we don't know what we are! 119 return False
120
121 -class StackMarker(object):
122 """ 123 Special type to place markers among the categories in the stack. 124 125 """ 126 pass
127
128 -class CoordinationMiddleMarker(StackMarker):
129 """ 130 Marks the end of the first part of a coordination. 131 132 """
133 - def __str__(self):
134 return "&"
135 __repr__ = __str__
136 137
138 -class SyntacticTreeNode(object):
139 """ 140 Superclass of nodes in syntactic trees. These classes are used to 141 represent the trees that we build using basic category information 142 with minimal manually-specified structural ambiguity resolution. 143 144 """ 145 pass
146
147 -class SyntacticNonTerminal(SyntacticTreeNode):
148 """ 149 An internal node in a syntactic tree. 150 151 """
152 - def __init__(self, children, rule):
153 self.children = children 154 self.rule = rule
155
156 - def __str__(self):
157 return "(%s)%s" % (" ".join(["%s" % c for c in self.children]),self.rule)
158
159 - def _get_span_length(self):
160 return sum([c.span_length for c in self.children])
161 span_length = property(_get_span_length)
162
163 -class SyntacticTerminal(SyntacticTreeNode):
164 """ 165 A terminal node (leaf) in a syntactic tree. 166 167 """
168 - def __init__(self, chord, category=None):
169 self.chord = chord 170 self.category = category
171
172 - def __str__(self):
173 return str(self.chord)
174 175 span_length = 1
176
177 -class SyntacticTreeRoot(object):
178 """ 179 The root of a syntactic tree. Note that this may be a parent node 180 of multiple tree unrelated by rule applications, if the derivation 181 did not build a single tree. 182 183 """
184 - def __init__(self, children, shift_reduce=None):
185 self.children = children 186 self.shift_reduce = shift_reduce
187
188 - def __str__(self):
189 return "Trees:%s" % " | ".join([str(tree) for tree in self.children])
190 191 192 ################### Grammatical rules for generalized categories ########## 193 """ 194 These are a very simple form of the grammatical rules we use in the 195 Music Keyspan formalism. They just do category structure checks. 196 197 """
198 -def attach_tree(cat, inputs, rule):
199 """ 200 Attach a non-terminal tree to a category on the basis of the inputs 201 from which it was built. 202 """ 203 cat.tree = SyntacticNonTerminal([c.tree for c in inputs], rule)
204
205 -def _comp(dir, stack):
206 """ Generic composition (use compf or compb). """ 207 if len(stack) < 2: 208 return False 209 if not isinstance(stack[-1], GeneralizedCategory) or \ 210 not isinstance(stack[-2], GeneralizedCategory): 211 return False 212 if not isinstance(stack[-2], SlashCategory) or \ 213 not isinstance(stack[-1], SlashCategory) or \ 214 not stack[-2].forward == dir or not stack[-1].forward == dir: 215 return False 216 else: 217 cat1 = stack.pop() 218 cat0 = stack.pop() 219 stack.append(SlashCategory(cat0.result.copy(), dir, cat1.argument.copy())) 220 attach_tree(stack[-1], [cat0, cat1], dir and "compf" or "compb") 221 return True
222
223 -def compf(stack):
224 """ Forward composition """ 225 return _comp(True, stack)
226 compf.name = "compf" 227
228 -def compb(stack):
229 """ Backward composition """ 230 return _comp(False, stack)
231 compb.name = "compb" 232
233 -def appf(stack):
234 """ Forward application """ 235 if len(stack) < 2: 236 return False 237 if not isinstance(stack[-1], GeneralizedCategory) or \ 238 not isinstance(stack[-2], GeneralizedCategory): 239 return False 240 if not isinstance(stack[-2], SlashCategory) or \ 241 not stack[-2].forward or \ 242 not stack[-2].argument == stack[-1]: 243 return False 244 else: 245 cat1 = stack.pop() 246 cat0 = stack.pop() 247 stack.append(cat0.result.copy()) 248 attach_tree(stack[-1], [cat0, cat1], "appf") 249 return True
250 appf.name = "appf" 251
252 -def appb(stack):
253 """ Backward application """ 254 if len(stack) < 2: 255 return False 256 if not isinstance(stack[-1], GeneralizedCategory) or \ 257 not isinstance(stack[-2], GeneralizedCategory): 258 return False 259 if not isinstance(stack[-1], SlashCategory) or \ 260 stack[-1].forward or \ 261 not stack[-1].argument == stack[-2]: 262 return False 263 else: 264 cat1 = stack.pop() 265 cat0 = stack.pop() 266 stack.append(cat1.result.copy()) 267 attach_tree(stack[-1], [cat0, cat1], "appb") 268 return True
269 appb.name = "appb" 270
271 -def cont(stack):
272 """ Tonic continuation """ 273 if len(stack) < 2: 274 return False 275 if not isinstance(stack[-1], GeneralizedCategory) or \ 276 not isinstance(stack[-2], GeneralizedCategory): 277 return False 278 if not isinstance(stack[-2], AtomicCategory) or \ 279 not isinstance(stack[-1], AtomicCategory): 280 return False 281 else: 282 cat1 = stack.pop() 283 cat0 = stack.pop() 284 stack.append(AtomicCategory()) 285 attach_tree(stack[-1], [cat0, cat1], "cont") 286 return True
287 cont.name = "cont" 288
289 -def coord(stack):
290 """ Cadence coordination """ 291 # stack should look like this: [... cadence, marker, cadence] 292 if len(stack) < 3: 293 return False 294 if not isinstance(stack[-1], GeneralizedCategory) or \ 295 not isinstance(stack[-3], GeneralizedCategory) or \ 296 not isinstance(stack[-2], CoordinationMiddleMarker): 297 return False 298 if not isinstance(stack[-3], SlashCategory) or \ 299 not isinstance(stack[-1], SlashCategory) or \ 300 not stack[-3].forward or not stack[-1].forward: 301 return False 302 else: 303 cat1 = stack.pop() 304 marker = stack.pop() 305 cat0 = stack.pop() 306 stack.append(cat0.copy()) 307 attach_tree(stack[-1], [cat0, cat1], "coord") 308 return True
309 coord.name = "coord" 310 311 312
313 -def generalize_category(category, formalism):
314 """ 315 Builds a simple GeneralizedCategory from a real grammatical category 316 as represented in the parser (see the Music Keyspan formalism). 317 318 """ 319 if isinstance(category, GeneralizedCategory): 320 return category 321 # Allow the formalism to specify its own way of handling as much 322 # as it likes 323 if hasattr(formalism.Syntax, "pre_generalize_category"): 324 gencat = formalism.Syntax.pre_generalize_category(category) 325 # It can return None to pass it back to the standard 326 # generalization below 327 if gencat is not None: 328 return gencat 329 330 # Put an unknown category marker in the stack where the category isn't given 331 if category is None: 332 return UnknownCategory() 333 if formalism.Syntax.is_atomic_category(category): 334 return AtomicCategory() 335 else: 336 result = generalize_category(category.result, formalism) 337 argument = generalize_category(category.argument, formalism) 338 return SlashCategory(result, category.slash.forward, argument)
339 340
341 -def build_tree_for_sequence(sequence, debug_stack=False, grammar=None, logger=None):
342 """ 343 Run through the motions of parsing the sequence in order to build 344 its tree structure. Most of the structure is implicit in the 345 lexical categories. Additional information is given in the TreeInfo 346 model, associated with chords. 347 348 """ 349 # Read in the possible categories from the grammar 350 if grammar is None: 351 grammar = get_grammar() 352 # This function will format a string and output it to a logger if logging 353 if logger is None: 354 def _log(*args): 355 pass
356 else: 357 def _log(string, *args): 358 string = string % args 359 logger.info(string) 360 361 input = [] 362 shift_reduce = [] 363 364 categories = [] 365 for chord in sequence.iterator(): 366 # Try getting a family for the specified category 367 if chord.category is None or chord.category == "": 368 category = None 369 cat_name = None 370 else: 371 if chord.category not in grammar.families: 372 raise TreeBuildError, "Could not find the category %s in "\ 373 "the lexicon" % chord.category 374 # Assume there's only one entry per family, or at least that if 375 # there are multiple they have the same argument structure. 376 category = grammar.families[chord.category][0].entries[0].sign.category 377 cat_name = chord.category 378 # Put the generalized form of the category into the stack 379 gen_cat = generalize_category(category, grammar.formalism) 380 # Attached a tree leaf to this chord 381 gen_cat.tree = SyntacticTerminal(chord, category=cat_name) 382 input.append(gen_cat) 383 categories.append("%s <= %s" % (chord,category)) 384 _log("CATEGORIES %s", categories) 385 386 input = list(reversed(input)) 387 stack = [] 388 rules = [ compf, compb, appf, appb, cont ] 389 # Now do the vague pseudo-parse 390 while len(input) > 0: 391 # SHIFT 392 shift_reduce.append("S") 393 stack.append(input.pop()) 394 if debug_stack: 395 print stack 396 _log("SHIFT stack = %s, input = %s", stack, input) 397 # Use the additional information given to us to override default 398 # rule applications 399 coord_unresolved = False 400 coord_resolved = False 401 if stack[-1].tree.chord.treeinfo.coord_unresolved: 402 # This is the end of the first part of a coordination. 403 # Continue reducing, but add a special marker afterwards 404 coord_unresolved = True 405 if stack[-1].tree.chord.treeinfo.coord_resolved: 406 # The end of the second part of a coordination. 407 # Continue reducing, then apply coordination 408 coord_resolved = True 409 410 # REDUCE 411 # Try combining the top categories on the stack 412 changed = True 413 while changed: 414 changed = False 415 # Try each rule and see whether it applies 416 for rule in rules: 417 res = rule(stack) 418 if res: 419 shift_reduce.append("R(%s)" % rule.name) 420 changed = True 421 _log("REDUCE %s, stack = %s", rule.name, stack) 422 423 if coord_resolved: 424 # Try to reduce the coordination 425 coord(stack) 426 if coord_unresolved: 427 # Add a special marker to the stack so we know where the 428 # coordination began 429 stack.append(CoordinationMiddleMarker()) 430 for cat in stack: 431 if isinstance(cat, CoordinationMiddleMarker): 432 raise TreeBuildError, "Coordination middle marker not "\ 433 "matched by an end marker. Stack: %s" % strs(stack, ", ") 434 tree = SyntacticTreeRoot([cat.tree for cat in stack], shift_reduce=shift_reduce) 435 return tree 436
437 -def tree_to_nltk(intree):
438 """ 439 Given a syntactic tree using the SyntacticTreeNode classes, 440 produces an NLTK tree. 441 This is useful, for example, to display a tree: tree.draw(). 442 You need NLTK installed to be able to do this. 443 444 """ 445 from jazzparser.utils.base import load_optional_package 446 tree = load_optional_package('nltk.tree', 'NLTK', 'doing NLTK tree operations') 447 def _node_to_nltk_tree(node): 448 if isinstance(node, SyntacticNonTerminal): 449 # Recursive case: build an internal node 450 children = [] 451 for child in node.children: 452 children.append(_node_to_nltk_tree(child)) 453 return tree.Tree("%s" % node.rule, children) 454 elif isinstance(node, SyntacticTreeRoot): 455 # Recursive case (root): build an internal node 456 children = [] 457 for child in node.children: 458 children.append(_node_to_nltk_tree(child)) 459 return tree.Tree("Root", children) 460 elif isinstance(node, SyntacticTerminal): 461 # Base case: build a leaf 462 leaf = tree.Tree("%s" % node.chord, []) 463 if node.category is not None: 464 # Add an extra node to represent the category with one child 465 leaf = tree.Tree("%s" % node.category, [leaf]) 466 return leaf
467 return _node_to_nltk_tree(intree) 468 469
470 -class TreeBuildError(Exception):
471 pass
472