Package jazzparser :: Package data :: Module dependencies
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.data.dependencies

  1  """Data structures for dependency graphs. 
  2   
  3  """ 
  4  """ 
  5  ============================== License ======================================== 
  6   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
  7    
  8   This file is part of The Jazz Parser. 
  9    
 10   The Jazz Parser is free software: you can redistribute it and/or modify 
 11   it under the terms of the GNU General Public License as published by 
 12   the Free Software Foundation, either version 3 of the License, or 
 13   (at your option) any later version. 
 14    
 15   The Jazz Parser is distributed in the hope that it will be useful, 
 16   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18   GNU General Public License for more details. 
 19    
 20   You should have received a copy of the GNU General Public License 
 21   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 22   
 23  ============================ End license ====================================== 
 24   
 25  """ 
 26  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 
 27   
28 -class DependencyGraph(object):
29 """ 30 Data structure to represent dependency graphs. Each dependency arc is 31 represented as a pair of indices, with a label. 32 33 Node index 0 has a special meaning: it is the root node. 34 35 """
36 - def __init__(self, arcs=[], words=[]):
37 self.arcs = [] 38 self.nodes = [] 39 self.words = words 40 # For efficient lookup, index arcs by start and end edges 41 self._arcs_by_source = {} 42 self._arcs_by_target = {} 43 44 # Add each arc one by one to get all the indexing right 45 for (source,target,label) in arcs: 46 self.add_arc(source, target, label)
47
48 - def __str__(self):
49 if self.words: 50 words = "%s\n" % " ".join(self.words) 51 else: 52 words = "" 53 def _node(i): 54 if i == 0: 55 return "ROOT" 56 else: 57 if i-1 < len(self.words): 58 return "%d:%s" % (i, self.words[i-1]) 59 else: 60 return str(i)
61 return "%s[%s]" % (words, "\n ".join("%s -> %s (%s)" % (_node(source), 62 _node(target), 63 label) \ 64 for (source,target,label) in sorted(self.arcs)))
65
66 - def __len__(self):
67 return len(self.arcs)
68
69 - def add_arc(self, source, target, label):
70 if (source, target, label) not in self: 71 self.arcs.append((source, target, label)) 72 if source not in self.nodes: 73 self.nodes.append(source) 74 if target not in self.nodes: 75 self.nodes.append(target) 76 77 self._arcs_by_source.setdefault(source, []).append((source, target, label)) 78 self._arcs_by_target.setdefault(target, []).append((source, target, label))
79
80 - def remove_arc(self, source, target, label):
81 if (source, target, label) in self: 82 self.arcs.remove((source, target, label)) 83 del self._arcs_by_source[source] 84 del self._arcs_by_target[target]
85
86 - def arcs_from(self, source):
87 return self._arcs_by_source.get(source, [])
88
89 - def arcs_to(self, target):
90 return self._arcs_by_target.get(target, [])
91
92 - def arcs_spanning(self, source, target):
93 return [arc for arc in self.arcs_from(source) if arc[1] == target]
94
95 - def root_arcs(self):
96 """ 97 Returns all the arcs coming directly from the root node. 98 99 """ 100 return self.arcs_from(0)
101
102 - def __contains__(self, arc):
103 return arc in self.arcs
104 105 106 ########################### Input/output ###################
107 -def malt_tab_to_dependency_graphs(data):
108 """ 109 Reads in data in the Malt-TAB format, as used by the Malt Parser, 110 and returns a list of dependency graphs. 111 112 A error will be raised if the input is in two-column format, because 113 you can't build a dependency graph from that. 114 115 @type data: string 116 @param data: file content 117 118 @see: http://w3.msi.vxu.se/~nivre/research/MaltXML.html 119 120 """ 121 lines = data.splitlines() 122 graph_arcs = [] 123 124 current_arcs = [] 125 for line in lines: 126 if len(line.strip()) == 0: 127 # Empty line: start a new graph 128 if len(current_arcs): 129 graph_arcs.append(current_arcs) 130 current_arcs = [] 131 continue 132 133 # Interpret the line by splitting on tabs 134 parts = line.split("\t") 135 # The line may have two parts (form and pos tag) or four 136 if len(parts) == 2: 137 raise MaltTabReadError, "can't construct a dependency graph "\ 138 "out of a two-column malt-tab file" 139 elif len(parts) != 4: 140 raise MaltTabReadError, "each line may have 2 or 4 columns, "\ 141 "found %d in \"%s\"" % (len(parts), line) 142 else: 143 word = parts[0] 144 pos = parts[1] 145 if parts[2]: 146 head = int(parts[2]) 147 else: 148 head = None 149 label = parts[3] 150 current_arcs.append((word,pos,head,label)) 151 152 # Finish the last graph 153 if len(current_arcs): 154 graph_arcs.append(current_arcs) 155 156 # Read all the lines successfully 157 # Produce a representation of each graph 158 graphs = [] 159 for arcset in graph_arcs: 160 words = [arc[0] for arc in arcset] 161 arcs = [(i+1,arc[2],arc[3]) for (i,arc) in enumerate(arcset)] 162 graphs.append(DependencyGraph(arcs, words)) 163 164 return graphs
165
166 -class MaltTabReadError(Exception):
167 pass
168
169 -def dependency_graph_to_latex(graph, words=[], number_nodes=False, 170 fmt_lab=str, graph_id=None, extra_rows=[]):
171 """ 172 Output a latex representation of the dependency graph to typeset it 173 using tikz-dependency. 174 175 """ 176 if number_nodes: 177 words = [str(i) for i in range(len(graph.nodes)-1)] 178 else: 179 if len(words): 180 words = list(words) 181 else: 182 words = graph.words 183 if len(words) < len(graph.nodes)+1: 184 words.extend([""]*(len(graph.nodes)-len(words)-1)) 185 if len(extra_rows): 186 # Add blanks to the extra rows as well 187 extra_rows = [ 188 row + [""]*(len(graph.nodes)-len(row)-1) for row in extra_rows] 189 190 nmap = dict([(old,new) for (new,old) in enumerate(sorted(graph.nodes))]) 191 192 deptext = " \\& ".join(words) + "\\\\\n" 193 if len(extra_rows): 194 deptext += "\\\\\n".join("\\& ".join(row) for row in extra_rows) 195 196 edges = "\n".join([ 197 " \\deproot{%d}{%s}" % (nmap[start],fmt_lab(label)) if end == 0 else \ 198 " \\depedge{%d}{%d}{%s}" % (nmap[end],nmap[start],fmt_lab(label)) for (start,end,label) \ 199 in sorted(graph.arcs)]) 200 201 options = [] 202 if graph_id is not None: 203 # Set a custom id for the graph, so we can refer to it elsewhere 204 options.append(", dep id=%s" % graph_id) 205 206 string = """\ 207 \\begin{dependency}[theme = simple%(opts)s] 208 \\begin{deptext}[column sep=0.6em] 209 %(words)s \\\\ 210 \\end{deptext} 211 %(edges)s 212 \\end{dependency}""" % { 213 'words' : deptext, 214 'edges' : edges, 215 'opts' : "".join(options) } 216 return string
217 218 219 ########################### Alignment-related utilities ################## 220
221 -def optimal_node_alignment(graph1, graph2, label_compare=(lambda x,y:x==y)):
222 """ 223 Produces the alignment between the nodes of the two dependency graphs that 224 maximizes the shared dependencies. 225 226 Returns a list of aligned pairs of node indices, using C{None} to 227 represent deletions/insertions. 228 229 """ 230 N = len(graph1.nodes) 231 M = len(graph2.nodes) 232 indices1 = list(enumerate(sorted(graph1.nodes))) 233 indices2 = list(enumerate(sorted(graph2.nodes))) 234 root1 = min(graph1.nodes) 235 root2 = min(graph2.nodes) 236 237 # Initialize a score matrix 238 S = [] 239 P = [] 240 for x in range(N): 241 row = [] 242 prow = [] 243 for y in range(M): 244 row.append([]) 245 prow.append([]) 246 S.append(row) 247 P.append(prow) 248 249 # Utility functions 250 def _append(lst, val): 251 if val not in lst: 252 lst.append(val)
253 def addS(x, y, val, pointer): 254 # If a value (same score, same dependencies) is already there, don't 255 # add it again 256 if val not in S[x][y]: 257 S[x][y].append(val) 258 # Also add a pointer so we can trace the alignment back 259 P[x][y].append(pointer) 260 def removeS(x, y, index): 261 # Remove the value from S and pointers P 262 del S[x][y][index] 263 del P[x][y][index] 264 265 # First nodes should always be aligned: these are the root nodes 266 # Find dependencies to or from root 267 root_deps = [] 268 for (source1,target1,label1) in graph1.arcs_from(root1): 269 for (source2,target2,label2) in graph2.arcs_from(root2): 270 if label_compare(label1, label2): 271 # If we align the nodes at the other end of this 272 # arc, we match a dependency 273 _append(root_deps, (target1,target2)) 274 for (source1,target1,label1) in graph1.arcs_to(root1): 275 for (source2,target2,label2) in graph2.arcs_to(root2): 276 if label_compare(label1, label2): 277 # If we align the nodes at the other end of this 278 # arc, we match a dependency 279 _append(root_deps, (source1,source2)) 280 # We don't propogate these potential alignments through the table, since 281 # they're the same everywhere 282 283 # Initialize (0,0) to empty 284 # This will get propogated along the first row and column 285 addS(0, 0, (0, []), ('UL',0)) 286 287 for x,node1 in indices1: 288 for y,node2 in indices2: 289 if x > 0: 290 # Insertion in graph1 291 for i,(score,deps) in enumerate(S[x-1][y]): 292 # Remove dependencies that can't possibly be match now 293 deps = [(goal1, goal2) for (goal1, goal2) in deps if \ 294 goal1 != node1] 295 # Don't add anything to the score for this 296 addS(x, y, (score,deps), ('U',i)) 297 298 if y > 0: 299 # Insertion in graph2 300 for i,(score,deps) in enumerate(S[x][y-1]): 301 # Remove dependencies that can't possibly be match now 302 deps = [(goal1, goal2) for (goal1, goal2) in deps if \ 303 goal2 != node2] 304 # Don't add anything to the score for this 305 addS(x, y, (score,deps), ('L',i)) 306 307 if x > 0 and y > 0: 308 # Alignment 309 for i,(score,deps) in enumerate(S[x-1][y-1]): 310 # Count how many dependencies were satisfied by this alignment 311 matched = 0 312 new_deps = [] 313 for (goal1, goal2) in deps: 314 if goal1 == node1 and goal2 == node2: 315 # A required match to match a dependency arc 316 matched += 1 317 elif goal1 > node1 and goal2 > node2: 318 # Keep looking for this 319 new_deps.append((goal1, goal2)) 320 # Check if we've matched any root arcs by this alignment 321 matched += root_deps.count((node1,node2)) 322 323 # Find dependencies that might later be matched thanks to 324 # this alignment 325 for (source1,target1,label1) in graph1.arcs_from(node1): 326 for (source2,target2,label2) in graph2.arcs_from(node2): 327 if label_compare(label1, label2) and target1 > node1 and target2 > node2: 328 # If we align the nodes at the other end of this 329 # arc, we match a dependency 330 new_deps.append((target1,target2)) 331 332 # Do the same for dependencies pointing backwards to here 333 for (source1,target1,label1) in graph1.arcs_to(node1): 334 for (source2,target2,label2) in graph2.arcs_to(node2): 335 if label_compare(label1, label2) and source1 > node1 and source2 > node2: 336 # If we align the nodes at the other end of this 337 # arc, we match a dependency 338 new_deps.append((source1,source2)) 339 340 addS(x, y, (score+matched, new_deps), ('UL',i)) 341 342 # Remove from S[x][y] any (score,deps) where the maximum 343 # possible score, score+len(deps), <= some other option's 344 # minimum score 345 # Find the option with the highest minimum score 346 max_min_score,top_scorer = max((score,i) for i,(score,deps) in enumerate(S[x][y])) 347 # Get indices to remove 348 # Make sure not to remove the one that had the max min score 349 remove = [i for i,(score,deps) in enumerate(S[x][y]) if \ 350 score+len(deps) <= max_min_score \ 351 and i != top_scorer] 352 shifter = 0 353 for i in remove: 354 removeS(x, y, i-shifter) 355 shifter += 1 356 357 # Trace back through the pointers to find operations that make the alignment 358 index,(score,deps) = max(enumerate(S[-1][-1]), key=lambda x:x[1][0]) 359 x = N-1 360 y = M-1 361 ops = [] 362 while not (x == 0 and y == 0): 363 direction,index = P[x][y][index] 364 if direction == 'U': 365 x -= 1 366 ops.append('I1') 367 elif direction == 'L': 368 y -= 1 369 ops.append('I2') 370 else: 371 y -= 1 372 x -= 1 373 ops.append('A') 374 # Always align the root nodes with each other 375 ops.append('A') 376 ops.reverse() 377 378 # Pair up the node indices according to these operations 379 nodes1it = iter(sorted(graph1.nodes)) 380 nodes2it = iter(sorted(graph2.nodes)) 381 pairs = [] 382 for op in ops: 383 if op == "A": 384 # Take one from both graphs 385 pairs.append((nodes1it.next(), nodes2it.next())) 386 elif op == "I1": 387 # Insertion in graph1 388 pairs.append((nodes1it.next(), None)) 389 else: 390 # Insertion in graph2 391 pairs.append((None, nodes2it.next())) 392 393 return pairs 394
395 -def alignment_to_graph(node_pairs, graph1, graph2, label_compare=(lambda x,y:x==y)):
396 """ 397 Given a list of pairs of aligned nodes, as returned by 398 L{optimal_node_alignment}, produces the dependency graph that contains 399 only the shared dependencies. The node indices do not, of course, 400 correspond to those in the input graphs, so we also return a mapping 401 from the node indices in the common graph to those in each of the 402 other graphs. 403 404 """ 405 arcs = [] 406 # Filter out pairs that don't align the two graphs 407 node_pairs = [(n1,n2) for (n1,n2) in node_pairs if n1 is not None and n2 is not None] 408 g1_to_g2 = dict(node_pairs) 409 new_nodes = dict([(node1,i) for i,(node1,node2) in enumerate(node_pairs)]) 410 new_nodes2 = dict([(node2,i) for i,(node1,node2) in enumerate(node_pairs)]) 411 412 for source1,target1,label1 in graph1.arcs: 413 # Check whether the other graph has a corresponding arc 414 if source1 in g1_to_g2: 415 source2 = g1_to_g2[source1] 416 else: 417 continue 418 419 if target1 in g1_to_g2: 420 target2 = g1_to_g2[target1] 421 else: 422 continue 423 424 matching = graph2.arcs_spanning(source2, target2) 425 for (__,__,label2) in matching: 426 if label_compare(label1, label2): 427 # This arc matches between the two 428 # Add an arc to the common graph 429 arcs.append((new_nodes[source1], new_nodes[target1], label1)) 430 431 return DependencyGraph(arcs), new_nodes, new_nodes2
432