1 """Data structures for dependency graphs.
2
3 """
4 """
5 ============================== License ========================================
6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
7
8 This file is part of The Jazz Parser.
9
10 The Jazz Parser is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 The Jazz Parser is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
22
23 ============================ End license ======================================
24
25 """
26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
27
29 """
30 Data structure to represent dependency graphs. Each dependency arc is
31 represented as a pair of indices, with a label.
32
33 Node index 0 has a special meaning: it is the root node.
34
35 """
37 self.arcs = []
38 self.nodes = []
39 self.words = words
40
41 self._arcs_by_source = {}
42 self._arcs_by_target = {}
43
44
45 for (source,target,label) in arcs:
46 self.add_arc(source, target, label)
47
49 if self.words:
50 words = "%s\n" % " ".join(self.words)
51 else:
52 words = ""
53 def _node(i):
54 if i == 0:
55 return "ROOT"
56 else:
57 if i-1 < len(self.words):
58 return "%d:%s" % (i, self.words[i-1])
59 else:
60 return str(i)
61 return "%s[%s]" % (words, "\n ".join("%s -> %s (%s)" % (_node(source),
62 _node(target),
63 label) \
64 for (source,target,label) in sorted(self.arcs)))
65
68
69 - def add_arc(self, source, target, label):
70 if (source, target, label) not in self:
71 self.arcs.append((source, target, label))
72 if source not in self.nodes:
73 self.nodes.append(source)
74 if target not in self.nodes:
75 self.nodes.append(target)
76
77 self._arcs_by_source.setdefault(source, []).append((source, target, label))
78 self._arcs_by_target.setdefault(target, []).append((source, target, label))
79
81 if (source, target, label) in self:
82 self.arcs.remove((source, target, label))
83 del self._arcs_by_source[source]
84 del self._arcs_by_target[target]
85
87 return self._arcs_by_source.get(source, [])
88
90 return self._arcs_by_target.get(target, [])
91
93 return [arc for arc in self.arcs_from(source) if arc[1] == target]
94
96 """
97 Returns all the arcs coming directly from the root node.
98
99 """
100 return self.arcs_from(0)
101
103 return arc in self.arcs
104
105
106
108 """
109 Reads in data in the Malt-TAB format, as used by the Malt Parser,
110 and returns a list of dependency graphs.
111
112 A error will be raised if the input is in two-column format, because
113 you can't build a dependency graph from that.
114
115 @type data: string
116 @param data: file content
117
118 @see: http://w3.msi.vxu.se/~nivre/research/MaltXML.html
119
120 """
121 lines = data.splitlines()
122 graph_arcs = []
123
124 current_arcs = []
125 for line in lines:
126 if len(line.strip()) == 0:
127
128 if len(current_arcs):
129 graph_arcs.append(current_arcs)
130 current_arcs = []
131 continue
132
133
134 parts = line.split("\t")
135
136 if len(parts) == 2:
137 raise MaltTabReadError, "can't construct a dependency graph "\
138 "out of a two-column malt-tab file"
139 elif len(parts) != 4:
140 raise MaltTabReadError, "each line may have 2 or 4 columns, "\
141 "found %d in \"%s\"" % (len(parts), line)
142 else:
143 word = parts[0]
144 pos = parts[1]
145 if parts[2]:
146 head = int(parts[2])
147 else:
148 head = None
149 label = parts[3]
150 current_arcs.append((word,pos,head,label))
151
152
153 if len(current_arcs):
154 graph_arcs.append(current_arcs)
155
156
157
158 graphs = []
159 for arcset in graph_arcs:
160 words = [arc[0] for arc in arcset]
161 arcs = [(i+1,arc[2],arc[3]) for (i,arc) in enumerate(arcset)]
162 graphs.append(DependencyGraph(arcs, words))
163
164 return graphs
165
168
171 """
172 Output a latex representation of the dependency graph to typeset it
173 using tikz-dependency.
174
175 """
176 if number_nodes:
177 words = [str(i) for i in range(len(graph.nodes)-1)]
178 else:
179 if len(words):
180 words = list(words)
181 else:
182 words = graph.words
183 if len(words) < len(graph.nodes)+1:
184 words.extend([""]*(len(graph.nodes)-len(words)-1))
185 if len(extra_rows):
186
187 extra_rows = [
188 row + [""]*(len(graph.nodes)-len(row)-1) for row in extra_rows]
189
190 nmap = dict([(old,new) for (new,old) in enumerate(sorted(graph.nodes))])
191
192 deptext = " \\& ".join(words) + "\\\\\n"
193 if len(extra_rows):
194 deptext += "\\\\\n".join("\\& ".join(row) for row in extra_rows)
195
196 edges = "\n".join([
197 " \\deproot{%d}{%s}" % (nmap[start],fmt_lab(label)) if end == 0 else \
198 " \\depedge{%d}{%d}{%s}" % (nmap[end],nmap[start],fmt_lab(label)) for (start,end,label) \
199 in sorted(graph.arcs)])
200
201 options = []
202 if graph_id is not None:
203
204 options.append(", dep id=%s" % graph_id)
205
206 string = """\
207 \\begin{dependency}[theme = simple%(opts)s]
208 \\begin{deptext}[column sep=0.6em]
209 %(words)s \\\\
210 \\end{deptext}
211 %(edges)s
212 \\end{dependency}""" % {
213 'words' : deptext,
214 'edges' : edges,
215 'opts' : "".join(options) }
216 return string
217
218
219
220
222 """
223 Produces the alignment between the nodes of the two dependency graphs that
224 maximizes the shared dependencies.
225
226 Returns a list of aligned pairs of node indices, using C{None} to
227 represent deletions/insertions.
228
229 """
230 N = len(graph1.nodes)
231 M = len(graph2.nodes)
232 indices1 = list(enumerate(sorted(graph1.nodes)))
233 indices2 = list(enumerate(sorted(graph2.nodes)))
234 root1 = min(graph1.nodes)
235 root2 = min(graph2.nodes)
236
237
238 S = []
239 P = []
240 for x in range(N):
241 row = []
242 prow = []
243 for y in range(M):
244 row.append([])
245 prow.append([])
246 S.append(row)
247 P.append(prow)
248
249
250 def _append(lst, val):
251 if val not in lst:
252 lst.append(val)
253 def addS(x, y, val, pointer):
254
255
256 if val not in S[x][y]:
257 S[x][y].append(val)
258
259 P[x][y].append(pointer)
260 def removeS(x, y, index):
261
262 del S[x][y][index]
263 del P[x][y][index]
264
265
266
267 root_deps = []
268 for (source1,target1,label1) in graph1.arcs_from(root1):
269 for (source2,target2,label2) in graph2.arcs_from(root2):
270 if label_compare(label1, label2):
271
272
273 _append(root_deps, (target1,target2))
274 for (source1,target1,label1) in graph1.arcs_to(root1):
275 for (source2,target2,label2) in graph2.arcs_to(root2):
276 if label_compare(label1, label2):
277
278
279 _append(root_deps, (source1,source2))
280
281
282
283
284
285 addS(0, 0, (0, []), ('UL',0))
286
287 for x,node1 in indices1:
288 for y,node2 in indices2:
289 if x > 0:
290
291 for i,(score,deps) in enumerate(S[x-1][y]):
292
293 deps = [(goal1, goal2) for (goal1, goal2) in deps if \
294 goal1 != node1]
295
296 addS(x, y, (score,deps), ('U',i))
297
298 if y > 0:
299
300 for i,(score,deps) in enumerate(S[x][y-1]):
301
302 deps = [(goal1, goal2) for (goal1, goal2) in deps if \
303 goal2 != node2]
304
305 addS(x, y, (score,deps), ('L',i))
306
307 if x > 0 and y > 0:
308
309 for i,(score,deps) in enumerate(S[x-1][y-1]):
310
311 matched = 0
312 new_deps = []
313 for (goal1, goal2) in deps:
314 if goal1 == node1 and goal2 == node2:
315
316 matched += 1
317 elif goal1 > node1 and goal2 > node2:
318
319 new_deps.append((goal1, goal2))
320
321 matched += root_deps.count((node1,node2))
322
323
324
325 for (source1,target1,label1) in graph1.arcs_from(node1):
326 for (source2,target2,label2) in graph2.arcs_from(node2):
327 if label_compare(label1, label2) and target1 > node1 and target2 > node2:
328
329
330 new_deps.append((target1,target2))
331
332
333 for (source1,target1,label1) in graph1.arcs_to(node1):
334 for (source2,target2,label2) in graph2.arcs_to(node2):
335 if label_compare(label1, label2) and source1 > node1 and source2 > node2:
336
337
338 new_deps.append((source1,source2))
339
340 addS(x, y, (score+matched, new_deps), ('UL',i))
341
342
343
344
345
346 max_min_score,top_scorer = max((score,i) for i,(score,deps) in enumerate(S[x][y]))
347
348
349 remove = [i for i,(score,deps) in enumerate(S[x][y]) if \
350 score+len(deps) <= max_min_score \
351 and i != top_scorer]
352 shifter = 0
353 for i in remove:
354 removeS(x, y, i-shifter)
355 shifter += 1
356
357
358 index,(score,deps) = max(enumerate(S[-1][-1]), key=lambda x:x[1][0])
359 x = N-1
360 y = M-1
361 ops = []
362 while not (x == 0 and y == 0):
363 direction,index = P[x][y][index]
364 if direction == 'U':
365 x -= 1
366 ops.append('I1')
367 elif direction == 'L':
368 y -= 1
369 ops.append('I2')
370 else:
371 y -= 1
372 x -= 1
373 ops.append('A')
374
375 ops.append('A')
376 ops.reverse()
377
378
379 nodes1it = iter(sorted(graph1.nodes))
380 nodes2it = iter(sorted(graph2.nodes))
381 pairs = []
382 for op in ops:
383 if op == "A":
384
385 pairs.append((nodes1it.next(), nodes2it.next()))
386 elif op == "I1":
387
388 pairs.append((nodes1it.next(), None))
389 else:
390
391 pairs.append((None, nodes2it.next()))
392
393 return pairs
394
396 """
397 Given a list of pairs of aligned nodes, as returned by
398 L{optimal_node_alignment}, produces the dependency graph that contains
399 only the shared dependencies. The node indices do not, of course,
400 correspond to those in the input graphs, so we also return a mapping
401 from the node indices in the common graph to those in each of the
402 other graphs.
403
404 """
405 arcs = []
406
407 node_pairs = [(n1,n2) for (n1,n2) in node_pairs if n1 is not None and n2 is not None]
408 g1_to_g2 = dict(node_pairs)
409 new_nodes = dict([(node1,i) for i,(node1,node2) in enumerate(node_pairs)])
410 new_nodes2 = dict([(node2,i) for i,(node1,node2) in enumerate(node_pairs)])
411
412 for source1,target1,label1 in graph1.arcs:
413
414 if source1 in g1_to_g2:
415 source2 = g1_to_g2[source1]
416 else:
417 continue
418
419 if target1 in g1_to_g2:
420 target2 = g1_to_g2[target1]
421 else:
422 continue
423
424 matching = graph2.arcs_spanning(source2, target2)
425 for (__,__,label2) in matching:
426 if label_compare(label1, label2):
427
428
429 arcs.append((new_nodes[source1], new_nodes[target1], label1))
430
431 return DependencyGraph(arcs), new_nodes, new_nodes2
432