| Trees | Indices | Help |
|
|---|
|
|
1 """Largest Common Embeddable Subtree.
2
3 Implementation of the algorithm from I{On the Maximum Common Embedded Subtree
4 Problem for Ordered Trees} by Loxano and Valiente. This is for unlabeled,
5 ordered trees.
6
7 """
8 """
9 ============================== License ========================================
10 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
11
12 This file is part of The Jazz Parser.
13
14 The Jazz Parser is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version.
18
19 The Jazz Parser is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
23
24 You should have received a copy of the GNU General Public License
25 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
26
27 ============================ End license ======================================
28
29 """
30 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
31
32 from .datastructs import Node, MutableTree
33 from .balancedseq import BalancedSequence
34
36 """
37 Computes the size of the largest common embedded subtree for two
38 unlabeled trees. It is quicker to compute the size than to compute
39 the common tree itself, so this function doesn't actually compute what
40 the tree is.
41
42 """
43 # Get a balanced sequence to represent each tree
44 bs1 = BalancedSequence.from_tree(tree1)
45 bs2 = BalancedSequence.from_tree(tree2)
46
47 # To make sure we never recompute an lcs value for a pair of subsequences
48 # twice, we store those we've computed in a dictionary
49 lcs_cache = {}
50
51 def _lcs(seq1, seq2):
52 # Base cases: common subtree of empty tree and anything is size 0
53 if len(seq1) == 0:
54 return 0
55 if len(seq2) == 0:
56 return 0
57
58 # Check whether we've computed this before
59 if (seq1,seq2) in lcs_cache:
60 return lcs_cache[(seq1,seq2)]
61 # Also the other way round: the result would be the same
62 if (seq2,seq1) in lcs_cache:
63 return lcs_cache[(seq2,seq1)]
64
65 # Break the seqs into heads and tails
66 head1,tail1 = seq1.head_tail()
67 head2,tail2 = seq2.head_tail()
68
69 # Try the three possible matchings and take the max size
70 # First try matching the heads to each other
71 head_match = _lcs(head1, head2) + _lcs(tail1, tail2) + 1
72 # Next try skipping a level of embedding on the head of tree1
73 # Note that head1+tail1 != tree1
74 head_skip1 = _lcs(head1+tail1, seq2)
75 # Finally, try skipping a level on tree2 as well
76 head_skip2 = _lcs(seq1, head2+tail2)
77
78 max_size = max([head_match, head_skip1, head_skip2])
79
80 # Remember this in case we get asked the same thing again
81 lcs_cache[(seq1,seq2)] = max_size
82 return max_size
83
84 # Run recursive lcs computation on the two sequences
85 size = _lcs(bs1, bs2)
86 return size
87
88
90 """
91 Computes the largest common embedded subtree for two
92 unlabeled trees. Even you only need to know the size, use L{lces_size},
93 since it's a slightly simpler problem.
94
95 """
96 cat = BalancedSequence.cat
97
98 # Get a balanced sequence to represent each tree
99 bs1 = BalancedSequence.from_tree(tree1)
100 bs2 = BalancedSequence.from_tree(tree2)
101
102 # To make sure we never recompute an lcs value for a pair of subsequences
103 # twice, we store those we've computed in a dictionary
104 lcs_cache = {}
105
106 def _lcs(seq1, seq2):
107 # Base cases: common subtree of empty tree and anything is the empty tree
108 if len(seq1) == 0:
109 return BalancedSequence([]), 0
110 if len(seq2) == 0:
111 return BalancedSequence([]), 0
112
113 # Check whether we've computed this before
114 if (seq1,seq2) in lcs_cache:
115 return lcs_cache[(seq1,seq2)]
116 # Also the other way round: the result would be the same
117 if (seq2,seq1) in lcs_cache:
118 return lcs_cache[(seq2,seq1)]
119
120 # Break the seqs into heads and tails
121 head1,tail1 = seq1.head_tail()
122 head2,tail2 = seq2.head_tail()
123
124 # Try the three possible matchings and take the max size
125 # First try matching the heads to each other
126 head_match_h, head_match_h_size = _lcs(head1, head2)
127 head_match_t, head_match_t_size = _lcs(tail1, tail2)
128 # Next try skipping a level of embedding on the head of tree1
129 # Note that head1+tail1 != tree1
130 head_skip1, head_skip1_size = _lcs(head1+tail1, seq2)
131 # Finally, try skipping a level on tree2 as well
132 head_skip2, head_skip2_size = _lcs(seq1, head2+tail2)
133
134 # Choose whichever of these options gives us the biggest sequence
135 sizes = [
136 ('h', head_match_h_size + head_match_t_size + 1),
137 ('s1', head_skip1_size),
138 ('s2', head_skip2_size),
139 ]
140 (max_op,max_size) = max(sizes, key=lambda x:x[1])
141
142 if max_op == 'h':
143 # Matched the heads
144 # Put together the tree that does this
145 seq = cat(head_match_h, head_match_t)
146 elif max_op == 's1':
147 # Skipped the head of the first sequence
148 seq = head_skip1
149 else:
150 seq = head_skip2
151
152 # Remember this in case we get asked the same thing again
153 lcs_cache[(seq1,seq2)] = (seq,max_size)
154 return (seq,max_size)
155
156 # Run recursive lcs computation on the two sequences
157 max_seq, max_size = _lcs(bs1, bs2)
158 return max_seq.to_tree()
159
| Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Nov 26 16:05:00 2012 | http://epydoc.sourceforge.net |