Package jazzparser :: Package misc :: Package tree :: Module lces
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.misc.tree.lces

  1  """Largest Common Embeddable Subtree. 
  2   
  3  Implementation of the algorithm from I{On the Maximum Common Embedded Subtree  
  4  Problem for Ordered Trees} by Loxano and Valiente. This is for unlabeled,  
  5  ordered trees. 
  6   
  7  """ 
  8  """ 
  9  ============================== License ======================================== 
 10   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
 11    
 12   This file is part of The Jazz Parser. 
 13    
 14   The Jazz Parser is free software: you can redistribute it and/or modify 
 15   it under the terms of the GNU General Public License as published by 
 16   the Free Software Foundation, either version 3 of the License, or 
 17   (at your option) any later version. 
 18    
 19   The Jazz Parser is distributed in the hope that it will be useful, 
 20   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 21   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 22   GNU General Public License for more details. 
 23    
 24   You should have received a copy of the GNU General Public License 
 25   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 26   
 27  ============================ End license ====================================== 
 28   
 29  """ 
 30  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"  
 31   
 32  from .datastructs import Node, MutableTree 
 33  from .balancedseq import BalancedSequence 
 34   
35 -def lces_size(tree1, tree2):
36 """ 37 Computes the size of the largest common embedded subtree for two 38 unlabeled trees. It is quicker to compute the size than to compute 39 the common tree itself, so this function doesn't actually compute what 40 the tree is. 41 42 """ 43 # Get a balanced sequence to represent each tree 44 bs1 = BalancedSequence.from_tree(tree1) 45 bs2 = BalancedSequence.from_tree(tree2) 46 47 # To make sure we never recompute an lcs value for a pair of subsequences 48 # twice, we store those we've computed in a dictionary 49 lcs_cache = {} 50 51 def _lcs(seq1, seq2): 52 # Base cases: common subtree of empty tree and anything is size 0 53 if len(seq1) == 0: 54 return 0 55 if len(seq2) == 0: 56 return 0 57 58 # Check whether we've computed this before 59 if (seq1,seq2) in lcs_cache: 60 return lcs_cache[(seq1,seq2)] 61 # Also the other way round: the result would be the same 62 if (seq2,seq1) in lcs_cache: 63 return lcs_cache[(seq2,seq1)] 64 65 # Break the seqs into heads and tails 66 head1,tail1 = seq1.head_tail() 67 head2,tail2 = seq2.head_tail() 68 69 # Try the three possible matchings and take the max size 70 # First try matching the heads to each other 71 head_match = _lcs(head1, head2) + _lcs(tail1, tail2) + 1 72 # Next try skipping a level of embedding on the head of tree1 73 # Note that head1+tail1 != tree1 74 head_skip1 = _lcs(head1+tail1, seq2) 75 # Finally, try skipping a level on tree2 as well 76 head_skip2 = _lcs(seq1, head2+tail2) 77 78 max_size = max([head_match, head_skip1, head_skip2]) 79 80 # Remember this in case we get asked the same thing again 81 lcs_cache[(seq1,seq2)] = max_size 82 return max_size
83 84 # Run recursive lcs computation on the two sequences 85 size = _lcs(bs1, bs2) 86 return size 87 88
89 -def lces(tree1, tree2):
90 """ 91 Computes the largest common embedded subtree for two 92 unlabeled trees. Even you only need to know the size, use L{lces_size}, 93 since it's a slightly simpler problem. 94 95 """ 96 cat = BalancedSequence.cat 97 98 # Get a balanced sequence to represent each tree 99 bs1 = BalancedSequence.from_tree(tree1) 100 bs2 = BalancedSequence.from_tree(tree2) 101 102 # To make sure we never recompute an lcs value for a pair of subsequences 103 # twice, we store those we've computed in a dictionary 104 lcs_cache = {} 105 106 def _lcs(seq1, seq2): 107 # Base cases: common subtree of empty tree and anything is the empty tree 108 if len(seq1) == 0: 109 return BalancedSequence([]), 0 110 if len(seq2) == 0: 111 return BalancedSequence([]), 0 112 113 # Check whether we've computed this before 114 if (seq1,seq2) in lcs_cache: 115 return lcs_cache[(seq1,seq2)] 116 # Also the other way round: the result would be the same 117 if (seq2,seq1) in lcs_cache: 118 return lcs_cache[(seq2,seq1)] 119 120 # Break the seqs into heads and tails 121 head1,tail1 = seq1.head_tail() 122 head2,tail2 = seq2.head_tail() 123 124 # Try the three possible matchings and take the max size 125 # First try matching the heads to each other 126 head_match_h, head_match_h_size = _lcs(head1, head2) 127 head_match_t, head_match_t_size = _lcs(tail1, tail2) 128 # Next try skipping a level of embedding on the head of tree1 129 # Note that head1+tail1 != tree1 130 head_skip1, head_skip1_size = _lcs(head1+tail1, seq2) 131 # Finally, try skipping a level on tree2 as well 132 head_skip2, head_skip2_size = _lcs(seq1, head2+tail2) 133 134 # Choose whichever of these options gives us the biggest sequence 135 sizes = [ 136 ('h', head_match_h_size + head_match_t_size + 1), 137 ('s1', head_skip1_size), 138 ('s2', head_skip2_size), 139 ] 140 (max_op,max_size) = max(sizes, key=lambda x:x[1]) 141 142 if max_op == 'h': 143 # Matched the heads 144 # Put together the tree that does this 145 seq = cat(head_match_h, head_match_t) 146 elif max_op == 's1': 147 # Skipped the head of the first sequence 148 seq = head_skip1 149 else: 150 seq = head_skip2 151 152 # Remember this in case we get asked the same thing again 153 lcs_cache[(seq1,seq2)] = (seq,max_size) 154 return (seq,max_size)
155 156 # Run recursive lcs computation on the two sequences 157 max_seq, max_size = _lcs(bs1, bs2) 158 return max_seq.to_tree() 159