Package jazzparser :: Package utils :: Module distance
[hide private]
[frames] | no frames]

Source Code for Module jazzparser.utils.distance

  1  """Algorithms for commonly-used distance metrics. 
  2   
  3  """ 
  4  """ 
  5  ============================== License ======================================== 
  6   Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding 
  7    
  8   This file is part of The Jazz Parser. 
  9    
 10   The Jazz Parser is free software: you can redistribute it and/or modify 
 11   it under the terms of the GNU General Public License as published by 
 12   the Free Software Foundation, either version 3 of the License, or 
 13   (at your option) any later version. 
 14    
 15   The Jazz Parser is distributed in the hope that it will be useful, 
 16   but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18   GNU General Public License for more details. 
 19    
 20   You should have received a copy of the GNU General Public License 
 21   along with The Jazz Parser.  If not, see <http://www.gnu.org/licenses/>. 
 22   
 23  ============================ End license ====================================== 
 24   
 25  """ 
 26  __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>" 
 27   
28 -def levenshtein_distance(seq1, seq2, delins_cost=1, subst_cost_fun=None):
29 """ 30 Compute the Levenshtein distance between two sequences. 31 By default, will compare the elements using the == operator, but 32 any binary function can be given as the equality argument. 33 34 delins_cost is the cost applied for deletions and insertions. 35 36 subst_cost_fun is a binary function that gives the cost to substitute 37 the first argument with the second. If not given, a cost of delins 38 is used for any substitution. 39 40 """ 41 if subst_cost_fun is None: 42 subst_cost_fun = lambda x,y: 0 if x==y else delins_cost 43 44 # Simple cases: empty sequences 45 if len(seq1) == 0: 46 return len(seq2) * delins_cost 47 elif len(seq2) == 0: 48 return len(seq1) * delins_cost 49 50 # Create a table with m+1 rows and n+1 columns 51 dist = [] 52 for i in range(len(seq1)+1): 53 dist.append([None] * (len(seq2)+1)) 54 55 for i in range(len(seq1)+1): 56 dist[i][0] = i*delins_cost # deletion 57 for j in range(len(seq2)+1): 58 dist[0][j] = j*delins_cost # insertion 59 60 for j in range(1, len(seq2)+1): 61 for i in range(1, len(seq1)+1): 62 dist[i][j] = min( 63 dist[i-1][j] + delins_cost, # deletion 64 dist[i][j-1] + delins_cost, # insertion 65 dist[i-1][j-1] + subst_cost_fun(seq1[i-1], seq2[j-1]) # substitution 66 ) 67 68 return dist[-1][-1]
69
70 -def levenshtein_distance_with_pointers(seq1, seq2, delins_cost=1, subst_cost_fun=None):
71 """ 72 Compute the Levenshtein distance between two sequences. 73 This does the same thing as levenshtein_distance, but stores 74 pointers to indicate what alignments gave the costs and returns 75 the full cost matrix, plus the pointer matrix. 76 77 C{seq2} is aligned with C{seq1}: that is, a deletion indicates that 78 C{seq1} moves on a cell without a corresponding cell in C{seq2}. 79 80 """ 81 if subst_cost_fun is None: 82 subst_cost_fun = lambda x,y: 0 if x==y else delins_cost 83 84 # Simple cases: empty sequences 85 if len(seq1) == 0: 86 return len(seq2) * delins_cost 87 elif len(seq2) == 0: 88 return len(seq1) * delins_cost 89 90 # Create a table with m+1 rows and n+1 columns 91 pointers = [] 92 dist = [] 93 for i in range(len(seq1)+1): 94 dist.append([None] * (len(seq2)+1)) 95 pointers.append([None] * (len(seq2)+1)) 96 97 for i in range(len(seq1)+1): 98 dist[i][0] = i*delins_cost # deletion 99 pointers[i][0] = 'D' 100 for j in range(len(seq2)+1): 101 dist[0][j] = j*delins_cost # insertion 102 pointers[0][j] = 'I' 103 104 for j in range(1, len(seq2)+1): 105 for i in range(1, len(seq1)+1): 106 dist[i][j],pointers[i][j] = min( 107 (dist[i-1][j] + delins_cost, 'D'), # deletion 108 (dist[i][j-1] + delins_cost, 'I'), # insertion 109 (dist[i-1][j-1] + subst_cost_fun(seq1[i-1], seq2[j-1]), 'S'), # substitution 110 key=lambda x:x[0]) 111 112 return dist, pointers
113
114 -def align(seq1, seq2, delins_cost=1, subst_cost=None, dist=False):
115 """ 116 Finds the optimal alignment of the two sequences using Levenshtein 117 distance and traces back the pointers to find the alignment. Returns 118 a list of pairs, containing the points from the two lists. 119 120 In the case of a substitution, it will contain the two points that were 121 aligned. In the case of an insertion, the first value will be C{None} 122 and the second the inserted value. In the case of a deletion, the 123 second value will be C{None} and the first the deleted value. 124 125 Note that the pair of values in the case of a substitution may be 126 equal - an alignment - or not - a substitution - depending on the 127 substitution cost function. 128 129 @type dist: bool 130 @param dist: return a tuple of the alignment and the dist 131 132 """ 133 dists,pointers = levenshtein_distance_with_pointers( 134 seq1, 135 seq2, 136 delins_cost=delins_cost, 137 subst_cost_fun=subst_cost) 138 # We now have the matrix of costs and the pointers that generated 139 # those costs. 140 # Trace back to find out what costs were incurred in the optimal 141 # alignment 142 operations = [] 143 # Start at the top right corner 144 i,j = (len(dists)-1), (len(dists[0])-1) 145 while not i == j == 0: 146 if pointers[i][j] == "I": 147 operations.append((None,seq2[j-1])) 148 j -= 1 149 elif pointers[i][j] == "D": 150 operations.append((seq1[i-1],None)) 151 i -= 1 152 else: 153 operations.append((seq1[i-1],seq2[j-1])) 154 j -= 1 155 i -= 1 156 157 operations.reverse() 158 159 if dist: 160 return operations,dists[-1][-1] 161 else: 162 return operations
163
164 -def local_levenshtein_distance(seq1, seq2, delins_cost=1, subst_cost_fun=None):
165 """ 166 Compute a local alignment variant of the Levenshtein distance between two 167 sequences. Options are the same as L{levenshtein_distance_with_pointers}. 168 169 Finds the optimal alignment of seq2 within seq1. 170 171 In addition to the operations I, D and S used in 172 L{levenshtein_distance_with_pointers}, we use here '.' to indicate a 173 deletion at zero-cost at the beginning or end. 174 175 """ 176 if subst_cost_fun is None: 177 subst_cost_fun = lambda x,y: 0 if x==y else delins_cost 178 179 # Simple cases: empty sequences 180 if len(seq1) == 0: 181 return len(seq2) * delins_cost 182 elif len(seq2) == 0: 183 return len(seq1) * delins_cost 184 185 # Create a table with m+1 rows and n+1 columns 186 dist = [] 187 pointers = [] 188 for i in range(len(seq1)+1): 189 dist.append([None] * (len(seq2)+1)) 190 pointers.append([None] * (len(seq2)+1)) 191 192 for i in range(len(seq1)+1): 193 # Initial deletions cost us nothing because the alignment's local 194 dist[i][0] = 0 195 pointers[i][0] = '.' 196 for j in range(len(seq2)+1): 197 dist[0][j] = j*delins_cost # insertion 198 pointers[0][j] = 'I' 199 200 for j in range(1, len(seq2)+1): 201 for i in range(1, len(seq1)+1): 202 dist[i][j],pointers[i][j] = min( 203 (dist[i-1][j] + delins_cost, 'D'), # deletion 204 (dist[i][j-1] + delins_cost, 'I'), # insertion 205 (dist[i-1][j-1] + subst_cost_fun(seq1[i-1], seq2[j-1]), 'S'), # substitution 206 key=lambda x:x[0]) 207 208 # Allow free deletions at the end of seq2 209 for i in range(1, len(seq1)+1): 210 # Let us move along seq1 at 0-cost when seq1 is over if it helps 211 dist[i][-1],pointers[i][-1] = min( 212 (dist[i][-1], pointers[i][-1]), # Keep the same 213 (dist[i-1][-1], '.'), # Use free move 214 key=lambda x:x[0]) 215 216 return dist, pointers
217