1 """Algorithms for commonly-used distance metrics.
2
3 """
4 """
5 ============================== License ========================================
6 Copyright (C) 2008, 2010-12 University of Edinburgh, Mark Granroth-Wilding
7
8 This file is part of The Jazz Parser.
9
10 The Jazz Parser is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation, either version 3 of the License, or
13 (at your option) any later version.
14
15 The Jazz Parser is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with The Jazz Parser. If not, see <http://www.gnu.org/licenses/>.
22
23 ============================ End license ======================================
24
25 """
26 __author__ = "Mark Granroth-Wilding <mark.granroth-wilding@ed.ac.uk>"
27
29 """
30 Compute the Levenshtein distance between two sequences.
31 By default, will compare the elements using the == operator, but
32 any binary function can be given as the equality argument.
33
34 delins_cost is the cost applied for deletions and insertions.
35
36 subst_cost_fun is a binary function that gives the cost to substitute
37 the first argument with the second. If not given, a cost of delins
38 is used for any substitution.
39
40 """
41 if subst_cost_fun is None:
42 subst_cost_fun = lambda x,y: 0 if x==y else delins_cost
43
44
45 if len(seq1) == 0:
46 return len(seq2) * delins_cost
47 elif len(seq2) == 0:
48 return len(seq1) * delins_cost
49
50
51 dist = []
52 for i in range(len(seq1)+1):
53 dist.append([None] * (len(seq2)+1))
54
55 for i in range(len(seq1)+1):
56 dist[i][0] = i*delins_cost
57 for j in range(len(seq2)+1):
58 dist[0][j] = j*delins_cost
59
60 for j in range(1, len(seq2)+1):
61 for i in range(1, len(seq1)+1):
62 dist[i][j] = min(
63 dist[i-1][j] + delins_cost,
64 dist[i][j-1] + delins_cost,
65 dist[i-1][j-1] + subst_cost_fun(seq1[i-1], seq2[j-1])
66 )
67
68 return dist[-1][-1]
69
71 """
72 Compute the Levenshtein distance between two sequences.
73 This does the same thing as levenshtein_distance, but stores
74 pointers to indicate what alignments gave the costs and returns
75 the full cost matrix, plus the pointer matrix.
76
77 C{seq2} is aligned with C{seq1}: that is, a deletion indicates that
78 C{seq1} moves on a cell without a corresponding cell in C{seq2}.
79
80 """
81 if subst_cost_fun is None:
82 subst_cost_fun = lambda x,y: 0 if x==y else delins_cost
83
84
85 if len(seq1) == 0:
86 return len(seq2) * delins_cost
87 elif len(seq2) == 0:
88 return len(seq1) * delins_cost
89
90
91 pointers = []
92 dist = []
93 for i in range(len(seq1)+1):
94 dist.append([None] * (len(seq2)+1))
95 pointers.append([None] * (len(seq2)+1))
96
97 for i in range(len(seq1)+1):
98 dist[i][0] = i*delins_cost
99 pointers[i][0] = 'D'
100 for j in range(len(seq2)+1):
101 dist[0][j] = j*delins_cost
102 pointers[0][j] = 'I'
103
104 for j in range(1, len(seq2)+1):
105 for i in range(1, len(seq1)+1):
106 dist[i][j],pointers[i][j] = min(
107 (dist[i-1][j] + delins_cost, 'D'),
108 (dist[i][j-1] + delins_cost, 'I'),
109 (dist[i-1][j-1] + subst_cost_fun(seq1[i-1], seq2[j-1]), 'S'),
110 key=lambda x:x[0])
111
112 return dist, pointers
113
114 -def align(seq1, seq2, delins_cost=1, subst_cost=None, dist=False):
115 """
116 Finds the optimal alignment of the two sequences using Levenshtein
117 distance and traces back the pointers to find the alignment. Returns
118 a list of pairs, containing the points from the two lists.
119
120 In the case of a substitution, it will contain the two points that were
121 aligned. In the case of an insertion, the first value will be C{None}
122 and the second the inserted value. In the case of a deletion, the
123 second value will be C{None} and the first the deleted value.
124
125 Note that the pair of values in the case of a substitution may be
126 equal - an alignment - or not - a substitution - depending on the
127 substitution cost function.
128
129 @type dist: bool
130 @param dist: return a tuple of the alignment and the dist
131
132 """
133 dists,pointers = levenshtein_distance_with_pointers(
134 seq1,
135 seq2,
136 delins_cost=delins_cost,
137 subst_cost_fun=subst_cost)
138
139
140
141
142 operations = []
143
144 i,j = (len(dists)-1), (len(dists[0])-1)
145 while not i == j == 0:
146 if pointers[i][j] == "I":
147 operations.append((None,seq2[j-1]))
148 j -= 1
149 elif pointers[i][j] == "D":
150 operations.append((seq1[i-1],None))
151 i -= 1
152 else:
153 operations.append((seq1[i-1],seq2[j-1]))
154 j -= 1
155 i -= 1
156
157 operations.reverse()
158
159 if dist:
160 return operations,dists[-1][-1]
161 else:
162 return operations
163
165 """
166 Compute a local alignment variant of the Levenshtein distance between two
167 sequences. Options are the same as L{levenshtein_distance_with_pointers}.
168
169 Finds the optimal alignment of seq2 within seq1.
170
171 In addition to the operations I, D and S used in
172 L{levenshtein_distance_with_pointers}, we use here '.' to indicate a
173 deletion at zero-cost at the beginning or end.
174
175 """
176 if subst_cost_fun is None:
177 subst_cost_fun = lambda x,y: 0 if x==y else delins_cost
178
179
180 if len(seq1) == 0:
181 return len(seq2) * delins_cost
182 elif len(seq2) == 0:
183 return len(seq1) * delins_cost
184
185
186 dist = []
187 pointers = []
188 for i in range(len(seq1)+1):
189 dist.append([None] * (len(seq2)+1))
190 pointers.append([None] * (len(seq2)+1))
191
192 for i in range(len(seq1)+1):
193
194 dist[i][0] = 0
195 pointers[i][0] = '.'
196 for j in range(len(seq2)+1):
197 dist[0][j] = j*delins_cost
198 pointers[0][j] = 'I'
199
200 for j in range(1, len(seq2)+1):
201 for i in range(1, len(seq1)+1):
202 dist[i][j],pointers[i][j] = min(
203 (dist[i-1][j] + delins_cost, 'D'),
204 (dist[i][j-1] + delins_cost, 'I'),
205 (dist[i-1][j-1] + subst_cost_fun(seq1[i-1], seq2[j-1]), 'S'),
206 key=lambda x:x[0])
207
208
209 for i in range(1, len(seq1)+1):
210
211 dist[i][-1],pointers[i][-1] = min(
212 (dist[i][-1], pointers[i][-1]),
213 (dist[i-1][-1], '.'),
214 key=lambda x:x[0])
215
216 return dist, pointers
217