3 E-Mail : wangvisual AT sohu DOT com
7 /* filename: distance.cc */
9 http://www.merriampark.com/ld.htm
10 What is Levenshtein Distance?
12 Levenshtein distance (LD) is a measure of the similarity between two strings,
13 which we will refer to as the source string (s) and the target string (t).
14 The distance is the number of deletions, insertions, or substitutions required
15 to transform s into t. For example,
17 * If s is "test" and t is "test", then LD(s,t) = 0, because no transformations are needed.
18 The strings are already identical.
19 * If s is "test" and t is "tent", then LD(s,t) = 1, because one substitution
20 (change "s" to "n") is sufficient to transform s into t.
22 The greater the Levenshtein distance, the more different the strings are.
24 Levenshtein distance is named after the Russian scientist Vladimir Levenshtein,
25 who devised the algorithm in 1965. If you can't spell or pronounce Levenshtein,
26 the metric is also sometimes called edit distance.
28 The Levenshtein distance algorithm has been used in:
33 * Plagiarism detection
45 Cover transposition, in addition to deletion,
46 insertion and substitution. This step is taken from:
47 Berghel, Hal ; Roach, David : "An Extension of Ukkonen's
48 Enhanced Dynamic Programming ASM Algorithm"
49 (http://www.acm.org/~hlb/publications/asm/asm.html)
51 #define COVER_TRANSPOSITION
53 /****************************************/
54 /*Implementation of Levenshtein distance*/
55 /****************************************/
57 EditDistance::EditDistance()
59 currentelements = 2500; // It's enough for most conditions :-)
60 d = (int*)malloc(sizeof(int) * currentelements);
63 EditDistance::~EditDistance()
65 // printf("size:%d\n",currentelements);
71 int EditDistance::CalEditDistance(const gunichar *s, const gunichar *t, const int limit)
72 /*Compute levenshtein distance between s and t, this is using QUICK algorithm*/
74 int n = 0, m = 0, iLenDif, k, i, j, cost;
75 // Remove leftmost matching portion of strings
76 while ( *s && (*s == *t) )
91 // Remove rightmost matching portion of strings by decrement n and m.
92 while ( n && m && (*(s + n - 1) == *(t + m - 1)) )
97 if ( m == 0 || n == 0 || d == (int*)0 )
101 const gunichar * temp = s;
109 if ( iLenDif >= limit )
114 // d=(int*)malloc(sizeof(int)*m*n);
115 if ( m*n > currentelements )
117 currentelements = m * n * 2; // double the request
118 d = (int*)realloc(d, sizeof(int) * currentelements);
122 // step 2, init matrix
123 for (k = 0;k < n;k++)
125 for (k = 1;k < m;k++)
128 for (i = 1;i < n;i++)
130 // first calculate column, d(i,j)
131 for ( j = 1;j < iLenDif + i;j++ )
133 cost = s[i - 1] == t[j - 1] ? 0 : 1;
134 d[j*n + i] = minimum(d[(j - 1) * n + i] + 1, d[j * n + i - 1] + 1, d[(j - 1) * n + i - 1] + cost);
135 #ifdef COVER_TRANSPOSITION
137 if ( i >= 2 && j >= 2 && (d[j*n + i] - d[(j - 2)*n + i - 2] == 2)
138 && (s[i - 2] == t[j - 1]) && (s[i - 1] == t[j - 2]) )
143 // second calculate row, d(k,j)
145 for ( k = 1;k <= i;k++ )
147 cost = s[k - 1] == t[j - 1] ? 0 : 1;
148 d[j*n + k] = minimum(d[(j - 1) * n + k] + 1, d[j * n + k - 1] + 1, d[(j - 1) * n + k - 1] + cost);
149 #ifdef COVER_TRANSPOSITION
151 if ( k >= 2 && j >= 2 && (d[j*n + k] - d[(j - 2)*n + k - 2] == 2)
152 && (s[k - 2] == t[j - 1]) && (s[k - 1] == t[j - 2]) )
157 // test if d(i,j) limit gets equal or exceed
158 if ( d[j*n + i] >= limit )
167 int EditDistance::CalEditDistance(const char *s, const char *t, const int limit)
170 int k, i, j, n, m, cost;
173 if ( n != 0 && m != 0 && d != (int*)0 )
177 if ( m*n > currentelements )
179 currentelements = m * n * 2;
180 d = (int*)realloc(d, sizeof(int) * currentelements);
185 for (k = 0;k < n;k++)
187 for (k = 0;k < m;k++)
190 for (i = 1;i < n;i++)
191 for (j = 1;j < m;j++)
194 if (s[i - 1] == t[j - 1])
199 d[j*n + i] = minimum(d[(j - 1) * n + i] + 1, d[j * n + i - 1] + 1, d[(j - 1) * n + i - 1] + cost);
200 #ifdef COVER_TRANSPOSITION
202 if ( i >= 2 && j >= 2 && (d[j*n + i] - d[(j - 2)*n + i - 2] == 2)
203 && (s[i - 2] == t[j - 1]) && (s[i - 1] == t[j - 2]) )