Anklang 0.3.0-460-gc4ef46ba
ASE — Anklang Sound Engine (C++)

« « « Anklang Documentation
Loading...
Searching...
No Matches
levenshtein.cc
Go to the documentation of this file.
1 // This Source Code Form is licensed MPL-2.0: http://mozilla.org/MPL/2.0
2#include "levenshtein.hh"
3#include "internal.hh"
4#include <string_view>
5#include <vector>
6
7// https://dl.acm.org/doi/10.1145/1963190.1963191 — Indexing methods for approximate dictionary searching: Comparative analysis: ACM Journal of Experimental Algorithmics: Vol 16
8
9namespace Ase {
10
15float
17 const float ci, const float cd, const float cs, const float ct)
18{
19 // Compute optimal string alignment distance or restricted edit distance:
20 // https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance
21 auto min3 = [] (float a, float b, float c) { return std::min (a, std::min (b, c)); };
22 ssize_t n = source.size(), m = target.size();
23 // increase cache locality
24 if (m > n)
25 return damerau_levenshtein_restricted (target, source, cd, ci, cs, ct);
26 const char *a = source.data(), *b = target.data();
27 // remove common prefix
28 while (n && m && *a == *b)
29 a++, b++, n--, m--;
30 // remove common suffix
31 while (n && m && a[n-1] == b[m-1])
32 n--, m--;
33 // handle zero length strings
34 if (!n)
35 return m * ci;
36 if (!m)
37 return n * cd;
38 // allocate 3 rows: row2=row[i-2], row1=row[i-1], row0=row[i]
39 std::vector<float> rowmem (3 * (m+1));
40 float *row2 = rowmem.data(), *row1 = row2 + m+1, *row0 = row1 + m+1;
41 // the last row corresponds to source + insertions -> target
42 for (ssize_t j = 0; j <= m; j++)
43 row1[j] = j * ci;
44 // fill the matrix, note that a and b are 1-indexed
45 for (ssize_t i = 1; i <= n; i++) {
46 row0[0] = i * cd; // i && !j corresponds to source + deletion -> target
47 for (ssize_t j = 1; j <= m; j++) {
48 const bool ue = a[i-1] != b[j-1];
49 row0[j] = min3 (row1[j] + cd, // deletion
50 row0[j-1] + ci, // insertion
51 row1[j-1] + cs * ue); // substitution
52 if (ue && i > 1 && j > 1 && a[i-1] == b[j-2] && a[i-2] == b[j-1])
53 row0[j] = std::min (row0[j], row2[j-2] + ct);
54 }
55 // shift rows along
56 float *const next = row2;
57 row2 = row1;
58 row1 = row0;
59 row0 = next;
60 }
61 return row1[m];
62}
63
64template<typename T>
65struct L2DMatrix {
67 const size_t sa, sb;
68 L2DMatrix (size_t a, size_t b, T init = {}) :
69 sa (a), sb (b)
70 {
71 v.resize (sa * sb, init);
72 }
73 T&
74 operator() (size_t a, size_t b)
75 {
76 static T tmp{};
77 assert_return (a < sa, tmp);
78 assert_return (b < sb, tmp);
79 return v[a * sb + b];
80 }
81};
82
83// Damerau-Levenshtein Distance with edited transpositions, quadratic memory requirement
84static float
85damerau_levenshtein_unrestricted (const std::string_view &source, const std::string_view &target,
86 const float ci, const float cd, const float cs, const float ct)
87{
88 auto u = [] (char c) { return uint8_t (c); };
89 const size_t lp = source.size(), ls = target.size();
90 const char *p = source.data()-1, *s = target.data()-1; // strings p,s are 1-indexed
91 // Computation of the unrestricted Damerau-Levenshtein distance between strings p and s
92 L2DMatrix<float> C (lp+1, ls+1); // C: [0..|p|,0..|s|]
93 const int S = 256; // |Σ| for unsigned char
94 std::vector<int> CP (S, 0); // CP: [1..|Σ|]; we use 0…255 however
95 for (int i = 0; i <= lp; i++)
96 C(i,0) = i * ci;
97 for (int j = 0; j <= ls; j++)
98 C(0,j) = j * cd;
99 // CP[0…255] is 0-initialized; // CP[1…|Σ|]:=0
100 for (int i = 1; i <= lp; i++) {
101 int CS = 0;
102 for (int j = 1; j <= ls; j++) {
103 const float d = p[i] == s[j] ? 0 : cs; // if p[i]=s[j] then d:=0 else d:=1
104 C(i,j) = std::min (C(i-1,j) + ci, // insertion cost
105 C(i,j-1) + cd); // deletion cost
106 C(i,j) = std::min (C(i,j),
107 C(i-1,j-1) + d); // substitution cost
108 const int i_ = CP[u(s[j])]; // CP[c] stores the largest index i_<i such that p[i_]=c
109 const int j_ = CS; // CS stores the largest index j_<j such that s[j_] = p[i]
110 if (i_ > 0 and j_ > 0)
111 C(i,j) = std::min (C(i,j), C(i_-1,j_-1) + (i-i_)-1 + (j-j_)-1 + ct);
112 if (p[i] == s[j])
113 CS = j;
114 }
115 CP[u(p[i])] = i;
116 }
117 return C(lp,ls);
118}
119
124float
126 const float ci, const float cd, const float cs, const float ct)
127{
128 size_t n = source.size(), m = target.size();
129 const char *a = source.data(), *b = target.data();
130 // remove common prefix
131 while (n && m && *a == *b)
132 a++, b++, n--, m--;
133 // remove common suffix
134 while (n && m && a[n-1] == b[m-1])
135 n--, m--;
136 // handle zero length strings
137 if (!n)
138 return m;
139 if (!m)
140 return n;
141 // calc Damerau-Levenshtein Distance on differing fragments only
142 if (m <= n)
143 return damerau_levenshtein_unrestricted ({a, n}, {b, m}, ci, cd, cs, ct);
144 else // swap strings to increase cache locality
145 return damerau_levenshtein_unrestricted ({b, m}, {a, n}, cd, ci, cs, ct);
146}
147
148} // Ase
149
150
151#include "testing.hh"
152
153namespace { // Anon
154using namespace Ase;
155
156TEST_INTEGRITY (levenshtein_tests);
157static void
158levenshtein_tests()
159{
160 // damerau_levenshtein_restricted - no editing of transposed character pairs
161 TCMP (0, ==, damerau_levenshtein_restricted ("", ""));
162 TCMP (0, ==, damerau_levenshtein_restricted ("A", "A"));
163 TCMP (0, ==, damerau_levenshtein_restricted ("AZ", "AZ"));
164 TCMP (1, ==, damerau_levenshtein_restricted ("", "1"));
165 TCMP (2, ==, damerau_levenshtein_restricted ("", "12"));
166 TCMP (3, ==, damerau_levenshtein_restricted ("", "123"));
167 TCMP (1, ==, damerau_levenshtein_restricted ("1", ""));
168 TCMP (2, ==, damerau_levenshtein_restricted ("12", ""));
169 TCMP (3, ==, damerau_levenshtein_restricted ("123", ""));
170 TCMP (1, ==, damerau_levenshtein_restricted ("A", "B"));
171 TCMP (1, ==, damerau_levenshtein_restricted ("AB", "BA"));
172 TCMP (3, ==, damerau_levenshtein_restricted ("ABC", "CA")); // restricted edit distance: CA -> A -> AB -> ABC
173 TCMP (3, ==, damerau_levenshtein_restricted ("123", "abc"));
174 TCMP (5, ==, damerau_levenshtein_restricted ("12345", "abc"));
175 TCMP (4, ==, damerau_levenshtein_restricted ("123", "abcd"));
176 TCMP (1, ==, damerau_levenshtein_restricted ("AaaaB", "AaaaC"));
177 TCMP (2, ==, damerau_levenshtein_restricted ("Aa_aB", "AaaaC"));
178 TCMP (2, ==, damerau_levenshtein_restricted ("aAaaB", "AaaaC"));
179 TCMP (3, ==, damerau_levenshtein_restricted ("___Ab#-##^^^", "___bA##+#^^^"));
180 TCMP (1, ==, damerau_levenshtein_restricted ("_ABC", "ABC"));
181 TCMP (1, ==, damerau_levenshtein_restricted ("ABCD", "BCD"));
182 TCMP (3, ==, damerau_levenshtein_restricted ("BADCFE", "ABCDEF"));
183 TCMP (2, ==, damerau_levenshtein_restricted ("AAAArzxyAzxy", "AArzxyAzxy"));
184 TCMP (3, ==, damerau_levenshtein_restricted ("ab+cd+ef", "ba+dc+fe"));
185 TCMP (5, ==, damerau_levenshtein_restricted ("ab+cd+ef", "ba_dc_fe"));
186 TCMP (3, ==, damerau_levenshtein_restricted ("kitten", "sitting"));
187 TCMP (4, ==, damerau_levenshtein_restricted ("AGTACGCA", "TATGC")); // -A -G C2T -A
188 TCMP (2, ==, damerau_levenshtein_restricted ("a cat", "an act"));
189 TCMP (4, ==, damerau_levenshtein_restricted ("a cat", "an abct")); // +n -c +b +c
190
191 // damerau_levenshtein_distance - allows insert/delete between transposed character pair
192 TCMP (0, ==, damerau_levenshtein_distance ("", ""));
193 TCMP (0, ==, damerau_levenshtein_distance ("A", "A"));
194 TCMP (0, ==, damerau_levenshtein_distance ("AZ", "AZ"));
195 TCMP (1, ==, damerau_levenshtein_distance ("", "1"));
196 TCMP (2, ==, damerau_levenshtein_distance ("", "12"));
197 TCMP (3, ==, damerau_levenshtein_distance ("", "123"));
198 TCMP (1, ==, damerau_levenshtein_distance ("1", ""));
199 TCMP (2, ==, damerau_levenshtein_distance ("12", ""));
200 TCMP (3, ==, damerau_levenshtein_distance ("123", ""));
201 TCMP (1, ==, damerau_levenshtein_distance ("A", "B"));
202 TCMP (1, ==, damerau_levenshtein_distance ("AB", "BA"));
203 TCMP (2, ==, damerau_levenshtein_distance ("ABC", "CA")); // edits in adjacent transpositions: CA -> AC -> ABC
204 TCMP (3, ==, damerau_levenshtein_distance ("123", "abc"));
205 TCMP (5, ==, damerau_levenshtein_distance ("12345", "abc"));
206 TCMP (4, ==, damerau_levenshtein_distance ("123", "abcd"));
207 TCMP (1, ==, damerau_levenshtein_distance ("AaaaB", "AaaaC"));
208 TCMP (2, ==, damerau_levenshtein_distance ("Aa_aB", "AaaaC"));
209 TCMP (2, ==, damerau_levenshtein_distance ("aAaaB", "AaaaC"));
210 TCMP (3, ==, damerau_levenshtein_distance ("___Ab#-##^^^", "___bA##+#^^^"));
211 TCMP (1, ==, damerau_levenshtein_distance ("_ABC", "ABC"));
212 TCMP (1, ==, damerau_levenshtein_distance ("ABCD", "BCD"));
213 TCMP (3, ==, damerau_levenshtein_distance ("BADCFE", "ABCDEF"));
214 TCMP (2, ==, damerau_levenshtein_distance ("AAAArzxyAzxy", "AArzxyAzxy"));
215 TCMP (3, ==, damerau_levenshtein_distance ("ab+cd+ef", "ba+dc+fe"));
216 TCMP (5, ==, damerau_levenshtein_distance ("ab+cd+ef", "ba_dc_fe"));
217 TCMP (3, ==, damerau_levenshtein_distance ("kitten", "sitting"));
218 TCMP (4, ==, damerau_levenshtein_distance ("AGTACGCA", "TATGC")); // -A -G C2T -A
219 TCMP (2, ==, damerau_levenshtein_distance ("a cat", "an act"));
220 TCMP (3, ==, damerau_levenshtein_distance ("a cat", "an abct")); // +n ca2ac +b
221}
222
223} // Anon
T data(T... args)
#define assert_return(expr,...)
Return from the current function if expr is unmet and issue an assertion warning.
Definition internal.hh:29
#define TEST_INTEGRITY(FUNC)
Register func as an integrity test.
Definition internal.hh:77
T min(T... args)
The Anklang C++ API namespace.
Definition api.hh:9
float damerau_levenshtein_distance(const std::string &source, const std::string &target, const float ci, const float cd, const float cs, const float ct)
float damerau_levenshtein_restricted(const std::string &source, const std::string &target, const float ci, const float cd, const float cs, const float ct)
T size(T... args)
typedef uint8_t
typedef ssize_t
#define TCMP(a, cmp, b)
Compare a and b according to operator cmp, verbose on failiure.
Definition testing.hh:23