v / vlib / strings / similarity.v
224 lines · 210 sloc · 5.45 KB · 16f3cb42991b2968e6d3f93328dacd925c1f672a
Raw
1module strings
2
3@[inline]
4fn min(a int, b int, c int) int {
5 mut m := a
6 if b < m {
7 m = b
8 }
9 if c < m {
10 m = c
11 }
12 return m
13}
14
15@[inline]
16fn max2(a int, b int) int {
17 if a < b {
18 return b
19 }
20 return a
21}
22
23@[inline]
24fn min2(a int, b int) int {
25 if a < b {
26 return a
27 }
28 return b
29}
30
31@[inline]
32fn abs2(a int, b int) int {
33 if a < b {
34 return b - a
35 }
36 return a - b
37}
38
39// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
40// the distance between between two strings `a` and `b` (lower is closer).
41@[direct_array_access]
42pub fn levenshtein_distance(a string, b string) int {
43 if a.len == 0 {
44 return b.len
45 }
46 if b.len == 0 {
47 return a.len
48 }
49 if a == b {
50 return 0
51 }
52 mut row := []int{len: a.len + 1, init: index}
53 for i := 1; i < b.len + 1; i++ {
54 mut prev := i
55 for j := 1; j < a.len + 1; j++ {
56 mut current := row[j - 1] // match
57 if b[i - 1] != a[j - 1] {
58 // insertion, substitution, deletion
59 current = min(row[j - 1] + 1, prev + 1, row[j] + 1)
60 }
61 row[j - 1] = prev
62 prev = current
63 }
64 row[a.len] = prev
65 }
66 return row[a.len]
67}
68
69// levenshtein_distance_percentage uses the Levenshtein Distance algorithm to calculate how similar two strings are as a percentage (higher is closer).
70pub fn levenshtein_distance_percentage(a string, b string) f32 {
71 d := levenshtein_distance(a, b)
72 l := if a.len >= b.len { a.len } else { b.len }
73 return (1.00 - f32(d) / f32(l)) * 100.00
74}
75
76// dice_coefficient implements the Sørensen–Dice coefficient.
77// It finds the similarity between two strings, and returns a coefficient
78// between 0.0 (not similar) and 1.0 (exact match).
79pub fn dice_coefficient(s1 string, s2 string) f32 {
80 if s1.len == 0 || s2.len == 0 {
81 return 0.0
82 }
83 if s1 == s2 {
84 return 1.0
85 }
86 if s1.len < 2 || s2.len < 2 {
87 return 0.0
88 }
89 a := if s1.len > s2.len { s1 } else { s2 }
90 b := if a == s1 { s2 } else { s1 }
91 mut first_bigrams := map[string]int{}
92 for i in 0 .. a.len - 1 {
93 bigram := a[i..i + 2]
94 q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
95 first_bigrams[bigram] = q
96 }
97 mut intersection_size := 0
98 for i in 0 .. b.len - 1 {
99 bigram := b[i..i + 2]
100 count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
101 if count > 0 {
102 first_bigrams[bigram] = count - 1
103 intersection_size++
104 }
105 }
106 return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
107}
108
109// hamming_distance uses the Hamming Distance algorithm to calculate
110// the distance between two strings `a` and `b` (lower is closer).
111@[direct_array_access]
112pub fn hamming_distance(a string, b string) int {
113 if a.len == 0 && b.len == 0 {
114 return 0
115 }
116 mut match_len := min2(a.len, b.len)
117 mut diff_count := abs2(a.len, b.len)
118 for i in 0 .. match_len {
119 if a[i] != b[i] {
120 diff_count++
121 }
122 }
123 return diff_count
124}
125
126// hamming_similarity uses the Hamming Distance algorithm to calculate the distance between two strings `a` and `b`.
127// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
128pub fn hamming_similarity(a string, b string) f32 {
129 l := max2(a.len, b.len)
130 if l == 0 {
131 // Both are empty strings, should return 1.0
132 return 1.0
133 }
134 d := hamming_distance(a, b)
135 return 1.00 - f32(d) / f32(l)
136}
137
138// jaro_similarity uses the Jaro Distance algorithm to calculate
139// the distance between two strings `a` and `b`.
140// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
141@[direct_array_access]
142pub fn jaro_similarity(a string, b string) f64 {
143 a_len := a.len
144 b_len := b.len
145 if a_len == 0 && b_len == 0 {
146 // Both are empty strings, should return 1.0
147 return 1.0
148 }
149 if a_len == 0 || b_len == 0 {
150 return 0
151 }
152
153 // Maximum distance upto which matching is allowed
154 match_distance := max2(max2(a_len, b_len) / 2 - 1, 0)
155
156 mut a_matches := []bool{len: a_len}
157 mut b_matches := []bool{len: b_len}
158 mut matches := 0
159 mut transpositions := 0.0
160
161 // Traverse through the first string
162 for i in 0 .. a_len {
163 start := max2(0, i - match_distance)
164 end := min2(b_len, i + match_distance + 1)
165 for k in start .. end {
166 // If there is a match
167 if b_matches[k] {
168 continue
169 }
170 if a[i] != b[k] {
171 continue
172 }
173 a_matches[i] = true
174 b_matches[k] = true
175 matches++
176 break
177 }
178 }
179 // If there is no match
180 if matches == 0 {
181 return 0
182 }
183 mut k := 0
184 // Count number of occurrences where two characters match but
185 // there is a third matched character in between the indices
186 for i in 0 .. a_len {
187 if !a_matches[i] {
188 continue
189 }
190 // Find the next matched character in second string
191 for !b_matches[k] {
192 k++
193 }
194 if a[i] != b[k] {
195 transpositions++
196 }
197 k++
198 }
199 transpositions /= 2
200 return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
201}
202
203// jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate
204// the distance between two strings `a` and `b`.
205// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
206// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
207// similarities, making it especially effective for cases where slight misspellings
208// or prefixes are common.
209@[direct_array_access]
210pub fn jaro_winkler_similarity(a string, b string) f64 {
211 // Maximum of 4 characters are allowed in prefix
212 mut lmax := min2(4, min2(a.len, b.len))
213 mut l := 0
214 for i in 0 .. lmax {
215 if a[i] == b[i] {
216 l++
217 }
218 }
219 js := jaro_similarity(a, b)
220 // select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
221 p := 0.1
222 ws := js + f64(l) * p * (1 - js)
223 return ws
224}
225