| 1 | module strings |
| 2 | |
| 3 | @[inline] |
| 4 | fn min(a int, b int, c int) int { |
| 5 | mut m := a |
| 6 | if b < m { |
| 7 | m = b |
| 8 | } |
| 9 | if c < m { |
| 10 | m = c |
| 11 | } |
| 12 | return m |
| 13 | } |
| 14 | |
| 15 | @[inline] |
| 16 | fn max2(a int, b int) int { |
| 17 | if a < b { |
| 18 | return b |
| 19 | } |
| 20 | return a |
| 21 | } |
| 22 | |
| 23 | @[inline] |
| 24 | fn min2(a int, b int) int { |
| 25 | if a < b { |
| 26 | return a |
| 27 | } |
| 28 | return b |
| 29 | } |
| 30 | |
| 31 | @[inline] |
| 32 | fn abs2(a int, b int) int { |
| 33 | if a < b { |
| 34 | return b - a |
| 35 | } |
| 36 | return a - b |
| 37 | } |
| 38 | |
| 39 | // levenshtein_distance uses the Levenshtein Distance algorithm to calculate |
| 40 | // the distance between between two strings `a` and `b` (lower is closer). |
| 41 | @[direct_array_access] |
| 42 | pub fn levenshtein_distance(a string, b string) int { |
| 43 | if a.len == 0 { |
| 44 | return b.len |
| 45 | } |
| 46 | if b.len == 0 { |
| 47 | return a.len |
| 48 | } |
| 49 | if a == b { |
| 50 | return 0 |
| 51 | } |
| 52 | mut row := []int{len: a.len + 1, init: index} |
| 53 | for i := 1; i < b.len + 1; i++ { |
| 54 | mut prev := i |
| 55 | for j := 1; j < a.len + 1; j++ { |
| 56 | mut current := row[j - 1] // match |
| 57 | if b[i - 1] != a[j - 1] { |
| 58 | // insertion, substitution, deletion |
| 59 | current = min(row[j - 1] + 1, prev + 1, row[j] + 1) |
| 60 | } |
| 61 | row[j - 1] = prev |
| 62 | prev = current |
| 63 | } |
| 64 | row[a.len] = prev |
| 65 | } |
| 66 | return row[a.len] |
| 67 | } |
| 68 | |
| 69 | // levenshtein_distance_percentage uses the Levenshtein Distance algorithm to calculate how similar two strings are as a percentage (higher is closer). |
| 70 | pub fn levenshtein_distance_percentage(a string, b string) f32 { |
| 71 | d := levenshtein_distance(a, b) |
| 72 | l := if a.len >= b.len { a.len } else { b.len } |
| 73 | return (1.00 - f32(d) / f32(l)) * 100.00 |
| 74 | } |
| 75 | |
| 76 | // dice_coefficient implements the Sørensen–Dice coefficient. |
| 77 | // It finds the similarity between two strings, and returns a coefficient |
| 78 | // between 0.0 (not similar) and 1.0 (exact match). |
| 79 | pub fn dice_coefficient(s1 string, s2 string) f32 { |
| 80 | if s1.len == 0 || s2.len == 0 { |
| 81 | return 0.0 |
| 82 | } |
| 83 | if s1 == s2 { |
| 84 | return 1.0 |
| 85 | } |
| 86 | if s1.len < 2 || s2.len < 2 { |
| 87 | return 0.0 |
| 88 | } |
| 89 | a := if s1.len > s2.len { s1 } else { s2 } |
| 90 | b := if a == s1 { s2 } else { s1 } |
| 91 | mut first_bigrams := map[string]int{} |
| 92 | for i in 0 .. a.len - 1 { |
| 93 | bigram := a[i..i + 2] |
| 94 | q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 } |
| 95 | first_bigrams[bigram] = q |
| 96 | } |
| 97 | mut intersection_size := 0 |
| 98 | for i in 0 .. b.len - 1 { |
| 99 | bigram := b[i..i + 2] |
| 100 | count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 } |
| 101 | if count > 0 { |
| 102 | first_bigrams[bigram] = count - 1 |
| 103 | intersection_size++ |
| 104 | } |
| 105 | } |
| 106 | return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2) |
| 107 | } |
| 108 | |
| 109 | // hamming_distance uses the Hamming Distance algorithm to calculate |
| 110 | // the distance between two strings `a` and `b` (lower is closer). |
| 111 | @[direct_array_access] |
| 112 | pub fn hamming_distance(a string, b string) int { |
| 113 | if a.len == 0 && b.len == 0 { |
| 114 | return 0 |
| 115 | } |
| 116 | mut match_len := min2(a.len, b.len) |
| 117 | mut diff_count := abs2(a.len, b.len) |
| 118 | for i in 0 .. match_len { |
| 119 | if a[i] != b[i] { |
| 120 | diff_count++ |
| 121 | } |
| 122 | } |
| 123 | return diff_count |
| 124 | } |
| 125 | |
| 126 | // hamming_similarity uses the Hamming Distance algorithm to calculate the distance between two strings `a` and `b`. |
| 127 | // It returns a coefficient between 0.0 (not similar) and 1.0 (exact match). |
| 128 | pub fn hamming_similarity(a string, b string) f32 { |
| 129 | l := max2(a.len, b.len) |
| 130 | if l == 0 { |
| 131 | // Both are empty strings, should return 1.0 |
| 132 | return 1.0 |
| 133 | } |
| 134 | d := hamming_distance(a, b) |
| 135 | return 1.00 - f32(d) / f32(l) |
| 136 | } |
| 137 | |
| 138 | // jaro_similarity uses the Jaro Distance algorithm to calculate |
| 139 | // the distance between two strings `a` and `b`. |
| 140 | // It returns a coefficient between 0.0 (not similar) and 1.0 (exact match). |
| 141 | @[direct_array_access] |
| 142 | pub fn jaro_similarity(a string, b string) f64 { |
| 143 | a_len := a.len |
| 144 | b_len := b.len |
| 145 | if a_len == 0 && b_len == 0 { |
| 146 | // Both are empty strings, should return 1.0 |
| 147 | return 1.0 |
| 148 | } |
| 149 | if a_len == 0 || b_len == 0 { |
| 150 | return 0 |
| 151 | } |
| 152 | |
| 153 | // Maximum distance upto which matching is allowed |
| 154 | match_distance := max2(max2(a_len, b_len) / 2 - 1, 0) |
| 155 | |
| 156 | mut a_matches := []bool{len: a_len} |
| 157 | mut b_matches := []bool{len: b_len} |
| 158 | mut matches := 0 |
| 159 | mut transpositions := 0.0 |
| 160 | |
| 161 | // Traverse through the first string |
| 162 | for i in 0 .. a_len { |
| 163 | start := max2(0, i - match_distance) |
| 164 | end := min2(b_len, i + match_distance + 1) |
| 165 | for k in start .. end { |
| 166 | // If there is a match |
| 167 | if b_matches[k] { |
| 168 | continue |
| 169 | } |
| 170 | if a[i] != b[k] { |
| 171 | continue |
| 172 | } |
| 173 | a_matches[i] = true |
| 174 | b_matches[k] = true |
| 175 | matches++ |
| 176 | break |
| 177 | } |
| 178 | } |
| 179 | // If there is no match |
| 180 | if matches == 0 { |
| 181 | return 0 |
| 182 | } |
| 183 | mut k := 0 |
| 184 | // Count number of occurrences where two characters match but |
| 185 | // there is a third matched character in between the indices |
| 186 | for i in 0 .. a_len { |
| 187 | if !a_matches[i] { |
| 188 | continue |
| 189 | } |
| 190 | // Find the next matched character in second string |
| 191 | for !b_matches[k] { |
| 192 | k++ |
| 193 | } |
| 194 | if a[i] != b[k] { |
| 195 | transpositions++ |
| 196 | } |
| 197 | k++ |
| 198 | } |
| 199 | transpositions /= 2 |
| 200 | return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3 |
| 201 | } |
| 202 | |
| 203 | // jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate |
| 204 | // the distance between two strings `a` and `b`. |
| 205 | // It returns a coefficient between 0.0 (not similar) and 1.0 (exact match). |
| 206 | // The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix |
| 207 | // similarities, making it especially effective for cases where slight misspellings |
| 208 | // or prefixes are common. |
| 209 | @[direct_array_access] |
| 210 | pub fn jaro_winkler_similarity(a string, b string) f64 { |
| 211 | // Maximum of 4 characters are allowed in prefix |
| 212 | mut lmax := min2(4, min2(a.len, b.len)) |
| 213 | mut l := 0 |
| 214 | for i in 0 .. lmax { |
| 215 | if a[i] == b[i] { |
| 216 | l++ |
| 217 | } |
| 218 | } |
| 219 | js := jaro_similarity(a, b) |
| 220 | // select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity |
| 221 | p := 0.1 |
| 222 | ws := js + f64(l) * p * (1 - js) |
| 223 | return ws |
| 224 | } |
| 225 | |