Gitly


1 module strings
2 
3 @[inline]
4 fn min(a int, b int, c int) int {
5     mut m := a
6     if b < m {
7         m = b
8     }
9     if c < m {
10         m = c
11     }
12     return m
13 }
14 
15 @[inline]
16 fn max2(a int, b int) int {
17     if a < b {
18         return b
19     }
20     return a
21 }
22 
23 @[inline]
24 fn min2(a int, b int) int {
25     if a < b {
26         return a
27     }
28     return b
29 }
30 
31 @[inline]
32 fn abs2(a int, b int) int {
33     if a < b {
34         return b - a
35     }
36     return a - b
37 }
38 
39 // levenshtein_distance uses the Levenshtein Distance algorithm to calculate
40 // the distance between between two strings `a` and `b` (lower is closer).
41 @[direct_array_access]
42 pub fn levenshtein_distance(a string, b string) int {
43     if a.len == 0 {
44         return b.len
45     }
46     if b.len == 0 {
47         return a.len
48     }
49     if a == b {
50         return 0
51     }
52     mut row := []int{len: a.len + 1, init: index}
53     for i := 1; i < b.len + 1; i++ {
54         mut prev := i
55         for j := 1; j < a.len + 1; j++ {
56             mut current := row[j - 1] // match
57             if b[i - 1] != a[j - 1] {
58                 // insertion, substitution, deletion
59                 current = min(row[j - 1] + 1, prev + 1, row[j] + 1)
60             }
61             row[j - 1] = prev
62             prev = current
63         }
64         row[a.len] = prev
65     }
66     return row[a.len]
67 }
68 
69 // levenshtein_distance_percentage uses the Levenshtein Distance algorithm to calculate how similar two strings are as a percentage (higher is closer).
70 pub fn levenshtein_distance_percentage(a string, b string) f32 {
71     d := levenshtein_distance(a, b)
72     l := if a.len >= b.len { a.len } else { b.len }
73     return (1.00 - f32(d) / f32(l)) * 100.00
74 }
75 
76 // dice_coefficient implements the Sørensen–Dice coefficient.
77 // It finds the similarity between two strings, and returns a coefficient
78 // between 0.0 (not similar) and 1.0 (exact match).
79 pub fn dice_coefficient(s1 string, s2 string) f32 {
80     if s1.len == 0 || s2.len == 0 {
81         return 0.0
82     }
83     if s1 == s2 {
84         return 1.0
85     }
86     if s1.len < 2 || s2.len < 2 {
87         return 0.0
88     }
89     a := if s1.len > s2.len { s1 } else { s2 }
90     b := if a == s1 { s2 } else { s1 }
91     mut first_bigrams := map[string]int{}
92     for i in 0 .. a.len - 1 {
93         bigram := a[i..i + 2]
94         q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
95         first_bigrams[bigram] = q
96     }
97     mut intersection_size := 0
98     for i in 0 .. b.len - 1 {
99         bigram := b[i..i + 2]
100         count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
101         if count > 0 {
102             first_bigrams[bigram] = count - 1
103             intersection_size++
104         }
105     }
106     return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
107 }
108 
109 // hamming_distance uses the Hamming Distance algorithm to calculate
110 // the distance between two strings `a` and `b` (lower is closer).
111 @[direct_array_access]
112 pub fn hamming_distance(a string, b string) int {
113     if a.len == 0 && b.len == 0 {
114         return 0
115     }
116     mut match_len := min2(a.len, b.len)
117     mut diff_count := abs2(a.len, b.len)
118     for i in 0 .. match_len {
119         if a[i] != b[i] {
120             diff_count++
121         }
122     }
123     return diff_count
124 }
125 
126 // hamming_similarity uses the Hamming Distance algorithm to calculate the distance between two strings `a` and `b`.
127 // It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
128 pub fn hamming_similarity(a string, b string) f32 {
129     l := max2(a.len, b.len)
130     if l == 0 {
131         // Both are empty strings, should return 1.0
132         return 1.0
133     }
134     d := hamming_distance(a, b)
135     return 1.00 - f32(d) / f32(l)
136 }
137 
138 // jaro_similarity uses the Jaro Distance algorithm to calculate
139 // the distance between two strings `a` and `b`.
140 // It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
141 @[direct_array_access]
142 pub fn jaro_similarity(a string, b string) f64 {
143     a_len := a.len
144     b_len := b.len
145     if a_len == 0 && b_len == 0 {
146         // Both are empty strings, should return 1.0
147         return 1.0
148     }
149     if a_len == 0 || b_len == 0 {
150         return 0
151     }
152 
153     // Maximum distance upto which matching is allowed
154     match_distance := max2(max2(a_len, b_len) / 2 - 1, 0)
155 
156     mut a_matches := []bool{len: a_len}
157     mut b_matches := []bool{len: b_len}
158     mut matches := 0
159     mut transpositions := 0.0
160 
161     // Traverse through the first string
162     for i in 0 .. a_len {
163         start := max2(0, i - match_distance)
164         end := min2(b_len, i + match_distance + 1)
165         for k in start .. end {
166             // If there is a match
167             if b_matches[k] {
168                 continue
169             }
170             if a[i] != b[k] {
171                 continue
172             }
173             a_matches[i] = true
174             b_matches[k] = true
175             matches++
176             break
177         }
178     }
179     // If there is no match
180     if matches == 0 {
181         return 0
182     }
183     mut k := 0
184     // Count number of occurrences where two characters match but
185     // there is a third matched character in between the indices
186     for i in 0 .. a_len {
187         if !a_matches[i] {
188             continue
189         }
190         // Find the next matched character in second string
191         for !b_matches[k] {
192             k++
193         }
194         if a[i] != b[k] {
195             transpositions++
196         }
197         k++
198     }
199     transpositions /= 2
200     return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
201 }
202 
203 // jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate
204 // the distance between two strings `a` and `b`.
205 // It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
206 // The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
207 // similarities, making it especially effective for cases where slight misspellings
208 // or prefixes are common.
209 @[direct_array_access]
210 pub fn jaro_winkler_similarity(a string, b string) f64 {
211     // Maximum of 4 characters are allowed in prefix
212     mut lmax := min2(4, min2(a.len, b.len))
213     mut l := 0
214     for i in 0 .. lmax {
215         if a[i] == b[i] {
216             l++
217         }
218     }
219     js := jaro_similarity(a, b)
220     // select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
221     p := 0.1
222     ws := js + f64(l) * p * (1 - js)
223     return ws
224 }
225

1	module strings
2
3	@[inline]
4	fn min(a int, b int, c int) int {
5	mut m := a
6	if b < m {
7	m = b
8	}
9	if c < m {
10	m = c
11	}
12	return m
13	}
14
15	@[inline]
16	fn max2(a int, b int) int {
17	if a < b {
18	return b
19	}
20	return a
21	}
22
23	@[inline]
24	fn min2(a int, b int) int {
25	if a < b {
26	return a
27	}
28	return b
29	}
30
31	@[inline]
32	fn abs2(a int, b int) int {
33	if a < b {
34	return b - a
35	}
36	return a - b
37	}
38
39	// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
40	// the distance between between two strings `a` and `b` (lower is closer).
41	@[direct_array_access]
42	pub fn levenshtein_distance(a string, b string) int {
43	if a.len == 0 {
44	return b.len
45	}
46	if b.len == 0 {
47	return a.len
48	}
49	if a == b {
50	return 0
51	}
52	mut row := []int{len: a.len + 1, init: index}
53	for i := 1; i < b.len + 1; i++ {
54	mut prev := i
55	for j := 1; j < a.len + 1; j++ {
56	mut current := row[j - 1] // match
57	if b[i - 1] != a[j - 1] {
58	// insertion, substitution, deletion
59	current = min(row[j - 1] + 1, prev + 1, row[j] + 1)
60	}
61	row[j - 1] = prev
62	prev = current
63	}
64	row[a.len] = prev
65	}
66	return row[a.len]
67	}
68
69	// levenshtein_distance_percentage uses the Levenshtein Distance algorithm to calculate how similar two strings are as a percentage (higher is closer).
70	pub fn levenshtein_distance_percentage(a string, b string) f32 {
71	d := levenshtein_distance(a, b)
72	l := if a.len >= b.len { a.len } else { b.len }
73	return (1.00 - f32(d) / f32(l)) * 100.00
74	}
75
76	// dice_coefficient implements the Sørensen–Dice coefficient.
77	// It finds the similarity between two strings, and returns a coefficient
78	// between 0.0 (not similar) and 1.0 (exact match).
79	pub fn dice_coefficient(s1 string, s2 string) f32 {
80	if s1.len == 0 \|\| s2.len == 0 {
81	return 0.0
82	}
83	if s1 == s2 {
84	return 1.0
85	}
86	if s1.len < 2 \|\| s2.len < 2 {
87	return 0.0
88	}
89	a := if s1.len > s2.len { s1 } else { s2 }
90	b := if a == s1 { s2 } else { s1 }
91	mut first_bigrams := map[string]int{}
92	for i in 0 .. a.len - 1 {
93	bigram := a[i..i + 2]
94	q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
95	first_bigrams[bigram] = q
96	}
97	mut intersection_size := 0
98	for i in 0 .. b.len - 1 {
99	bigram := b[i..i + 2]
100	count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
101	if count > 0 {
102	first_bigrams[bigram] = count - 1
103	intersection_size++
104	}
105	}
106	return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
107	}
108
109	// hamming_distance uses the Hamming Distance algorithm to calculate
110	// the distance between two strings `a` and `b` (lower is closer).
111	@[direct_array_access]
112	pub fn hamming_distance(a string, b string) int {
113	if a.len == 0 && b.len == 0 {
114	return 0
115	}
116	mut match_len := min2(a.len, b.len)
117	mut diff_count := abs2(a.len, b.len)
118	for i in 0 .. match_len {
119	if a[i] != b[i] {
120	diff_count++
121	}
122	}
123	return diff_count
124	}
125
126	// hamming_similarity uses the Hamming Distance algorithm to calculate the distance between two strings `a` and `b`.
127	// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
128	pub fn hamming_similarity(a string, b string) f32 {
129	l := max2(a.len, b.len)
130	if l == 0 {
131	// Both are empty strings, should return 1.0
132	return 1.0
133	}
134	d := hamming_distance(a, b)
135	return 1.00 - f32(d) / f32(l)
136	}
137
138	// jaro_similarity uses the Jaro Distance algorithm to calculate
139	// the distance between two strings `a` and `b`.
140	// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
141	@[direct_array_access]
142	pub fn jaro_similarity(a string, b string) f64 {
143	a_len := a.len
144	b_len := b.len
145	if a_len == 0 && b_len == 0 {
146	// Both are empty strings, should return 1.0
147	return 1.0
148	}
149	if a_len == 0 \|\| b_len == 0 {
150	return 0
151	}
152
153	// Maximum distance upto which matching is allowed
154	match_distance := max2(max2(a_len, b_len) / 2 - 1, 0)
155
156	mut a_matches := []bool{len: a_len}
157	mut b_matches := []bool{len: b_len}
158	mut matches := 0
159	mut transpositions := 0.0
160
161	// Traverse through the first string
162	for i in 0 .. a_len {
163	start := max2(0, i - match_distance)
164	end := min2(b_len, i + match_distance + 1)
165	for k in start .. end {
166	// If there is a match
167	if b_matches[k] {
168	continue
169	}
170	if a[i] != b[k] {
171	continue
172	}
173	a_matches[i] = true
174	b_matches[k] = true
175	matches++
176	break
177	}
178	}
179	// If there is no match
180	if matches == 0 {
181	return 0
182	}
183	mut k := 0
184	// Count number of occurrences where two characters match but
185	// there is a third matched character in between the indices
186	for i in 0 .. a_len {
187	if !a_matches[i] {
188	continue
189	}
190	// Find the next matched character in second string
191	for !b_matches[k] {
192	k++
193	}
194	if a[i] != b[k] {
195	transpositions++
196	}
197	k++
198	}
199	transpositions /= 2
200	return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
201	}
202
203	// jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate
204	// the distance between two strings `a` and `b`.
205	// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
206	// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
207	// similarities, making it especially effective for cases where slight misspellings
208	// or prefixes are common.
209	@[direct_array_access]
210	pub fn jaro_winkler_similarity(a string, b string) f64 {
211	// Maximum of 4 characters are allowed in prefix
212	mut lmax := min2(4, min2(a.len, b.len))
213	mut l := 0
214	for i in 0 .. lmax {
215	if a[i] == b[i] {
216	l++
217	}
218	}
219	js := jaro_similarity(a, b)
220	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
221	p := 0.1
222	ws := js + f64(l) * p * (1 - js)
223	return ws
224	}
225