v2 / vlib / math / stats / stats.v
545 lines · 507 sloc · 12.41 KB · 844c3327cf77113c8f3a2354e79e86a33647bc09
Raw
1module stats
2
3import math
4
5// freq calculates the Measure of Occurrence
6// Frequency of a given number
7// Based on
8// https://www.mathsisfun.com/data/frequency-distribution.html
9pub fn freq[T](data []T, val T) int {
10 if data.len == 0 {
11 return 0
12 }
13 mut count := 0
14 for v in data {
15 if v == val {
16 count++
17 }
18 }
19 return count
20}
21
22// mean calculates the average
23// of the given input array, sum(data)/data.len
24// Based on
25// https://www.mathsisfun.com/data/central-measures.html
26pub fn mean[T](data []T) T {
27 if data.len == 0 {
28 return T(0)
29 }
30 mut sum := T(0)
31 for v in data {
32 sum += v
33 }
34 return T(sum / data.len)
35}
36
37// geometric_mean calculates the central tendency
38// of the given input array, product(data)**1/data.len
39// Based on
40// https://www.mathsisfun.com/numbers/geometric-mean.html
41pub fn geometric_mean[T](data []T) T {
42 if data.len == 0 {
43 return T(0)
44 }
45 mut sum := T(1)
46 for v in data {
47 sum *= v
48 }
49 $if T is f64 {
50 return math.pow(sum, f64(1.0) / data.len)
51 } $else {
52 // use f32 for f32/int/...
53 return T(math.powf(f32(sum), f32(1.0) / data.len))
54 }
55}
56
57// harmonic_mean calculates the reciprocal of the average of reciprocals
58// of the given input array
59// Based on
60// https://www.mathsisfun.com/numbers/harmonic-mean.html
61pub fn harmonic_mean[T](data []T) T {
62 if data.len == 0 {
63 return T(0)
64 }
65 $if T is f64 {
66 mut sum := f64(0)
67 for v in data {
68 sum += f64(1.0) / v
69 }
70 return f64(f64(data.len) / sum)
71 } $else {
72 // use f32 for f32/int/...
73 mut sum := f32(0)
74 for v in data {
75 sum += f32(1.0) / f32(v)
76 }
77 return T(f32(data.len) / sum)
78 }
79}
80
81// median returns the middlemost value of the given input array ( input array is assumed to be sorted )
82// Based on
83// https://www.mathsisfun.com/data/central-measures.html
84pub fn median[T](sorted_data []T) T {
85 if sorted_data.len == 0 {
86 return T(0)
87 }
88 if sorted_data.len % 2 == 0 {
89 mid := (sorted_data.len / 2) - 1
90 return (sorted_data[mid] + sorted_data[mid + 1]) / T(2)
91 } else {
92 return sorted_data[((sorted_data.len - 1) / 2)]
93 }
94}
95
96// mode calculates the highest occurring value of the given input array
97// Based on
98// https://www.mathsisfun.com/data/central-measures.html
99pub fn mode[T](data []T) T {
100 if data.len == 0 {
101 return T(0)
102 }
103 mut freqs := []int{}
104 for v in data {
105 freqs << freq(data, v)
106 }
107 mut max := 0
108 for i := 0; i < freqs.len; i++ {
109 if freqs[i] > freqs[max] {
110 max = i
111 }
112 }
113 return data[max]
114}
115
116// rms, Root Mean Square, calculates the sqrt of the mean of the squares of the given input array
117// Based on
118// https://en.wikipedia.org/wiki/Root_mean_square
119pub fn rms[T](data []T) T {
120 if data.len == 0 {
121 return T(0)
122 }
123
124 $if T is f64 {
125 mut sum := f64(0)
126 for v in data {
127 sum += math.pow(v, 2)
128 }
129 return math.sqrt(sum / data.len)
130 } $else {
131 // use f32 for f32/int/...
132 mut sum := f32(0)
133 for v in data {
134 sum += math.powf(f32(v), 2)
135 }
136 return T(math.sqrtf(sum / data.len))
137 }
138}
139
140// population_variance is the Measure of Dispersion / Spread
141// of the given input array
142// Based on
143// https://www.mathsisfun.com/data/standard-deviation.html
144@[inline]
145pub fn population_variance[T](data []T) T {
146 if data.len == 0 {
147 return T(0)
148 }
149 data_mean := mean[T](data)
150 return population_variance_mean[T](data, data_mean)
151}
152
153// population_variance_mean is the Measure of Dispersion / Spread
154// of the given input array, with the provided mean
155// Based on
156// https://www.mathsisfun.com/data/standard-deviation.html
157pub fn population_variance_mean[T](data []T, mean T) T {
158 if data.len == 0 {
159 return T(0)
160 }
161
162 mut sum := T(0)
163 for v in data {
164 sum += T((v - mean) * (v - mean))
165 }
166 return T(sum / data.len)
167}
168
169// sample_variance calculates the spread of dataset around the mean
170// Based on
171// https://www.mathsisfun.com/data/standard-deviation.html
172@[inline]
173pub fn sample_variance[T](data []T) T {
174 if data.len == 0 {
175 return T(0)
176 }
177 data_mean := mean[T](data)
178 return sample_variance_mean[T](data, data_mean)
179}
180
181// sample_variance calculates the spread of dataset around the provided mean
182// Based on
183// https://www.mathsisfun.com/data/standard-deviation.html
184pub fn sample_variance_mean[T](data []T, mean T) T {
185 if data.len == 0 {
186 return T(0)
187 }
188 mut sum := T(0)
189 for v in data {
190 sum += T((v - mean) * (v - mean))
191 }
192 return T(sum / (data.len - 1))
193}
194
195// population_stddev calculates how spread out the dataset is
196// Based on
197// https://www.mathsisfun.com/data/standard-deviation.html
198@[inline]
199pub fn population_stddev[T](data []T) T {
200 if data.len == 0 {
201 return T(0)
202 }
203 $if T is f64 {
204 return math.sqrt(population_variance[T](data))
205 } $else {
206 return T(math.sqrtf(f32(population_variance[T](data))))
207 }
208}
209
210// population_stddev_mean calculates how spread out the dataset is, with the provide mean
211// Based on
212// https://www.mathsisfun.com/data/standard-deviation.html
213@[inline]
214pub fn population_stddev_mean[T](data []T, mean T) T {
215 if data.len == 0 {
216 return T(0)
217 }
218 $if T is f64 {
219 return math.sqrt(population_variance_mean[T](data, mean))
220 } $else {
221 return T(math.sqrtf(f32(population_variance_mean[T](data, mean))))
222 }
223}
224
225// Measure of Dispersion / Spread
226// Sample Standard Deviation of the given input array
227// Based on
228// https://www.mathsisfun.com/data/standard-deviation.html
229@[inline]
230pub fn sample_stddev[T](data []T) T {
231 if data.len == 0 {
232 return T(0)
233 }
234 $if T is f64 {
235 return math.sqrt(sample_variance[T](data))
236 } $else {
237 return T(math.sqrtf(f32(sample_variance[T](data))))
238 }
239}
240
241// Measure of Dispersion / Spread
242// Sample Standard Deviation of the given input array
243// Based on
244// https://www.mathsisfun.com/data/standard-deviation.html
245@[inline]
246pub fn sample_stddev_mean[T](data []T, mean T) T {
247 if data.len == 0 {
248 return T(0)
249 }
250 $if T is f64 {
251 return math.sqrt(sample_variance_mean[T](data, mean))
252 } $else {
253 return T(math.sqrtf(sample_variance_mean[T](data, mean)))
254 }
255}
256
257// absdev calculates the average distance between each data point and the mean
258// Based on
259// https://en.wikipedia.org/wiki/Average_absolute_deviation
260@[inline]
261pub fn absdev[T](data []T) T {
262 if data.len == 0 {
263 return T(0)
264 }
265 data_mean := mean[T](data)
266 return absdev_mean[T](data, data_mean)
267}
268
269// absdev_mean calculates the average distance between each data point and the provided mean
270// Based on
271// https://en.wikipedia.org/wiki/Average_absolute_deviation
272pub fn absdev_mean[T](data []T, mean T) T {
273 if data.len == 0 {
274 return T(0)
275 }
276 mut sum := T(0)
277 for v in data {
278 sum += math.abs(v - mean)
279 }
280 return T(sum / data.len)
281}
282
283// tts, Sum of squares, calculates the sum over all squared differences between values and overall mean
284@[inline]
285pub fn tss[T](data []T) T {
286 if data.len == 0 {
287 return T(0)
288 }
289 data_mean := mean[T](data)
290 return tss_mean[T](data, data_mean)
291}
292
293// tts_mean, Sum of squares, calculates the sum over all squared differences between values and the provided mean
294pub fn tss_mean[T](data []T, mean T) T {
295 if data.len == 0 {
296 return T(0)
297 }
298 mut tss := T(0)
299 for v in data {
300 tss += T((v - mean) * (v - mean))
301 }
302 return tss
303}
304
305// min finds the minimum value from the dataset
306pub fn min[T](data []T) T {
307 if data.len == 0 {
308 return T(0)
309 }
310 mut min := data[0]
311 for v in data {
312 if v < min {
313 min = v
314 }
315 }
316 return min
317}
318
319// max finds the maximum value from the dataset
320pub fn max[T](data []T) T {
321 if data.len == 0 {
322 return T(0)
323 }
324 mut max := data[0]
325 for v in data {
326 if v > max {
327 max = v
328 }
329 }
330 return max
331}
332
333// minmax finds the minimum and maximum value from the dataset
334pub fn minmax[T](data []T) (T, T) {
335 if data.len == 0 {
336 return T(0), T(0)
337 }
338 mut max := data[0]
339 mut min := data[0]
340 for v in data[1..] {
341 if v > max {
342 max = v
343 }
344 if v < min {
345 min = v
346 }
347 }
348 return min, max
349}
350
351// min_index finds the first index of the minimum value
352pub fn min_index[T](data []T) int {
353 if data.len == 0 {
354 return 0
355 }
356 mut min := data[0]
357 mut min_index := 0
358 for i, v in data {
359 if v < min {
360 min = v
361 min_index = i
362 }
363 }
364 return min_index
365}
366
367// max_index finds the first index of the maximum value
368pub fn max_index[T](data []T) int {
369 if data.len == 0 {
370 return 0
371 }
372 mut max := data[0]
373 mut max_index := 0
374 for i, v in data {
375 if v > max {
376 max = v
377 max_index = i
378 }
379 }
380 return max_index
381}
382
383// minmax_index finds the first index of the minimum and maximum value
384pub fn minmax_index[T](data []T) (int, int) {
385 if data.len == 0 {
386 return 0, 0
387 }
388 mut min := data[0]
389 mut max := data[0]
390 mut min_index := 0
391 mut max_index := 0
392 for i, v in data {
393 if v < min {
394 min = v
395 min_index = i
396 }
397 if v > max {
398 max = v
399 max_index = i
400 }
401 }
402 return min_index, max_index
403}
404
405// range calculates the difference between the min and max
406// Range ( Maximum - Minimum ) of the given input array
407// Based on
408// https://www.mathsisfun.com/data/range.html
409pub fn range[T](data []T) T {
410 if data.len == 0 {
411 return T(0)
412 }
413 min, max := minmax[T](data)
414 return max - min
415}
416
417// covariance calculates directional association between datasets
418// positive value denotes variables move in same direction and negative denotes variables move in opposite directions
419@[inline]
420pub fn covariance[T](data1 []T, data2 []T) T {
421 mean1 := mean[T](data1)
422 mean2 := mean[T](data2)
423 return covariance_mean[T](data1, data2, mean1, mean2)
424}
425
426// covariance_mean computes the covariance of a dataset with means provided
427// the recurrence relation
428pub fn covariance_mean[T](data1 []T, data2 []T, mean1 T, mean2 T) T {
429 n := int(math.min(data1.len, data2.len))
430 if n == 0 {
431 return T(0)
432 }
433 mut covariance := T(0)
434 for i in 0 .. n {
435 delta1 := data1[i] - mean1
436 delta2 := data2[i] - mean2
437 covariance += T((delta1 * delta2 - covariance) / (T(i) + T(1)))
438 }
439 return covariance
440}
441
442// lag1_autocorrelation_mean calculates the correlation between values that are one time period apart
443// of a dataset, based on the mean
444@[inline]
445pub fn lag1_autocorrelation[T](data []T) T {
446 data_mean := mean[T](data)
447 return lag1_autocorrelation_mean[T](data, data_mean)
448}
449
450// lag1_autocorrelation_mean calculates the correlation between values that are one time period apart
451// of a dataset, using
452// the recurrence relation
453pub fn lag1_autocorrelation_mean[T](data []T, mean T) T {
454 if data.len == 0 {
455 return T(0)
456 }
457 mut q := T(0)
458 mut v := (data[0] * mean) - (data[0] * mean)
459 for i := 1; i < data.len; i++ {
460 delta0 := data[i - 1] - mean
461 delta1 := data[i] - mean
462 d01 := delta0 * delta1
463 d11 := delta1 * delta1
464 ti1 := T(i) + T(1)
465 q += T((d01 - q) / ti1)
466 v += T((d11 - v) / ti1)
467 }
468 return T(q / v)
469}
470
471// kurtosis calculates the measure of the 'tailedness' of the data by finding mean and standard of deviation
472@[inline]
473pub fn kurtosis[T](data []T) T {
474 data_mean := mean[T](data)
475 sd := population_stddev_mean[T](data, data_mean)
476 return kurtosis_mean_stddev[T](data, data_mean, sd)
477}
478
479// kurtosis_mean_stddev calculates the measure of the 'tailedness' of the data
480// using the fourth moment the deviations, normalized by the sd
481pub fn kurtosis_mean_stddev[T](data []T, mean T, sd T) T {
482 if data.len == 0 {
483 return T(0)
484 }
485 mut avg := T(0) // find the fourth moment the deviations, normalized by the sd
486 /*
487 we use a recurrence relation to stably update a running value so
488 * there aren't any large sums that can overflow
489 */
490 for i, v in data {
491 x := (v - mean) / sd
492 x4 := x * x * x * x
493 ti1 := (T(i) + T(1))
494 avg += T((x4 - avg) / ti1)
495 }
496 return avg - T(3)
497}
498
499// skew calculates the mean and standard of deviation to find the skew from the data
500@[inline]
501pub fn skew[T](data []T) T {
502 data_mean := mean[T](data)
503 sd := population_stddev_mean[T](data, data_mean)
504 return skew_mean_stddev[T](data, data_mean, sd)
505}
506
507// skew_mean_stddev calculates the skewness of data
508pub fn skew_mean_stddev[T](data []T, mean T, sd T) T {
509 if data.len == 0 {
510 return T(0)
511 }
512 mut skew := T(0) // find the sum of the cubed deviations, normalized by the sd.
513 /*
514 we use a recurrence relation to stably update a running value so
515 * there aren't any large sums that can overflow
516 */
517 for i, v in data {
518 x := (v - mean) / sd
519 x3 := x * x * x
520 skew += T((x3 - skew) / (T(i) + T(1)))
521 }
522 return skew
523}
524
525// quantile calculates quantile points
526// for more reference
527// https://en.wikipedia.org/wiki/Quantile
528pub fn quantile[T](sorted_data []T, f T) !T {
529 if sorted_data.len == 0 {
530 return T(0)
531 }
532 index := f * (sorted_data.len - 1)
533 lhs := int(index)
534 if lhs < 0 || lhs >= sorted_data.len {
535 return error('index out of range')
536 } else if lhs == sorted_data.len - 1 {
537 return sorted_data[lhs]
538 } else {
539 if lhs >= sorted_data.len - 1 {
540 return error('index out of range')
541 }
542 delta := index - T(lhs)
543 return T((1 - delta) * sorted_data[lhs] + delta * sorted_data[(lhs + 1)])
544 }
545}
546