1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00
v/vlib/math/stats/stats.v

485 lines
11 KiB
V
Raw Permalink Normal View History

module stats
import math
2022-05-20 08:45:54 +03:00
// freq calculates the Measure of Occurrence
// Frequency of a given number
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/frequency-distribution.html
// The result is the number of elements of `data` that compare equal to `val`;
// an empty `data` therefore yields 0.
pub fn freq[T](data []T, val T) int {
	mut occurrences := 0
	for i := 0; i < data.len; i++ {
		if data[i] != val {
			continue
		}
		occurrences++
	}
	return occurrences
}
2022-05-20 08:45:54 +03:00
// mean calculates the average
// of the given input array, sum(data)/data.len
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/central-measures.html
// An empty array yields T(0). Both the running total and the final division
// are carried out in T.
pub fn mean[T](data []T) T {
	mut total := T(0)
	if data.len == 0 {
		return total
	}
	for i in 0 .. data.len {
		total += data[i]
	}
	count := T(data.len)
	return total / count
}
2022-05-20 08:45:54 +03:00
// geometric_mean calculates the central tendency
// of the given input array, product(data)**(1/data.len)
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/numbers/geometric-mean.html
// The product is accumulated in f64 and the n-th root is taken with math.pow;
// the result is converted back to T explicitly, so the function is usable with
// element types other than f64. An empty array yields T(0).
pub fn geometric_mean[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	mut product := f64(1.0)
	for v in data {
		product *= f64(v)
	}
	// n-th root of the product, n = data.len
	return T(math.pow(product, 1.0 / f64(data.len)))
}
2022-05-20 08:45:54 +03:00
// harmonic_mean calculates the reciprocal of the average of reciprocals
// of the given input array
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/numbers/harmonic-mean.html
// Computes data.len divided by the sum of `1.0 / v` over all elements;
// an empty array yields T(0).
pub fn harmonic_mean[T](data []T) T {
	n := data.len
	if n == 0 {
		return T(0)
	}
	mut reciprocal_sum := T(0)
	for i in 0 .. n {
		reciprocal_sum += 1.0 / data[i]
	}
	return T(n) / reciprocal_sum
}
2022-05-20 08:45:54 +03:00
// median returns the middlemost value of the given input array ( input array is assumed to be sorted )
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/central-measures.html
// For an odd length the middle element is returned; for an even length the two
// middle elements are averaged in T. An empty array yields T(0).
// The input is not sorted here.
pub fn median[T](sorted_data []T) T {
	n := sorted_data.len
	if n == 0 {
		return T(0)
	}
	if n % 2 != 0 {
		return sorted_data[(n - 1) / 2]
	}
	upper := n / 2
	return (sorted_data[upper - 1] + sorted_data[upper]) / T(2)
}
2022-05-20 08:45:54 +03:00
// mode calculates the highest occurring value of the given input array
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/central-measures.html
// When several values share the highest frequency, the one that appears first
// in `data` is returned. An empty array yields T(0).
pub fn mode[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	// Track the best candidate while scanning instead of materialising an
	// intermediate array of frequencies.
	mut best_index := 0
	mut best_count := 0
	for i, v in data {
		count := freq(data, v)
		// strict comparison keeps the earliest element on ties
		if count > best_count {
			best_count = count
			best_index = i
		}
	}
	return data[best_index]
}
2022-05-20 08:45:54 +03:00
// rms, Root Mean Square, calculates the sqrt of the mean of the squares of the given input array
2020-04-26 14:49:31 +03:00
// Based on
// https://en.wikipedia.org/wiki/Root_mean_square
// The squares are accumulated in f64 and the square root is converted back to T
// explicitly, matching the `T(math.sqrt(f64(...)))` convention of the stddev
// helpers in this module. An empty array yields T(0).
pub fn rms[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	mut sum_of_squares := f64(0)
	for v in data {
		x := f64(v)
		sum_of_squares += x * x
	}
	return T(math.sqrt(sum_of_squares / f64(data.len)))
}
2022-05-20 08:45:54 +03:00
// population_variance is the Measure of Dispersion / Spread
// of the given input array. It computes the mean of `data` and delegates to
// population_variance_mean; an empty array yields T(0).
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
[inline]
pub fn population_variance[T](data []T) T {
	if data.len > 0 {
		return population_variance_mean[T](data, mean[T](data))
	}
	return T(0)
}
2022-05-20 08:45:54 +03:00
// population_variance_mean is the Measure of Dispersion / Spread
// of the given input array, with the provided mean
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
// Sums the squared deviations from `mean` and divides by data.len;
// an empty array yields T(0).
pub fn population_variance_mean[T](data []T, mean T) T {
	if data.len == 0 {
		return T(0)
	}
	mut squared_sum := T(0)
	for i in 0 .. data.len {
		squared_sum += (data[i] - mean) * (data[i] - mean)
	}
	return squared_sum / T(data.len)
}
2022-05-20 08:45:54 +03:00
// sample_variance calculates the spread of dataset around the mean
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
// Computes the mean of `data` and delegates to sample_variance_mean;
// an empty array yields T(0).
[inline]
pub fn sample_variance[T](data []T) T {
	if data.len > 0 {
		return sample_variance_mean[T](data, mean[T](data))
	}
	return T(0)
}
2022-05-20 08:45:54 +03:00
// sample_variance_mean calculates the spread of dataset around the provided mean,
// dividing the summed squared deviations by (data.len - 1).
// With fewer than two elements that divisor would be zero or negative, so T(0)
// is returned, in line with the module's convention for degenerate input.
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
pub fn sample_variance_mean[T](data []T, mean T) T {
	if data.len < 2 {
		return T(0)
	}
	mut sum := T(0)
	for v in data {
		sum += (v - mean) * (v - mean)
	}
	return sum / T(data.len - 1)
}
2022-05-20 08:45:54 +03:00
// population_stddev calculates how spread out the dataset is
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
// Returns the square root of population_variance(data). The variance is
// converted to f64 for math.sqrt and the result back to T, exactly as the other
// stddev helpers in this module do. An empty array yields T(0).
[inline]
pub fn population_stddev[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	return T(math.sqrt(f64(population_variance[T](data))))
}
2022-05-20 08:45:54 +03:00
// population_stddev_mean calculates how spread out the dataset is, with the provided mean.
// It is the square root of population_variance_mean(data, mean); an empty array yields T(0).
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
[inline]
pub fn population_stddev_mean[T](data []T, mean T) T {
	if data.len == 0 {
		return T(0)
	}
	variance := population_variance_mean[T](data, mean)
	return T(math.sqrt(f64(variance)))
}
// sample_stddev is a Measure of Dispersion / Spread:
// the Sample Standard Deviation of the given input array, i.e. the square root
// of sample_variance(data). An empty array yields T(0).
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
[inline]
pub fn sample_stddev[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	variance := sample_variance[T](data)
	return T(math.sqrt(f64(variance)))
}
// sample_stddev_mean is a Measure of Dispersion / Spread:
// the Sample Standard Deviation of the given input array, around the provided mean
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
// Returns the square root of sample_variance_mean(data, mean);
// an empty array yields T(0).
[inline]
pub fn sample_stddev_mean[T](data []T, mean T) T {
	if data.len == 0 {
		return T(0)
	}
	variance := sample_variance_mean[T](data, mean)
	return T(math.sqrt(f64(variance)))
}
2022-05-20 08:45:54 +03:00
// absdev calculates the average distance between each data point and the mean.
// It computes the mean of `data` and delegates to absdev_mean; an empty array yields T(0).
// Based on
// https://en.wikipedia.org/wiki/Average_absolute_deviation
[inline]
pub fn absdev[T](data []T) T {
	if data.len > 0 {
		return absdev_mean[T](data, mean[T](data))
	}
	return T(0)
}
2022-05-20 08:45:54 +03:00
// absdev_mean calculates the average distance between each data point and the provided mean
2020-04-26 14:49:31 +03:00
// Based on
// https://en.wikipedia.org/wiki/Average_absolute_deviation
// Sums math.abs(v - mean) over all elements and divides by data.len;
// an empty array yields T(0).
pub fn absdev_mean[T](data []T, mean T) T {
	if data.len == 0 {
		return T(0)
	}
	mut total := T(0)
	for i in 0 .. data.len {
		total += math.abs(data[i] - mean)
	}
	return total / T(data.len)
}
2022-05-20 08:45:54 +03:00
// tss, Sum of squares, calculates the sum over all squared differences between values and overall mean.
// It computes the mean of `data` and delegates to tss_mean; an empty array yields T(0).
[inline]
pub fn tss[T](data []T) T {
	if data.len > 0 {
		return tss_mean[T](data, mean[T](data))
	}
	return T(0)
}
2022-05-20 08:45:54 +03:00
// tss_mean, Sum of squares, calculates the sum over all squared differences between values and the provided mean.
// An empty array yields T(0).
pub fn tss_mean[T](data []T, mean T) T {
	mut squared_sum := T(0)
	if data.len == 0 {
		return squared_sum
	}
	for i in 0 .. data.len {
		squared_sum += (data[i] - mean) * (data[i] - mean)
	}
	return squared_sum
}
2022-05-20 08:45:54 +03:00
// min returns the smallest element of the dataset, or T(0) when it is empty.
pub fn min[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	mut smallest := data[0]
	// data[0] is already the candidate, so scanning starts at index 1
	for i := 1; i < data.len; i++ {
		if data[i] < smallest {
			smallest = data[i]
		}
	}
	return smallest
}
2022-05-20 08:45:54 +03:00
// max returns the largest element of the dataset, or T(0) when it is empty.
pub fn max[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	mut largest := data[0]
	// data[0] is already the candidate, so scanning starts at index 1
	for i := 1; i < data.len; i++ {
		if data[i] > largest {
			largest = data[i]
		}
	}
	return largest
}
2022-05-20 08:45:54 +03:00
// minmax returns the minimum and the maximum value of the dataset, in that order,
// found in a single pass. An empty dataset yields (T(0), T(0)).
pub fn minmax[T](data []T) (T, T) {
	if data.len == 0 {
		return T(0), T(0)
	}
	mut lowest := data[0]
	mut highest := data[0]
	for i := 1; i < data.len; i++ {
		current := data[i]
		if current > highest {
			highest = current
		}
		if current < lowest {
			lowest = current
		}
	}
	return lowest, highest
}
2022-05-20 08:45:54 +03:00
// min_index returns the first index at which the minimum value occurs,
// or 0 when the dataset is empty.
pub fn min_index[T](data []T) int {
	if data.len == 0 {
		return 0
	}
	mut best := 0
	for i := 1; i < data.len; i++ {
		// strict comparison keeps the earliest index among equal minima
		if data[i] < data[best] {
			best = i
		}
	}
	return best
}
2022-05-20 08:45:54 +03:00
// max_index returns the first index at which the maximum value occurs,
// or 0 when the dataset is empty.
pub fn max_index[T](data []T) int {
	if data.len == 0 {
		return 0
	}
	mut best := 0
	for i := 1; i < data.len; i++ {
		// strict comparison keeps the earliest index among equal maxima
		if data[i] > data[best] {
			best = i
		}
	}
	return best
}
2022-05-20 08:45:54 +03:00
// minmax_index returns the first index of the minimum and the first index of the
// maximum value, in that order. An empty dataset yields (0, 0).
pub fn minmax_index[T](data []T) (int, int) {
	if data.len == 0 {
		return 0, 0
	}
	mut lowest_at := 0
	mut highest_at := 0
	for i := 1; i < data.len; i++ {
		// strict comparisons keep the earliest index among equal extremes
		if data[i] < data[lowest_at] {
			lowest_at = i
		}
		if data[i] > data[highest_at] {
			highest_at = i
		}
	}
	return lowest_at, highest_at
}
2022-05-20 08:45:54 +03:00
// range calculates the difference between the min and max
// Range ( Maximum - Minimum ) of the given input array
2020-04-26 14:49:31 +03:00
// Based on
// https://www.mathsisfun.com/data/range.html
// Obtains both extremes from minmax and returns their difference;
// an empty array yields T(0).
pub fn range[T](data []T) T {
	if data.len == 0 {
		return T(0)
	}
	lowest, highest := minmax[T](data)
	spread := highest - lowest
	return spread
}
2022-05-20 08:45:54 +03:00
// covariance calculates directional association between datasets.
// A positive value denotes variables that move in the same direction, a negative
// value denotes variables that move in opposite directions.
// It computes the mean of each dataset and delegates to covariance_mean.
[inline]
pub fn covariance[T](data1 []T, data2 []T) T {
	return covariance_mean[T](data1, data2, mean[T](data1), mean[T](data2))
}
2022-05-20 08:45:54 +03:00
// covariance_mean computes the covariance of two datasets with their means provided,
// using a recurrence relation that updates a running average of the products of
// deviations. Only the first min(data1.len, data2.len) pairs are used; when that
// count is 0 the result is T(0).
pub fn covariance_mean[T](data1 []T, data2 []T, mean1 T, mean2 T) T {
	n := if data1.len < data2.len { data1.len } else { data2.len }
	mut running := T(0)
	if n == 0 {
		return running
	}
	for i := 0; i < n; i++ {
		delta1 := data1[i] - mean1
		delta2 := data2[i] - mean2
		running += (delta1 * delta2 - running) / (T(i) + 1.0)
	}
	return running
}
2022-05-20 08:45:54 +03:00
// lag1_autocorrelation calculates the correlation between values that are one time period apart
// of a dataset. It computes the mean of `data` and delegates to lag1_autocorrelation_mean.
[inline]
pub fn lag1_autocorrelation[T](data []T) T {
	return lag1_autocorrelation_mean[T](data, mean[T](data))
}
2022-05-20 08:45:54 +03:00
// lag1_autocorrelation_mean calculates the correlation between values that are one time period apart
// of a dataset, around the provided mean, using a recurrence relation:
// `q` is the running average of products of consecutive deviations and `v` is the
// running average of squared deviations; the result is q / v.
// An empty array yields T(0). The final division is not guarded, so data whose
// deviations are all zero divides by zero.
pub fn lag1_autocorrelation_mean[T](data []T, mean T) T {
	if data.len == 0 {
		return T(0)
	}
	mut q := T(0)
	// Seed the running variance with the squared deviation of the first sample,
	// so that it takes part in the average like every later sample does.
	first_delta := data[0] - mean
	mut v := first_delta * first_delta
	for i := 1; i < data.len; i++ {
		delta0 := data[i - 1] - mean
		delta1 := data[i] - mean
		q += (delta0 * delta1 - q) / (T(i) + 1.0)
		v += (delta1 * delta1 - v) / (T(i) + 1.0)
	}
	return q / v
}
2022-05-20 08:45:54 +03:00
// kurtosis calculates the measure of the 'tailedness' of the data.
// It computes the mean and the population standard deviation of `data`
// and delegates to kurtosis_mean_stddev.
[inline]
pub fn kurtosis[T](data []T) T {
	center := mean[T](data)
	spread := population_stddev_mean[T](data, center)
	return kurtosis_mean_stddev[T](data, center, spread)
}
2022-05-20 08:45:54 +03:00
// kurtosis_mean_stddev calculates the measure of the 'tailedness' of the data
// using the fourth moment of the deviations, normalized by the sd, minus 3.
pub fn kurtosis_mean_stddev[T](data []T, mean T, sd T) T {
	// Running average of the fourth power of the normalized deviations.
	// A recurrence relation updates the average in place, so no large
	// intermediate sum is ever built up.
	mut fourth_moment := T(0)
	for i := 0; i < data.len; i++ {
		z := (data[i] - mean) / sd
		fourth_moment += (z * z * z * z - fourth_moment) / (T(i) + 1.0)
	}
	return fourth_moment - T(3.0)
}
2022-05-20 08:45:54 +03:00
// skew calculates the skewness of the data.
// It computes the mean and the population standard deviation of `data`
// and delegates to skew_mean_stddev.
[inline]
pub fn skew[T](data []T) T {
	center := mean[T](data)
	spread := population_stddev_mean[T](data, center)
	return skew_mean_stddev[T](data, center, spread)
}
2022-05-20 08:45:54 +03:00
// skew_mean_stddev calculates the skewness of data as the average of the cubed
// deviations from `mean`, each normalized by `sd`.
pub fn skew_mean_stddev[T](data []T, mean T, sd T) T {
	// Running average of the cubed normalized deviations.
	// A recurrence relation updates the average in place, so no large
	// intermediate sum is ever built up.
	mut third_moment := T(0)
	for i := 0; i < data.len; i++ {
		z := (data[i] - mean) / sd
		third_moment += (z * z * z - third_moment) / (T(i) + 1.0)
	}
	return third_moment
}
// quantile returns the value at fraction `f` of the given input array
// ( input array is assumed to be sorted ), interpolating linearly between the two
// neighbouring elements when `f * (len - 1)` is not a whole number.
// `f` is clamped to [0, 1], so out-of-range fractions return the first or the last
// element instead of indexing outside the array. An empty array yields T(0).
pub fn quantile[T](sorted_data []T, f T) T {
	if sorted_data.len == 0 {
		return T(0)
	}
	mut fraction := f
	if fraction < T(0) {
		fraction = T(0)
	} else if fraction > T(1) {
		fraction = T(1)
	}
	index := fraction * (T(sorted_data.len) - 1.0)
	lhs := int(index)
	delta := index - T(lhs)
	// the last element has no right-hand neighbour to interpolate with
	return if lhs == sorted_data.len - 1 {
		sorted_data[lhs]
	} else {
		(1.0 - delta) * sorted_data[lhs] + delta * sorted_data[(lhs + 1)]
	}
}