1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00
v/vlib/strings/similarity.v
2019-09-13 16:15:30 +03:00

60 lines
1.6 KiB
V
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

module strings
// levenshtein_distance uses the Levenshtein distance algorithm to calculate
// the distance between two strings (lower is closer).
// The distance is the minimum number of single-element insertions, deletions
// and substitutions needed to turn `a` into `b`. Strings are compared element
// by element as yielded by `for c in s` (bytes in V), so a multi-byte UTF-8
// character counts as several elements.
pub fn levenshtein_distance(a, b string) int {
	// f[j] is the distance between the already processed prefix of `a`
	// and the first j elements of `b`; only one row is kept in memory.
	mut f := [int(0); b.len+1]
	// Base row: turning the empty prefix of `a` into the first j elements
	// of `b` takes exactly j insertions.
	for j := 0; j < f.len; j++ {
		f[j] = j
	}
	for ca in a {
		mut j := 1
		// fj1 remembers the previous row's f[j-1] (the diagonal cell),
		// which is about to be overwritten.
		mut fj1 := f[0]
		// Turning one more element of `a` into the empty string costs one more deletion.
		f[0]++
		for cb in b {
			// cheapest of deletion (f[j]+1) and insertion (f[j-1]+1)
			mut mn := if f[j]+1 <= f[j-1]+1 { f[j]+1 } else { f[j-1]+1 }
			if cb != ca {
				// substitution
				mn = if mn <= fj1+1 { mn } else { fj1+1 }
			} else {
				// elements match: the diagonal can be taken at no cost
				mn = if mn <= fj1 { mn } else { fj1 }
			}
			// save the old f[j] as the next diagonal before overwriting it
			fj1 = f[j]
			f[j] = mn
			j++
		}
	}
	return f[f.len-1]
}
// levenshtein_distance_percentage uses the Levenshtein distance algorithm to
// calculate how similar two strings are as a percentage (higher is closer).
// The distance is divided by the length of the longer string. Two empty
// strings are reported as 100.0, since they are identical.
pub fn levenshtein_distance_percentage(a, b string) f32 {
	// length of the longer string: the largest value the distance can take
	l := if a.len >= b.len { a.len } else { b.len }
	// Both strings are empty: dividing by l would be 0/0 and yield NaN.
	if l == 0 { return 100.0 }
	d := levenshtein_distance(a, b)
	return (1.00 - f32(d)/f32(l)) * 100.00
}
// dice_coefficient is an implementation of the Sørensen-Dice coefficient.
// It measures the similarity between two strings by comparing their bigrams
// (every pair of adjacent bytes).
// Returns an f32 between 0.0 (not similar) and 1.0 (exact match).
// An empty string, or a string shorter than 2 bytes that is not equal to the
// other one, always yields 0.0.
pub fn dice_coefficient(s1, s2 string) f32 {
	if s1.len == 0 || s2.len == 0 { return 0.0 }
	if s1 == s2 { return 1.0 }
	if s1.len < 2 || s2.len < 2 { return 0.0 }
	// Count how many times each bigram occurs in s1.
	mut first_bigrams := map[string]int
	for i := 0; i < s1.len-1; i++ {
		// The key is the two-byte substring itself, so that different
		// bigrams (e.g. "ab" and "ba") never share a map entry.
		bigram := s1.substr(i, i+2)
		first_bigrams[bigram] = if bigram in first_bigrams { first_bigrams[bigram]+1 } else { 1 }
	}
	mut intersection_size := 0
	for i := 0; i < s2.len-1; i++ {
		bigram := s2.substr(i, i+2)
		count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
		if count > 0 {
			// Consume the matched occurrence: a bigram of s1 may be paired
			// with only one bigram of s2, which keeps the result <= 1.0.
			first_bigrams[bigram] = count - 1
			intersection_size++
		}
	}
	// 2 * |shared bigrams| / (|bigrams of s1| + |bigrams of s2|);
	// a string of length n has n-1 bigrams, hence the "- 2".
	return (2.0 * f32(intersection_size)) / (f32(s1.len) + f32(s2.len) - 2)
}