Commit fb8ae07c authored by Meenakshi Aiswarya's avatar Meenakshi Aiswarya

New upstream version 0.0~git20170218.a3153f7

parents
language: go
go:
- 1.x
- 1.6
- 1.7
- master
script:
- cd tests && make
This diff is collapsed.
Copyright (C) 2016 Felipe da Cunha Gonçalves
All Rights Reserved.
MIT LICENSE
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# String metrics
This library contains implementations of the Levenshtein distance, Jaro-Winkler, Soundex and Hamming algorithms written in Go (golang). Other algorithms related to string metrics (or string similarity, whatever) are welcome.
* master: [![Build Status](https://travis-ci.org/xrash/smetrics.svg?branch=master)](http://travis-ci.org/xrash/smetrics)
# Algorithms
## WagnerFischer
func WagnerFischer(a, b string, icost, dcost, scost int) int
The Wagner-Fischer algorithm for calculating the Levenshtein distance. It runs on O(mn) and needs O(2m) space where m is the size of the smallest string. This is kinda optimized so it should be used in most cases.
The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2.
#### Examples:
smetrics.WagnerFischer("POTATO", "POTATTO", 1, 1, 2)
>> 1, delete the second T on POTATTO
smetrics.WagnerFischer("MOUSE", "HOUSE", 2, 2, 4)
>> 4, substitute M for H
## Ukkonen
func Ukkonen(a, b string, icost, dcost, scost int) int
The Ukkonen algorithm for calculating the Levenshtein distance. The algorithm is described [here](http://www.cs.helsinki.fi/u/ukkonen/InfCont85.PDF). It runs on O(t . min(m, n)) where t is the actual distance between strings a and b, so this version should be preferred over the WagnerFischer one for **very** similar strings. In practice, it is slower most of the time. It needs O(min(t, m, n)) space.
The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2.
#### Examples:
smetrics.Ukkonen("POTATO", "POTATTO", 1, 1, 2)
>> 1, delete the second T on POTATTO
smetrics.Ukkonen("MOUSE", "HOUSE", 2, 2, 4)
>> 4, substitute M for H
## Jaro
func Jaro(a, b string) float64
The Jaro distance. It is not very accurate, therefore you should prefer the JaroWinkler optimized version.
#### Examples:
smetrics.Jaro("AL", "AL")
>> 1, equal strings
smetrics.Jaro("MARTHA", "MARHTA")
>> 0.9444444444444445, very likely a typo
smetrics.Jaro("JONES", "JOHNSON")
>> 0.7904761904761904
## JaroWinkler
func JaroWinkler(a, b string, boostThreshold float64, prefixSize int) float64
The JaroWinkler distance. JaroWinkler returns a number between 0 and 1 where 1 means perfectly equal and 0 means completely different. It is commonly used on Record Linkage stuff, thus it tries to be accurate for real names and common typos. You should consider it on data such as person names and street names.
JaroWinkler is a more accurate version of the Jaro algorithm. It works by boosting the score of exact matches at the beginning of the strings. By doing this, Winkler says that typos are less common to happen at the beginning. For this to happen, it introduces two more parameters: the boostThreshold and the prefixSize. These are commonly set to 0.7 and 4, respectively.
#### Examples:
smetrics.JaroWinkler("AL", "AL", 0.7, 4)
>> 1, equal strings
smetrics.JaroWinkler("MARTHA", "MARHTA", 0.7, 4)
>> 0.9611111111111111, very likely a typo
smetrics.JaroWinkler("JONES", "JOHNSON", 0.7, 4)
>> 0.8323809523809523
## Soundex
func Soundex(s string) string
The Soundex encoding. It is a phonetic algorithm that considers how the words sound in English. Soundex maps a name to a 4-byte string consisting of the first letter of the original string and three numbers. Strings that sound similar should map to the same thing.
#### Examples:
smetrics.Soundex("Euler")
>> E460
smetrics.Soundex("Ellery")
>> E460
smetrics.Soundex("Lloyd")
>> L300
smetrics.Soundex("Ladd")
>> L300
## Hamming
func Hamming(a, b string) (int, error)
The Hamming distance is simply the minimum number of substitutions required to change one string into the other. Both strings must have the same size, or the function returns an error.
#### Examples:
smetrics.Hamming("aaa", "aaa")
>> 0, nil
smetrics.Hamming("aaa", "aab")
>> 1, nil
smetrics.Hamming("aaaa", "a")
>> -1, error
# TODO
- Accept cost functions instead of constant values in every Levenshtein implementation.
- Make a better interface.
- Moar algos!
/*
# String metrics
This library contains implementations of the Levenshtein distance, Jaro-Winkler and Soundex algorithms written in Go (golang). Other algorithms related with string metrics (or string similarity, whatever) are welcome.
# Algorithms
## WagnerFischer
func WagnerFischer(a, b string, icost, dcost, scost int) int
The Wagner-Fischer algorithm for calculating the Levenshtein distance. It runs on O(mn) and the currently non-optimized version also needs O(mn) space. This version should be preferred over the Ukkonen one for short strings.
The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2.
#### Examples:
smetrics.WagnerFischer("POTATO", "POTATTO", 1, 1, 2)
>> 1, delete the second T on POTATTO
smetrics.WagnerFischer("MOUSE", "HOUSE", 2, 2, 4)
>> 4, substitute M for H
## Ukkonen
func Ukkonen(a, b string, icost, dcost, scost int) int
The Ukkonen algorithm for calculating the Levenshtein distance. The algorithm is described [here](http://www.cs.helsinki.fi/u/ukkonen/InfCont85.PDF). It runs on O(t . min(m, n)) where t is the actual distance between strings a and b. It needs O(min(t, m, n)) space. This version should be preferred over the WagnerFischer one for very similar strings.
The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2.
#### Examples:
smetrics.Ukkonen("POTATO", "POTATTO", 1, 1, 2)
>> 1, delete the second T on POTATTO
smetrics.Ukkonen("MOUSE", "HOUSE", 2, 2, 4)
>> 4, substitute M for H
## Jaro
func Jaro(a, b string) float64
The Jaro distance. It is not very accurate, therefore you should prefer the JaroWinkler optimized version.
#### Examples:
smetrics.Jaro("AL", "AL")
>> 1, equal strings
smetrics.Jaro("MARTHA", "MARHTA")
>> 0.9444444444444445, very likely a typo
smetrics.Jaro("JONES", "JOHNSON")
>> 0.7904761904761904
## JaroWinkler
func JaroWinkler(a, b string, boostThreshold float64, prefixSize int) float64
The JaroWinkler distance. JaroWinkler returns a number between 0 and 1 where 1 means perfectly equal and 0 means completely different. It is commonly used on Record Linkage stuff, thus it tries to be accurate for real names and common typos. You should consider it on data such as person names and street names.
JaroWinkler is a more accurate version of the Jaro algorithm. It works by boosting the score of exact matches at the beginning of the strings. By doing this, Winkler says that typos are less common to happen at the beginning. For this to happen, it introduces two more parameters: the boostThreshold and the prefixSize. These are commonly set to 0.7 and 4, respectively.
#### Examples:
smetrics.JaroWinkler("AL", "AL", 0.7, 4)
>> 1, equal strings
smetrics.JaroWinkler("MARTHA", "MARHTA", 0.7, 4)
>> 0.9611111111111111, very likely a typo
smetrics.JaroWinkler("JONES", "JOHNSON", 0.7, 4)
>> 0.8323809523809523
## Soundex
func Soundex(s string) string
The Soundex encoding. It is a phonetic algorithm that considers how the words sound in English. Soundex maps a name to a 4-byte string consisting of the first letter of the original string and three numbers. Strings that sound similar should map to the same thing.
#### Examples:
smetrics.Soundex("Euler")
>> E460
smetrics.Soundex("Ellery")
>> E460
smetrics.Soundex("Lloyd")
>> L300
smetrics.Soundex("Ladd")
>> L300
## Hamming
func Hamming(a, b string) (int, error)
The Hamming distance is simply the minimum number of substitutions required to change one string into the other. Both strings must have the same size, or the function returns an error.
#### Examples:
smetrics.Hamming("aaa", "aaa")
>> 0, nil
smetrics.Hamming("aaa", "aab")
>> 1, nil
smetrics.Hamming("aaaa", "a")
>> -1, error
# TODO
- Optimize WagnerFischer for memory; currently it stores the whole matrix and so it needs O(mn) space. Only the previous row of the matrix needs to be stored, so it can be easily optimized to use O(m) space.
- Accept cost functions instead of constant values in every Levenshtein implementation.
- Moar algos!
*/
package smetrics
package smetrics
import (
"fmt"
)
// Hamming returns the Hamming distance between a and b: the number of byte
// positions at which the two strings differ.
//
// Both strings must have the same length in bytes; otherwise Hamming returns
// -1 and a non-nil error. The comparison is byte-wise, so a multi-byte UTF-8
// character that differs in several of its bytes contributes more than one to
// the result.
func Hamming(a, b string) (int, error) {
	al := len(a)
	bl := len(b)

	if al != bl {
		return -1, fmt.Errorf("strings are not equal (len(a)=%d, len(b)=%d)", al, bl)
	}

	// Index every byte explicitly. Ranging over the string would decode runes
	// and visit only the first byte of each multi-byte character, silently
	// skipping differences located in the remaining bytes.
	difference := 0
	for i := 0; i < al; i++ {
		if a[i] != b[i] {
			difference++
		}
	}

	return difference, nil
}
package smetrics
import (
"math"
)
// JaroWinkler returns the Jaro score of a and b, boosted when the strings
// agree near their beginning.
//
// If Jaro(a, b) is not above boostThreshold, that score is returned as is.
// Otherwise the score j is raised by 0.1*n*(1-j), where n is the number of
// byte positions, within the first prefixSize bytes (capped at the length of
// the shorter string), at which a and b hold the same byte. The scan over
// that window does not stop at the first mismatch.
func JaroWinkler(a, b string, boostThreshold float64, prefixSize int) float64 {
	score := Jaro(a, b)
	if score <= boostThreshold {
		return score
	}

	// Clamp the inspected window so it never runs past either string.
	shortest := math.Min(float64(len(a)), float64(len(b)))
	window := int(math.Min(shortest, float64(prefixSize)))

	agree := 0.0
	for pos := 0; pos < window; pos++ {
		if a[pos] == b[pos] {
			agree += 1
		}
	}

	return score + 0.1*agree*(1.0-score)
}
package smetrics
import (
"math"
)
// Jaro returns the Jaro similarity of a and b as a value between 0 and 1,
// where 1 means the strings are equal and 0 means they share no matching
// bytes. Strings are compared byte by byte.
//
// Two bytes match when they are equal and their positions are no further
// apart than max(len(a), len(b))/2 - 1. With m matches and t transpositions
// (half the number of matched bytes that appear in a different order in the
// two strings, rounded down) the score is
//
//	(m/len(a) + m/len(b) + (m-t)/m) / 3
//
// Two empty strings are considered equal and score 1; an empty string against
// a non-empty one scores 0.
func Jaro(a, b string) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1
	}
	if len(a) == 0 || len(b) == 0 {
		return 0
	}

	la := float64(len(a))
	lb := float64(len(b))

	// match range = max(len(a), len(b)) / 2 - 1, clamped so that very short
	// strings still compare bytes at identical positions.
	matchRange := int(math.Max(0, math.Floor(math.Max(la, lb)/2.0)-1))

	matchedA := make([]bool, len(a))
	matchedB := make([]bool, len(b))
	var matches float64

	// Step 1: pair every byte of a with the first unused equal byte of b that
	// lies inside the match range.
	for i := 0; i < len(a); i++ {
		start := i - matchRange
		if start < 0 {
			start = 0
		}
		end := i + matchRange
		if end > len(b)-1 {
			end = len(b) - 1
		}

		for j := start; j <= end; j++ {
			if matchedB[j] || a[i] != b[j] {
				continue
			}
			matchedA[i] = true
			matchedB[j] = true
			matches++
			break
		}
	}

	if matches == 0 {
		return 0
	}

	// Step 2: walk the matched bytes of both strings in order. Every position
	// where the two sequences disagree is half a transposition. A match that
	// merely sits at a different index (e.g. after an insertion) is not.
	var halfs float64
	j := 0
	for i := 0; i < len(a); i++ {
		if !matchedA[i] {
			continue
		}
		// Both strings hold the same number of matched bytes, so j stays in
		// range here.
		for !matchedB[j] {
			j++
		}
		if a[i] != b[j] {
			halfs++
		}
		j++
	}

	transposes := math.Floor(halfs / 2)

	return ((matches / la) + (matches / lb) + (matches-transposes)/matches) / 3.0
}
package smetrics
import (
"strings"
)
// Soundex returns a four-byte phonetic code for s: its first byte
// (upper-cased) followed by up to three digits, padded on the right with '0'.
//
// After the first byte, each ASCII letter is looked up in the Soundex digit
// table (B,P,F,V=1; C,S,K,G,J,Q,X,Z=2; D,T=3; L=4; M,N=5; R=6). Letters that
// have no digit are dropped, as are bytes that are not ASCII letters and
// letters identical to the letter immediately before them. Repeats are
// detected by letter rather than by digit, so two different adjacent letters
// sharing a digit each contribute that digit.
//
// An empty s yields the empty string.
func Soundex(s string) string {
	// There is no first letter to anchor the code on; bail out instead of
	// panicking on s[0] below.
	if s == "" {
		return ""
	}

	s = strings.ToUpper(s)

	r := string(s[0])
	p := s[0]

	for i := 1; i < len(s) && len(r) < 4; i++ {
		c := s[i]

		if c < 'A' || c > 'Z' || c == p {
			continue
		}

		// Remember the letter even when it carries no digit, so that a vowel
		// between two equal consonants keeps both of them.
		p = c

		switch c {
		case 'B', 'P', 'F', 'V':
			r += "1"
		case 'C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z':
			r += "2"
		case 'D', 'T':
			r += "3"
		case 'L':
			r += "4"
		case 'M', 'N':
			r += "5"
		case 'R':
			r += "6"
		}
	}

	// Pad short codes up to four bytes.
	for len(r) < 4 {
		r += "0"
	}

	return r
}
.PHONY : test
test :
go test -v
.PHONY : gdb
gdb :
go test -c -s -N -l
gdb ./tests.test
package tests
import (
"fmt"
"github.com/xrash/smetrics"
"testing"
)
// TestHamming checks smetrics.Hamming on pairs of equal-length strings. Any
// error aborts the test; a wrong distance is printed and marks it as failed.
func TestHamming(t *testing.T) {
	table := []hammingcase{
		{"a", "a", 0},
		{"a", "b", 1},
		{"AAAA", "AABB", 2},
		{"BAAA", "AAAA", 1},
		{"BAAA", "CCCC", 4},
		{"karolin", "kathrin", 3},
		{"karolin", "kerstin", 3},
		{"1011101", "1001001", 2},
		{"2173896", "2233796", 3},
	}

	for i := range table {
		tc := table[i]

		got, err := smetrics.Hamming(tc.a, tc.b)
		if err != nil {
			t.Fatalf("got error from hamming err=%s", err)
		}

		if got == tc.diff {
			continue
		}
		fmt.Println(got, "instead of", tc.diff)
		t.Fail()
	}
}
// TestHammingError checks that smetrics.Hamming rejects strings of different
// lengths with a non-nil error and a distance of -1.
func TestHammingError(t *testing.T) {
	const (
		short = "a"
		long  = "bbb"
	)

	dist, err := smetrics.Hamming(short, long)

	if err == nil {
		t.Fatalf("expected error from 'a' and 'bbb' on hamming")
	}

	if dist != -1 {
		t.Fatalf("erroring response wasn't -1, but %d", dist)
	}
}
package tests
import (
"fmt"
"github.com/xrash/smetrics"
"testing"
)
// TestJaroWinkler checks smetrics.JaroWinkler against known scores, using a
// boost threshold of 0.7 and a prefix size of 4. A wrong score is printed and
// marks the test as failed.
func TestJaroWinkler(t *testing.T) {
	table := []jarocase{
		{"AL", "AL", 1.0},
		{"MARTHA", "MARHTA", 0.9611111111111111},
		{"JONES", "JOHNSON", 0.8323809523809523},
		{"ABCVWXYZ", "CABVWXYZ", 0.9625},
		{"A", "B", 0},
		{"ABCDEF", "123456", 0},
		{"AAAAAAAAABCCCC", "AAAAAAAAABCCCC", 1},
	}

	for i := range table {
		tc := table[i]

		got := smetrics.JaroWinkler(tc.s, tc.t, 0.7, 4)
		if got == tc.r {
			continue
		}
		fmt.Println(got, "instead of", tc.r)
		t.Fail()
	}
}
package tests
import (
"fmt"
"github.com/xrash/smetrics"
"testing"
)
// TestJaro checks smetrics.Jaro against known scores. A wrong score is
// printed and marks the test as failed.
func TestJaro(t *testing.T) {
	table := []jarocase{
		{"AL", "AL", 1.0},
		{"MARTHA", "MARHTA", 0.9444444444444445},
		{"JONES", "JOHNSON", 0.7904761904761904},
		{"ABCVWXYZ", "CABVWXYZ", 0.9583333333333334},
		{"A", "B", 0},
		{"ABCDEF", "123456", 0},
		{"AAAAAAAAABCCCC", "AAAAAAAAABCCCC", 1},
	}

	for i := range table {
		tc := table[i]

		got := smetrics.Jaro(tc.s, tc.t)
		if got == tc.r {
			continue
		}
		fmt.Println(got, "instead of", tc.r)
		t.Fail()
	}
}
package tests
import (
"fmt"
"github.com/xrash/smetrics"
"testing"
)
// TestSoundex checks smetrics.Soundex on pairs of names that are expected to
// share a code. A wrong code is printed and marks the test as failed.
func TestSoundex(t *testing.T) {
	table := []soundexcase{
		{"Euler", "E460"},
		{"Ellery", "E460"},
		{"Gauss", "G200"},
		{"Ghosh", "G200"},
		{"Hilbert", "H416"},
		{"Heilbrohn", "H416"},
		{"Knuth", "K530"},
		{"Kant", "K530"},
		{"Lloyd", "L300"},
		{"Ladd", "L300"},
		{"Lukasiewicz", "L222"},
		{"Lissjous", "L222"},
		{"Ravi", "R100"},
		{"Ravee", "R100"},
	}

	for i := range table {
		tc := table[i]

		got := smetrics.Soundex(tc.s)
		if got == tc.t {
			continue
		}
		fmt.Println(got, "instead of", tc.t)
		t.Fail()
	}
}
package tests
// jarocase is one table entry for the Jaro and JaroWinkler tests: the two
// input strings followed by the expected score.
type jarocase struct {
	s, t string
	r    float64
}
// levenshteincase is one table entry for the Levenshtein tests: the two input
// strings, the insertion, deletion and substitution costs, and the expected
// distance.
type levenshteincase struct {
	s, t                string
	icost, dcost, scost int
	r                   int
}
// soundexcase is one table entry for the Soundex test: the input string
// followed by the expected code.
type soundexcase struct {
	s, t string
}
// hammingcase is one table entry for the Hamming test: the two input strings
// followed by the expected distance.
type hammingcase struct {
	a, b string
	diff int
}
package tests
import (
"fmt"
"github.com/xrash/smetrics"
"testing"
)
// TestUkkonen checks smetrics.Ukkonen against known distances for several
// cost settings. A wrong distance is printed and marks the test as failed.
func TestUkkonen(t *testing.T) {
	table := []levenshteincase{
		{"RASH", "RASH", 1, 1, 2, 0},
		{"POTATO", "POTTATO", 1, 1, 2, 1},
		{"POTTATO", "POTATO", 1, 1, 2, 1},
		{"HOUSE", "MOUSE", 1, 1, 2, 2},
		{"MOUSE", "HOUSE", 2, 2, 4, 4},
		{"abc", "xy", 2, 3, 5, 13},
		{"xy", "abc", 2, 3, 5, 12},
	}

	for i := range table {
		tc := table[i]

		got := smetrics.Ukkonen(tc.s, tc.t, tc.icost, tc.dcost, tc.scost)
		if got == tc.r {
			continue
		}
		fmt.Println(got, "instead of", tc.r)
		t.Fail()
	}
}
package tests
import (
"fmt"
"github.com/xrash/smetrics"
"testing"
)
// TestWagnerFischer checks smetrics.WagnerFischer against known distances for
// several cost settings. A wrong distance is printed and marks the test as
// failed.
func TestWagnerFischer(t *testing.T) {
	table := []levenshteincase{
		{"RASH", "RASH", 1, 1, 2, 0},
		{"POTATO", "POTTATO", 1, 1, 2, 1},
		{"POTTATO", "POTATO", 1, 1, 2, 1},
		{"HOUSE", "MOUSE", 1, 1, 2, 2},
		{"MOUSE", "HOUSE", 2, 2, 4, 4},
		{"abc", "xy", 2, 3, 5, 13},
		{"xy", "abc", 2, 3, 5, 12},
	}

	for i := range table {
		tc := table[i]

		got := smetrics.WagnerFischer(tc.s, tc.t, tc.icost, tc.dcost, tc.scost)
		if got == tc.r {
			continue
		}
		fmt.Println(got, "instead of", tc.r)
		t.Fail()
	}
}
package smetrics
import (
"math"
)
// Ukkonen returns the weighted edit distance between a and b, where icost,
// dcost and scost are the costs of an insertion, a deletion and a
// substitution. Strings are compared byte by byte.
//
// Instead of filling the whole edit matrix, it evaluates only a diagonal band
// of cells, kept in a single reusable row, for a cost threshold t. When the
// value found exceeds t, the threshold is doubled and the band recomputed.
//
// NOTE(review): the smallest of the three costs is used as a divisor below,
// so a cost of 0 would panic with a division by zero; callers presumably pass
// positive costs. Confirm.
func Ukkonen(a, b string, icost, dcost, scost int) int {
// lowerCost ends up as the minimum of icost, dcost and scost.
var lowerCost int
if icost < dcost && icost < scost {
lowerCost = icost
} else if dcost < scost {
lowerCost = dcost
} else {
lowerCost = scost
}
// Sentinel for cells outside the band; halved so that adding a cost to it
// stays far away from the int32 limit.
infinite := math.MaxInt32 / 2
var r []int
var k, kprime, p, t int
var ins, del, sub int
// Initial threshold: one more than the length difference, scaled by the
// cheapest operation.
if len(a) > len(b) {
t = (len(a) - len(b) + 1) * lowerCost
} else {
t = (len(b) - len(a) + 1) * lowerCost
}
for {
// NOTE(review): t starts at (|len(a)-len(b)|+1)*lowerCost and only ever
// doubles, so for positive costs this condition looks unreachable. If it
// were ever true, the bare continue would loop forever because nothing
// changes t. Confirm.
if (t / lowerCost) < (len(b) - len(a)) {
continue
}
// The formula from the original Ukkonen paper is the commented-out
// floating-point expression below. The integer version that is actually used
// avoids floats and is therefore faster.
// p = int(math.Floor(0.5*((float64(t)/float64(lowerCost)) - float64(len(b) - len(a)))))
p = ((t / lowerCost) - (len(b) - len(a))) / 2
// k is the column offset of the band for the current row: slot j of r stands
// for column j+k of the edit matrix. kprime remembers the offset of row 0.
k = -p
kprime = k
rowlength := (len(b) - len(a)) + (2 * p)
// One extra slot on the right keeps the r[j+1] read below in range.
r = make([]int, rowlength+2)
for i := 0; i < rowlength+2; i++ {
r[i] = infinite
}
for i := 0; i <= len(a); i++ {
for j := 0; j <= rowlength; j++ {
if i == j+k && i == 0 {
// Origin cell (row 0, column 0).
r[j] = 0
} else {
// Because the band shifts right by one per row, r[j-1] was already written
// for this row, while r[j+1] and r[j] still hold the previous row's values.
if j-1 < 0 {
ins = infinite
} else {
ins = r[j-1] + icost
}
del = r[j+1] + dcost
sub = r[j] + scost
// Outside the matrix there is no diagonal predecessor; equal bytes carry the
// diagonal value over at no cost.
if i-1 < 0 || i-1 >= len(a) || j+k-1 >= len(b) || j+k-1 < 0 {
sub = infinite
} else if a[i-1] == b[j+k-1] {
sub = r[j]
}
// Keep the cheapest of the three candidates.
if ins < del && ins < sub {
r[j] = ins
} else if del < sub {
r[j] = del
} else {
r[j] = sub
}
}
}
// Shift the band one column to the right for the next row.
k++
}
// With kprime == -p the index below equals len(b)-len(a)+p. Accept the value
// when it is within the threshold, otherwise widen the band and retry.
if r[(len(b)-len(a))+(2*p)+kprime] <= t {
break
} else {
t *= 2
}
}
return r[(len(b)-len(a))+(2*p)+kprime]
}
package smetrics
func WagnerFischer(a, b string, icost, dcost, scost int) int {
// Allocate both rows.
row1 := make([]int, len(b)+1)
row2 := make([]int, len(b)+1)
var tmp []int
// Initialize the first row.
for i := 1; i <= len(b); i++ {
row1[i] = i * icost
}
// For each row...
for i := 1; i <= len(a); i++ {
row2[0] = i * dcost
// For each column...
for j := 1; j <= len(b); j++ {
if a[i-1] == b[j-1] {
row2[j] = row1[j-1]
} else {
ins := row2[j-1] + icost
del := row1[j] + dcost
sub := row1[j-1] + scost