Skip to content

Commit

Permalink
Merge pull request #172 from adamdecaf/improve-name-search
Browse files Browse the repository at this point in the history
fix: improve name search by using cleaned name
  • Loading branch information
adamdecaf authored May 25, 2022
2 parents 9bcce06 + af1b9a5 commit 3853662
Show file tree
Hide file tree
Showing 6 changed files with 174 additions and 64 deletions.
18 changes: 11 additions & 7 deletions ACHDictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,9 @@ func (f *ACHDictionary) readJSON(r io.Reader) error {
PhoneNumber: fmt.Sprintf("%s%s%s", ps[i].CustomerAreaCode, ps[i].CustomerPhonePrefix, ps[i].CustomerPhoneSuffix),
StatusCode: ps[i].InstitutionStatusCode,
ViewCode: ps[i].DataViewCode,

// Our Custom Fields
CleanName: Normalize(ps[i].CustomerName),
}
f.IndexACHRoutingNumber[ps[i].RoutingNumber] = p
f.ACHParticipants = append(f.ACHParticipants, p)
Expand Down Expand Up @@ -250,6 +253,9 @@ func (f *ACHDictionary) parseACHParticipant(line string) error {
// ViewCode (1): 1
p.ViewCode = line[149:150]

// Our custom fields
p.CleanName = Normalize(p.CustomerName)

f.ACHParticipants = append(f.ACHParticipants, p)
f.IndexACHRoutingNumber[p.RoutingNumber] = p
return nil
Expand Down Expand Up @@ -324,26 +330,24 @@ func (f *ACHDictionary) RoutingNumberSearch(s string, limit int) ([]*ACHParticip
})
}
}
return reduceResult(out, limit), nil
return reduceACHResults(out, limit), nil
}

// FinancialInstitutionSearch returns FEDACH participants based on an ACHParticipant.CustomerName
func (f *ACHDictionary) FinancialInstitutionSearch(s string, limit int) []*ACHParticipant {
s = strings.ToLower(s)

// out is the subset of ACHDictionary.ACHParticipants that match the search based on JaroWinkler similarity
// or Levenshtein similarity
out := make([]*achParticipantResult, 0)

for _, achP := range f.ACHParticipants {
// JaroWinkler is a more accurate version of the Jaro algorithm. It works by boosting the
// score of exact matches at the beginning of the strings. By doing this, Winkler says that
// typos are less common to happen at the beginning.
jaroScore := strcmp.JaroWinkler(strings.ToLower(achP.CustomerName), s)
jaroScore := strcmp.JaroWinkler(strings.ToLower(achP.CleanName), s)

// Levenshtein is the "edit distance" between two strings. This is the count of operations
// (insert, delete, replace) needed for two strings to be equal.
levenScore := strcmp.Levenshtein(strings.ToLower(achP.CustomerName), s)
levenScore := strcmp.Levenshtein(strings.ToLower(achP.CleanName), s)

if jaroScore > ACHJaroWinklerSimilarity || levenScore > ACHLevenshteinSimilarity {
out = append(out, &achParticipantResult{
Expand All @@ -353,7 +357,7 @@ func (f *ACHDictionary) FinancialInstitutionSearch(s string, limit int) []*ACHPa
}
}

return reduceResult(out, limit)
return reduceACHResults(out, limit)
}

// ACHParticipantStateFilter filters ACHParticipant by State.
Expand Down Expand Up @@ -441,7 +445,7 @@ func (f *ACHDictionary) PostalCodeFilter(s string) []*ACHParticipant {
return nsl
}

func reduceResult(in []*achParticipantResult, limit int) []*ACHParticipant {
func reduceACHResults(in []*achParticipantResult, limit int) []*ACHParticipant {
sort.SliceStable(in, func(i, j int) bool { return in[i].highestMatch > in[j].highestMatch })

out := make([]*ACHParticipant, 0)
Expand Down
49 changes: 49 additions & 0 deletions ACHDictionary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (
"testing"

"github.com/moov-io/base"

"github.com/stretchr/testify/require"
)

func loadTestACHFiles(t *testing.T) (*ACHDictionary, *ACHDictionary) {
Expand Down Expand Up @@ -322,6 +324,53 @@ func TestACHRoutingNumberNumeric(t *testing.T) {
}
}

// TestACHFinancialInstitutionSearch__Examples checks, for a handful of common
// institution name queries, that the single top-ranked search result carries
// the expected routing number and customer name.
func TestACHFinancialInstitutionSearch__Examples(t *testing.T) {
	_, plainDict := loadTestACHFiles(t)

	// example couples a search query with the participant expected to rank first.
	type example struct {
		input    string
		expected *ACHParticipant
	}

	examples := []example{
		{input: "Chase", expected: &ACHParticipant{RoutingNumber: "021000021", CustomerName: "JPMORGAN CHASE"}},
		{input: "Wells", expected: &ACHParticipant{RoutingNumber: "101205940", CustomerName: "WELLS BANK"}},
		{input: "Fargo", expected: &ACHParticipant{RoutingNumber: "291378392", CustomerName: "FARGO VA FEDERAL CU"}},
		{input: "Wells Fargo", expected: &ACHParticipant{RoutingNumber: "011100106", CustomerName: "WELLS FARGO BANK"}},
	}

	for _, ex := range examples {
		// The plain dictionary has 18k records, so search is more realistic.
		// A limit of 1 asks for only the best match.
		found := plainDict.FinancialInstitutionSearch(ex.input, 1)
		require.Len(t, found, 1)

		top := found[0]
		require.Equal(t, ex.expected.RoutingNumber, top.RoutingNumber)
		require.Equal(t, ex.expected.CustomerName, top.CustomerName)
	}
}

// TestACHFinancialInstitutionSearch tests search string `First Bank`
func TestACHFinancialInstitutionSearch(t *testing.T) {
jsonDict, plainDict := loadTestACHFiles(t)
Expand Down
96 changes: 57 additions & 39 deletions WIREDictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"encoding/json"
"io"
"io/ioutil"
"math"
"sort"
"strings"
"unicode/utf8"
Expand Down Expand Up @@ -145,6 +146,9 @@ func (f *WIREDictionary) readJSON(r io.Reader) error {
FundsSettlementOnlyStatus: ps[i].FundsSettlementOnlyStatus,
BookEntrySecuritiesTransferStatus: ps[i].SecuritiesEligibility,
Date: ps[i].ChangeDate,

// Our Custom Fields
CleanName: Normalize(ps[i].CustomerName),
}
f.WIREParticipants = append(f.WIREParticipants, p)
f.IndexWIRERoutingNumber[p.RoutingNumber] = p
Expand Down Expand Up @@ -200,6 +204,10 @@ func (f *WIREDictionary) parseWIREParticipant(line string) error {
p.BookEntrySecuritiesTransferStatus = line[92:93]
// Date YYYYMMDD (8): 122415
p.Date = line[93:101]

// Our custom fields
p.CleanName = Normalize(p.CustomerName)

f.WIREParticipants = append(f.WIREParticipants, p)
f.IndexWIRERoutingNumber[p.RoutingNumber] = p
return nil
Expand Down Expand Up @@ -232,7 +240,7 @@ func (f *WIREDictionary) FinancialInstitutionSearchSingle(s string) []*WIREParti
// RoutingNumberSearch returns FEDWIRE participants if WIREParticipant.RoutingNumber begins with prefix string s.
// The first 2 digits of the routing number are required.
// Based on https://www.frbservices.org/EPaymentsDirectory/search.html
func (f *WIREDictionary) RoutingNumberSearch(s string) ([]*WIREParticipant, error) {
func (f *WIREDictionary) RoutingNumberSearch(s string, limit int) ([]*WIREParticipant, error) {
s = strings.TrimSpace(s)

if utf8.RuneCountInString(s) < MinimumRoutingNumberDigits {
Expand All @@ -250,58 +258,52 @@ func (f *WIREDictionary) RoutingNumberSearch(s string) ([]*WIREParticipant, erro
f.errors.Add(ErrRoutingNumberNumeric)
return nil, f.errors
}
exactMatch := len(s) == 9

Participants := make([]*WIREParticipant, 0)

out := make([]*wireParticipantResult, 0)
for _, wireP := range f.WIREParticipants {
if strings.HasPrefix(wireP.RoutingNumber, s) {
Participants = append(Participants, wireP)
if exactMatch {
if wireP.RoutingNumber == s {
out = append(out, &wireParticipantResult{
WIREParticipant: wireP,
highestMatch: 1.0,
})
}
} else {
out = append(out, &wireParticipantResult{
WIREParticipant: wireP,
highestMatch: strcmp.JaroWinkler(wireP.RoutingNumber, s),
})
}
}

return Participants, nil
return reduceWIREResults(out, limit), nil
}

// FinancialInstitutionSearch returns a FEDWIRE participant based on a WIREParticipant.CustomerName
func (f *WIREDictionary) FinancialInstitutionSearch(s string) []*WIREParticipant {
func (f *WIREDictionary) FinancialInstitutionSearch(s string, limit int) []*WIREParticipant {
s = strings.ToLower(s)

// Participants is a subset of WIREDictionary.WIREParticipants that match the search based on JaroWinkler similarity
// and Levenshtein similarity
Participants := make([]*WIREParticipant, 0)
out := make([]*wireParticipantResult, 0)

// JaroWinkler is a more accurate version of the Jaro algorithm. It works by boosting the
// score of exact matches at the beginning of the strings. By doing this, Winkler says that
// typos are less common to happen at the beginning.
for _, wireP := range f.WIREParticipants {
if strcmp.JaroWinkler(strings.ToLower(wireP.CustomerName), s) > WIREJaroWinklerSimilarity {
Participants = append(Participants, wireP)
// JaroWinkler is a more accurate version of the Jaro algorithm. It works by boosting the
// score of exact matches at the beginning of the strings. By doing this, Winkler says that
// typos are less common to happen at the beginning.
jaroScore := strcmp.JaroWinkler(strings.ToLower(wireP.CleanName), s)

// Levenshtein is the "edit distance" between two strings. This is the count of operations
// (insert, delete, replace) needed for two strings to be equal.
levenScore := strcmp.Levenshtein(strings.ToLower(wireP.CleanName), s)

if jaroScore > ACHJaroWinklerSimilarity || levenScore > ACHLevenshteinSimilarity {
out = append(out, &wireParticipantResult{
WIREParticipant: wireP,
highestMatch: math.Max(jaroScore, levenScore),
})
}
}

// Levenshtein is the "edit distance" between two strings. This is the count of operations
// (insert, delete, replace) needed for two strings to be equal.
for _, wireP := range f.WIREParticipants {
if strcmp.Levenshtein(strings.ToLower(wireP.CustomerName), s) > WIRELevenshteinSimilarity {

// Only append if the not included in the Participant sub-set
if len(Participants) != 0 {
for _, p := range Participants {
if p.CustomerName == wireP.CustomerName && p.RoutingNumber == wireP.RoutingNumber {
break
}
}
Participants = append(Participants, wireP)

} else {
Participants = append(Participants, wireP)
}
}
}
// Sort the result
sort.SliceStable(Participants, func(i, j int) bool { return Participants[i].CustomerName < Participants[j].CustomerName })

return Participants
return reduceWIREResults(out, limit)
}

// WIREParticipantRoutingNumberFilter filters WIREParticipant by Routing Number
Expand Down Expand Up @@ -365,3 +367,19 @@ func (f *WIREDictionary) CityFilter(s string) []*WIREParticipant {
}
return nsl
}

// wireParticipantResult pairs a WIREParticipant with the score used to rank it
// among search results.
type wireParticipantResult struct {
// The matched participant, embedded so its fields are promoted.
*WIREParticipant

// highestMatch is the ranking score for this match; reduceWIREResults sorts
// results on it in descending order.
highestMatch float64
}

// reduceWIREResults orders the scored matches from highest to lowest
// highestMatch and returns the participants of at most limit of them.
//
// The input slice is sorted in place. The sort is stable, so matches with equal
// scores keep their incoming relative order. A limit of zero or less yields an
// empty, non-nil slice.
func reduceWIREResults(in []*wireParticipantResult, limit int) []*WIREParticipant {
	byScoreDesc := func(a, b int) bool {
		return in[a].highestMatch > in[b].highestMatch
	}
	sort.SliceStable(in, byScoreDesc)

	top := make([]*WIREParticipant, 0)
	for idx, match := range in {
		// Stop once the caller's limit is reached (immediately when limit <= 0).
		if idx >= limit {
			break
		}
		top = append(top, match.WIREParticipant)
	}
	return top
}
Loading

0 comments on commit 3853662

Please sign in to comment.