Skip to content
This repository has been archived by the owner on Aug 26, 2024. It is now read-only.

Issue 10 and refactoring #11

Merged
merged 8 commits into from
Jul 9, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
*.pyc
env
dist
File renamed without changes.
3 changes: 2 additions & 1 deletion MANIFEST
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# file GENERATED by distutils, do NOT edit
README.txt
LICENSE
README.textile
setup.py
fuzzywuzzy/__init__.py
fuzzywuzzy/benchmarks.py
Expand Down
2 changes: 2 additions & 0 deletions README.textile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
!https://pullstat.us/seatgeek/fuzzywuzzy/pull/5(Pull Request #5)!:https://github.com/seatgeek/fuzzywuzzy/pull/5 - Speed improvements

h1. FuzzyWuzzy

Fuzzy string matching like a boss.
Expand Down
22 changes: 11 additions & 11 deletions fuzzywuzzy/benchmarks.py → benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# -*- coding: utf8 -*-

from timeit import timeit
import utils
from fuzzywuzzy import utils

iterations=100000*10
iterations=100000

cirque_strings = [
"cirque du soleil - zarkana - las vegas",
Expand Down Expand Up @@ -36,36 +36,36 @@

for s in choices:
print 'Test for string: "%s"' % s
# print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "import utils",number=iterations),4)
# print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)

print

for s in mixed_strings:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "import utils",number=iterations),4)
#print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)

print

for s in mixed_strings+cirque_strings+choices:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "import utils",number=iterations),4)
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)

### benchmarking the core matching methods...

for s in cirque_strings:
print 'Test fuzz.ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)

for s in cirque_strings:
print 'Test fuzz.partial_ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)

for s in cirque_strings:
print 'Test fuzz.WRatio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4)
print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
21 changes: 10 additions & 11 deletions fuzzywuzzy/fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,9 @@
import sys
import os
import re
import utils
from utils import *

try:
import Levenshtein
from StringMatcher import StringMatcher as SequenceMatcher
except:
from difflib import SequenceMatcher
Expand All @@ -48,7 +47,7 @@ def ratio(s1, s2):
if s2 is None: raise TypeError("s2 is None")

m = SequenceMatcher(None, s1, s2)
return int(100 * m.ratio())
return intr(100 * m.ratio())

# todo: skip duplicate indexes for a little more speed
def partial_ratio(s1, s2):
Expand Down Expand Up @@ -178,20 +177,20 @@ def partial_token_set_ratio(s1, s2):

# q is for quick
def QRatio(s1, s2):
if not utils.validate_string(s1): return 0
if not utils.validate_string(s2): return 0
if not validate_string(s1): return 0
if not validate_string(s2): return 0

p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
p1 = full_process(s1)
p2 = full_process(s2)

return ratio(p1, p2)

# w is for weighted
def WRatio(s1, s2):
p1 = utils.full_process(s1)
p2 = utils.full_process(s2)
if not utils.validate_string(p1): return 0
if not utils.validate_string(p2): return 0
p1 = full_process(s1)
p2 = full_process(s2)
if not validate_string(p1): return 0
if not validate_string(p2): return 0

# should we look at partials?
try_partial = True
Expand Down
4 changes: 3 additions & 1 deletion fuzzywuzzy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def full_process(s):
s = asciidammit(s)
return s.translate(trans_table, bad_chars).strip()


def intr(n):
    '''Return n rounded to the nearest whole number, as an int.

    Unlike a bare int(n), which truncates toward zero, the value is passed
    through the built-in round() first, so e.g. 99.6 becomes 100 rather
    than 99.  How exact .5 ties are resolved is whatever the running
    interpreter's round() does.
    '''
    return int(round(n))


54 changes: 27 additions & 27 deletions fuzzywuzzy/tests.py → tests.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# -*- coding: utf8 -*-

from fuzz import *
import process
import utils
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy import utils

import itertools
import unittest
Expand Down Expand Up @@ -75,77 +75,77 @@ def tearDown(self):
pass

def testEqual(self):
self.assertEqual(ratio(self.s1, self.s1a),100)
self.assertEqual(fuzz.ratio(self.s1, self.s1a),100)

def testCaseInsensitive(self):
self.assertNotEqual(ratio(self.s1, self.s2),100)
self.assertEqual(ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)
self.assertNotEqual(fuzz.ratio(self.s1, self.s2),100)
self.assertEqual(fuzz.ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100)

def testPartialRatio(self):
self.assertEqual(partial_ratio(self.s1, self.s3),100)
self.assertEqual(fuzz.partial_ratio(self.s1, self.s3),100)

def testTokenSortRatio(self):
self.assertEqual(token_sort_ratio(self.s1, self.s1a),100)
self.assertEqual(fuzz.token_sort_ratio(self.s1, self.s1a),100)

def testPartialTokenSortRatio(self):
self.assertEqual(partial_token_sort_ratio(self.s1, self.s1a),100)
self.assertEqual(partial_token_sort_ratio(self.s4, self.s5),100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a),100)
self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5),100)

def testTokenSetRatio(self):
self.assertEqual(token_set_ratio(self.s4, self.s5),100)
self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)

def testPartialTokenSetRatio(self):
    # Exercise the *partial* variant here; the plain token_set_ratio is
    # already asserted on the same inputs by testTokenSetRatio.
    self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s5), 100)

def testQuickRatioEqual(self):
self.assertEqual(QRatio(self.s1, self.s1a), 100)
self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100)

def testQuickRatioCaseInsensitive(self):
self.assertEqual(QRatio(self.s1, self.s2), 100)
self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100)

def testQuickRatioNotEqual(self):
self.assertNotEqual(QRatio(self.s1, self.s3), 100)
self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100)

def testWRatioEqual(self):
self.assertEqual(WRatio(self.s1, self.s1a), 100)
self.assertEqual(fuzz.WRatio(self.s1, self.s1a), 100)

def testWRatioCaseInsensitive(self):
self.assertEqual(WRatio(self.s1, self.s2), 100)
self.assertEqual(fuzz.WRatio(self.s1, self.s2), 100)

def testWRatioPartialMatch(self):
# a partial match is scaled by .9
self.assertEqual(WRatio(self.s1, self.s3), 90)
self.assertEqual(fuzz.WRatio(self.s1, self.s3), 90)

def testWRatioMisorderedMatch(self):
# misordered full matches are scaled by .95
self.assertEqual(WRatio(self.s4, self.s5), 95)
self.assertEqual(fuzz.WRatio(self.s4, self.s5), 95)

def testWRatioUnicode(self):
self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100)
self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100)

def testQRatioUnicode(self):
    # Check QRatio (not WRatio, which testWRatioUnicode covers) when both
    # inputs are unicode objects.
    self.assertEqual(fuzz.QRatio(unicode(self.s1), unicode(self.s1a)), 100)

def testIssueSeven(self):
s1 = "HSINCHUANG"
s2 = "SINJHUAN"
s3 = "LSINJHUANG DISTRIC"
s4 = "SINJHUANG DISTRICT"

self.assertGreater(partial_ratio(s1, s2), 75)
self.assertGreater(partial_ratio(s1, s3), 75)
self.assertGreater(partial_ratio(s1, s4), 75)
self.assertTrue(fuzz.partial_ratio(s1, s2) > 75)
self.assertTrue(fuzz.partial_ratio(s1, s3) > 75)
self.assertTrue(fuzz.partial_ratio(s1, s4) > 75)

def testWRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
score = WRatio(s1, s2)
score = fuzz.WRatio(s1, s2)
self.assertEqual(0, score)

def testQRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
score = QRatio(s1, s2)
score = fuzz.QRatio(s1, s2)
self.assertEqual(0, score)

# test processing methods
Expand Down Expand Up @@ -218,7 +218,7 @@ def testWithScorer(self):

# in this hypothetical example we care about ordering, so we use quick ratio
query = "new york mets at chicago cubs"
scorer = QRatio
scorer = fuzz.QRatio

# first, as an example, the normal way would select the "more 'complete' match of choices[1]"

Expand Down