diff --git a/.gitignore b/.gitignore index 0d20b648..c169d172 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ *.pyc +env +dist diff --git a/LICENSE.txt b/LICENSE similarity index 100% rename from LICENSE.txt rename to LICENSE diff --git a/MANIFEST b/MANIFEST index e3821fc5..11a90aad 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1,5 +1,6 @@ # file GENERATED by distutils, do NOT edit -README.txt +LICENSE +README.textile setup.py fuzzywuzzy/__init__.py fuzzywuzzy/benchmarks.py diff --git a/README.textile b/README.textile index 05b4d98c..aba5b2ca 100644 --- a/README.textile +++ b/README.textile @@ -1,3 +1,5 @@ +!https://​pullstat.us/seatgeek/fuzzywuzzy/pull/5(Pull Request #5)!:https://github.com/seatgeek/fuzzywuzzy/pull/5 - Speed improvements + h1. FuzzyWuzzy Fuzzy string matching like a boss. diff --git a/fuzzywuzzy/benchmarks.py b/benchmarks.py similarity index 74% rename from fuzzywuzzy/benchmarks.py rename to benchmarks.py index da7ae91f..2fd48b4a 100644 --- a/fuzzywuzzy/benchmarks.py +++ b/benchmarks.py @@ -1,9 +1,9 @@ # -*- coding: utf8 -*- from timeit import timeit -import utils +from fuzzywuzzy import utils -iterations=100000*10 +iterations=100000 cirque_strings = [ "cirque du soleil - zarkana - las vegas", @@ -36,36 +36,36 @@ for s in choices: print 'Test for string: "%s"' % s - # print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "import utils",number=iterations),4) - print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "import utils",number=iterations),4) + # print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4) + print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4) print for s in mixed_strings: print 'Test for string: "%s"' % s - #print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "import utils",number=iterations),4) - print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "import utils",number=iterations),4) + #print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4) + print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4) print for s in mixed_strings+cirque_strings+choices: print 'Test for string: "%s"' % s - #print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "import utils",number=iterations),4) - print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "import utils",number=iterations),4) + #print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4) + print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4) ### benchmarking the core matching methods... for s in cirque_strings: print 'Test fuzz.ratio for string: "%s"' % s print '-------------------------------' - print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4) + print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4) for s in cirque_strings: print 'Test fuzz.partial_ratio for string: "%s"' % s print '-------------------------------' - print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4) + print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4) for s in cirque_strings: print 'Test fuzz.WRatio for string: "%s"' % s print '-------------------------------' - print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "import fuzz",number=iterations/100),4) + print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4) diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py index b537c313..43cc1ed6 100644 --- a/fuzzywuzzy/fuzz.py +++ b/fuzzywuzzy/fuzz.py @@ -28,10 +28,9 @@ import sys import os import re -import utils +from utils import * try: - import Levenshtein from StringMatcher import StringMatcher as SequenceMatcher except: from difflib import SequenceMatcher @@ -48,7 +47,7 @@ def ratio(s1, s2): if s2 is None: raise TypeError("s2 is None") m = SequenceMatcher(None, s1, s2) - return int(100 * m.ratio()) + return intr(100 * m.ratio()) # todo: skip duplicate indexes for a little more speed def partial_ratio(s1, s2): @@ -178,20 +177,20 @@ def partial_token_set_ratio(s1, s2): # q is for quick def QRatio(s1, s2): - if not utils.validate_string(s1): return 0 - if not utils.validate_string(s2): return 0 + if not validate_string(s1): return 0 + if not validate_string(s2): return 0 - p1 = utils.full_process(s1) - p2 = utils.full_process(s2) + p1 = full_process(s1) + p2 = full_process(s2) return ratio(p1, p2) # w is for weighted def WRatio(s1, s2): - p1 = utils.full_process(s1) - p2 = utils.full_process(s2) - if not utils.validate_string(p1): return 0 - if not utils.validate_string(p2): return 0 + p1 = full_process(s1) + p2 = full_process(s2) + if not validate_string(p1): return 0 + if not validate_string(p2): return 0 # should we look at partials? try_partial = True diff --git a/fuzzywuzzy/utils.py b/fuzzywuzzy/utils.py index b07a2845..67bd6bef 100644 --- a/fuzzywuzzy/utils.py +++ b/fuzzywuzzy/utils.py @@ -33,6 +33,8 @@ def full_process(s): s = asciidammit(s) return s.translate(trans_table, bad_chars).strip() - +def intr(n): + '''Returns a correctly rounded integer''' + return int(round(n)) diff --git a/fuzzywuzzy/tests.py b/tests.py similarity index 83% rename from fuzzywuzzy/tests.py rename to tests.py index 5466d278..79f880e2 100644 --- a/fuzzywuzzy/tests.py +++ b/tests.py @@ -1,8 +1,8 @@ # -*- coding: utf8 -*- -from fuzz import * -import process -import utils +from fuzzywuzzy import fuzz +from fuzzywuzzy import process +from fuzzywuzzy import utils import itertools import unittest @@ -75,56 +75,56 @@ def tearDown(self): pass def testEqual(self): - self.assertEqual(ratio(self.s1, self.s1a),100) + self.assertEqual(fuzz.ratio(self.s1, self.s1a),100) def testCaseInsensitive(self): - self.assertNotEqual(ratio(self.s1, self.s2),100) - self.assertEqual(ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100) + self.assertNotEqual(fuzz.ratio(self.s1, self.s2),100) + self.assertEqual(fuzz.ratio(utils.full_process(self.s1), utils.full_process(self.s2)),100) def testPartialRatio(self): - self.assertEqual(partial_ratio(self.s1, self.s3),100) + self.assertEqual(fuzz.partial_ratio(self.s1, self.s3),100) def testTokenSortRatio(self): - self.assertEqual(token_sort_ratio(self.s1, self.s1a),100) + self.assertEqual(fuzz.token_sort_ratio(self.s1, self.s1a),100) def testPartialTokenSortRatio(self): - self.assertEqual(partial_token_sort_ratio(self.s1, self.s1a),100) - self.assertEqual(partial_token_sort_ratio(self.s4, self.s5),100) + self.assertEqual(fuzz.partial_token_sort_ratio(self.s1, self.s1a),100) + self.assertEqual(fuzz.partial_token_sort_ratio(self.s4, self.s5),100) def testTokenSetRatio(self): - self.assertEqual(token_set_ratio(self.s4, self.s5),100) + self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100) def testPartialTokenSetRatio(self): - self.assertEqual(partial_token_set_ratio(self.s4, self.s5),100) + self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100) def testQuickRatioEqual(self): - self.assertEqual(QRatio(self.s1, self.s1a), 100) + self.assertEqual(fuzz.QRatio(self.s1, self.s1a), 100) def testQuickRatioCaseInsensitive(self): - self.assertEqual(QRatio(self.s1, self.s2), 100) + self.assertEqual(fuzz.QRatio(self.s1, self.s2), 100) def testQuickRatioNotEqual(self): - self.assertNotEqual(QRatio(self.s1, self.s3), 100) + self.assertNotEqual(fuzz.QRatio(self.s1, self.s3), 100) def testWRatioEqual(self): - self.assertEqual(WRatio(self.s1, self.s1a), 100) + self.assertEqual(fuzz.WRatio(self.s1, self.s1a), 100) def testWRatioCaseInsensitive(self): - self.assertEqual(WRatio(self.s1, self.s2), 100) + self.assertEqual(fuzz.WRatio(self.s1, self.s2), 100) def testWRatioPartialMatch(self): # a partial match is scaled by .9 - self.assertEqual(WRatio(self.s1, self.s3), 90) + self.assertEqual(fuzz.WRatio(self.s1, self.s3), 90) def testWRatioMisorderedMatch(self): # misordered full matches are scaled by .95 - self.assertEqual(WRatio(self.s4, self.s5), 95) + self.assertEqual(fuzz.WRatio(self.s4, self.s5), 95) def testWRatioUnicode(self): - self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100) + self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100) def testQRatioUnicode(self): - self.assertEqual(WRatio(unicode(self.s1), unicode(self.s1a)), 100) + self.assertEqual(fuzz.WRatio(unicode(self.s1), unicode(self.s1a)), 100) def testIssueSeven(self): s1 = "HSINCHUANG" @@ -132,20 +132,20 @@ def testIssueSeven(self): s3 = "LSINJHUANG DISTRIC" s4 = "SINJHUANG DISTRICT" - self.assertGreater(partial_ratio(s1, s2), 75) - self.assertGreater(partial_ratio(s1, s3), 75) - self.assertGreater(partial_ratio(s1, s4), 75) + self.assertTrue(fuzz.partial_ratio(s1, s2) > 75) + self.assertTrue(fuzz.partial_ratio(s1, s3) > 75) + self.assertTrue(fuzz.partial_ratio(s1, s4) > 75) def testWRatioUnicodeString(self): s1 = u"\u00C1" s2 = "ABCD" - score = WRatio(s1, s2) + score = fuzz.WRatio(s1, s2) self.assertEqual(0, score) def testQRatioUnicodeString(self): s1 = u"\u00C1" s2 = "ABCD" - score = QRatio(s1, s2) + score = fuzz.QRatio(s1, s2) self.assertEqual(0, score) # test processing methods @@ -218,7 +218,7 @@ def testWithScorer(self): # in this hypothetical example we care about ordering, so we use quick ratio query = "new york mets at chicago cubs" - scorer = QRatio + scorer = fuzz.QRatio # first, as an example, the normal way would select the "more 'complete' match of choices[1]"