Skip to content
This repository has been archived by the owner on Aug 26, 2024. It is now read-only.

Uniform treatment of strings in Unicode. #20

Merged
merged 13 commits into from
May 3, 2013
Merged
47 changes: 29 additions & 18 deletions benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

from timeit import timeit
from fuzzywuzzy import utils
import math

iterations=100000
iterations = 100000

cirque_strings = [
"cirque du soleil - zarkana - las vegas",
Expand All @@ -27,45 +28,55 @@
mixed_strings = [
"Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
"C\\'est la vie",
"Ça va?",
"Cães danados",
u"Ça va?",
u"Cães danados",
u"\xacCamarões assados",
u"a\xac\u1234\u20ac\U00008000"
]

common_setup = "from fuzzywuzzy import fuzz, utils; "
basic_setup = "from fuzzywuzzy.string_processing import StringProcessor;"

for s in choices:
print 'Test for string: "%s"' % s
# print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
    """
    Time a statement with ``timeit`` and print the total and average duration.

    The average run time is scaled by powers of 1000 and printed in
    seconds, milliseconds, microseconds or nanoseconds.

    stmt   -- statement to time, passed straight to ``timeit``.
    setup  -- setup code, passed straight to ``timeit``.
    number -- how many times ``stmt`` is executed; also the divisor used
              for the average.
    """
    units = ["s", "ms", "us", "ns"]
    duration = timeit(stmt, setup, number=number)
    avg_duration = duration / float(number)

    # Number of factor-1000 steps between seconds and the display unit.
    # log() is undefined for 0, so a zero measurement is reported in seconds.
    if avg_duration > 0:
        steps = -int(math.floor(math.log(avg_duration, 1000)))
    else:
        steps = 0
    # Clamp to the units available: an average of 1000s or more used to give
    # a negative index that wrapped around to "ns", and an average below 1ns
    # indexed past the end of the list.
    steps = max(0, min(steps, len(units) - 1))

    # A parenthesised single argument prints the same whether ``print`` is a
    # statement or a function.
    print("")
    print("Total time: %fs. Average run: %.3f%s."
          % (duration, avg_duration * (1000 ** steps), units[steps]))

for s in mixed_strings:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
for s in choices:
print 'Test validate_string for: "%s"' % s
print_result_from_timeit('utils.validate_string(\'%s\')' \
% s, common_setup, number=iterations)

print

for s in mixed_strings+cirque_strings+choices:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'Test full_process for: "%s"' % s
print_result_from_timeit('utils.full_process(u\'%s\')' \
% s, common_setup + basic_setup, number=iterations)

### benchmarking the core matching methods...

for s in cirque_strings:
print 'Test fuzz.ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
print_result_from_timeit('fuzz.ratio(u\'cirque du soleil\', u\'%s\')' \
% s, common_setup + basic_setup, number=iterations/100)

for s in cirque_strings:
print 'Test fuzz.partial_ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
print_result_from_timeit('fuzz.partial_ratio(u\'cirque du soleil\', u\'%s\')' \
% s, common_setup + basic_setup, number=iterations/100)

for s in cirque_strings:
print 'Test fuzz.WRatio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
print_result_from_timeit('fuzz.WRatio(u\'cirque du soleil\', u\'%s\')' \
% s, common_setup + basic_setup, number=iterations/100)
25 changes: 14 additions & 11 deletions fuzzywuzzy/fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

import sys
import os
import re
from utils import *

Expand All @@ -35,8 +33,6 @@
except:
from difflib import SequenceMatcher

REG_TOKEN = re.compile("[\w\d]+")

###########################
# Basic Scoring Functions #
###########################
Expand Down Expand Up @@ -96,8 +92,8 @@ def _token_sort(s1, s2, partial=True):
if s2 is None: raise TypeError("s2 is None")

# pull tokens
tokens1 = REG_TOKEN.findall(s1)
tokens2 = REG_TOKEN.findall(s2)
tokens1 = full_process(s1).split()
tokens2 = full_process(s2).split()

# sort tokens and join
sorted1 = u" ".join(sorted(tokens1))
Expand Down Expand Up @@ -128,11 +124,15 @@ def _token_set(s1, s2, partial=True):
if s1 is None: raise TypeError("s1 is None")
if s2 is None: raise TypeError("s2 is None")

if not (validate_string(s1) and validate_string(s2)): return 0
p1 = full_process(s1)
p2 = full_process(s2)

if not validate_string(p1): return 0
if not validate_string(p2): return 0

# pull tokens
tokens1 = set(REG_TOKEN.findall(s1))
tokens2 = set(REG_TOKEN.findall(s2))
tokens1 = set(full_process(p1).split())
tokens2 = set(full_process(p2).split())

intersection = tokens1.intersection(tokens2)
diff1to2 = tokens1.difference(tokens2)
Expand Down Expand Up @@ -179,18 +179,21 @@ def partial_token_set_ratio(s1, s2):

# q is for quick
def QRatio(s1, s2):
    """
    Quick ratio: normalise both inputs, then score them with ``ratio``.

    Both arguments are passed through ``full_process`` first. If either
    normalised string fails ``validate_string`` the score is 0; otherwise
    the result of ``ratio(p1, p2)`` is returned.
    """
    p1 = full_process(s1)
    p2 = full_process(s2)

    # Validate after normalisation: an input made only of characters that
    # full_process strips ends up empty and must score 0.
    if not validate_string(p1):
        return 0
    if not validate_string(p2):
        return 0

    return ratio(p1, p2)

# w is for weighted
def WRatio(s1, s2):

p1 = full_process(s1)
p2 = full_process(s2)

if not validate_string(p1): return 0
if not validate_string(p2): return 0

Expand Down
5 changes: 2 additions & 3 deletions fuzzywuzzy/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"""
from fuzz import *

import sys, os
import utils

#######################################
Expand All @@ -46,9 +45,9 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
if choices is None or len(choices) == 0:
return []

# default, turn whatever the choice is into a string
# default, turn whatever the choice is into a workable string
if processor is None:
processor = lambda x: utils.asciidammit(x)
processor = lambda x: utils.full_process(x)

# default: wratio
if scorer is None:
Expand Down
41 changes: 41 additions & 0 deletions fuzzywuzzy/string_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import re
import string
import unicodedata

class StringProcessor(object):
    """
    Collection of string clean-up helpers.

    Every helper is a class method, so no instance is required. Ideally all
    of them are given unicode strings and return unicode strings.
    """

    # Compiled once when the class is created rather than on every call.
    # The (?u) flag makes \W Unicode-aware. \W does not match the
    # underscore, so "_" is left untouched by the replacement below.
    _NON_WORD_RUN = re.compile(r"(?ui)[\W]+")

    @classmethod
    def replace_non_lettters_non_numbers_with_whitespace(cls, a_string):
        """
        Replace every run of non-word characters with a single space.

        Letters and digits are kept, and so is the underscore, because the
        pattern is built on \\W. Leading or trailing runs become a single
        leading or trailing space; they are not removed here.
        """
        return cls._NON_WORD_RUN.sub(u" ", a_string)

    @classmethod
    def strip(cls, a_string):
        """
        Return the string without leading and trailing white space.
        """
        return a_string.strip()

    @classmethod
    def to_lower_case(cls, a_string):
        """
        Return the lower-cased version of the given string.
        """
        return a_string.lower()

    @classmethod
    def to_upper_case(cls, a_string):
        """
        Return the upper-cased version of the given string.
        """
        return a_string.upper()

59 changes: 36 additions & 23 deletions fuzzywuzzy/utils.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
import string

bad_chars=''
for i in range(128,256):
bad_chars+=chr(i)
table_from=string.punctuation+string.ascii_uppercase
table_to=' '*len(string.punctuation)+string.ascii_lowercase
trans_table=string.maketrans(table_from, table_to)


def asciionly(s):
return s.translate(None, bad_chars)

# remove non-ASCII characters from strings
def asciidammit(s):
if type(s) is str:
return asciionly(s)
elif type(s) is unicode:
return asciionly(s.encode('ascii', 'ignore'))
else:
return asciidammit(unicode(s))
from string_processing import StringProcessor

# Old FuzzyWuzzy code, commented out for the archives; to be removed once the changes have been announced.
# asciidammit prevents any proper unicode processing
# and would return u"None" when fed with None.
#
# import string
#
# bad_chars=''
# for i in range(128,256):
# bad_chars+=chr(i)
# table_from=string.punctuation+string.ascii_uppercase
# table_to=' '*len(string.punctuation)+string.ascii_lowercase
# trans_table=string.maketrans(table_from, table_to)
#
#
# def asciionly(s):
# return s.translate(None, bad_chars)
#
# remove non-ASCII characters from strings
# def asciidammit(s):
# if type(s) is str:
# return asciionly(s)
# elif type(s) is unicode:
# return asciionly(s.encode('ascii', 'ignore'))
# else:
# return asciidammit(unicode(s))

def validate_string(s):
try:
Expand All @@ -30,8 +36,15 @@ def validate_string(s):
return False

def full_process(s):
    """
    Normalise a string so it can be compared.

    Runs of non-word characters are replaced by a single space, the result
    is lower-cased, and leading and trailing white space is removed.
    ``None`` yields an empty unicode string.
    """
    if s is None:
        return u""
    # Keep only letters and numbers (see Unicode docs).
    string_out = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(s)
    # Force into lowercase.
    string_out = StringProcessor.to_lower_case(string_out)
    # Remove leading and trailing whitespace.
    string_out = StringProcessor.strip(string_out)
    return string_out

def intr(n):
'''Returns a correctly rounded integer'''
Expand Down
59 changes: 42 additions & 17 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,20 @@
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzywuzzy import utils
from fuzzywuzzy.string_processing import StringProcessor

import itertools
import unittest
import re

class StringProcessingTest(unittest.TestCase):
    def test_replace_non_lettters_non_numbers_with_whitespace(self):
        """
        After processing, every remaining run of non-word characters
        must be exactly one space.
        """
        strings = [u"new york mets - atlanta braves", u"Cães danados", u"New York //// Mets $$$", u"Ça va?"]
        # Same pattern the processor uses; compiled once, outside the loop.
        regex = re.compile(r"(?ui)[\W]+")
        for raw in strings:
            proc_string = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(raw)
            for expr in regex.finditer(proc_string):
                # assertEqual, not the deprecated assertEquals alias.
                self.assertEqual(expr.group(), " ")

class UtilsTest(unittest.TestCase):
def setUp(self):
Expand All @@ -17,10 +28,10 @@ def setUp(self):
self.s5 = "atlanta braves vs new york mets"
self.s6 = "new york mets - atlanta braves"
self.mixed_strings = [
"Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
"C'est la vie",
"Ça va?",
"Cães danados",
u"Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
u"C'est la vie",
u"Ça va?",
u"Cães danados",
u"\xacCamarões assados",
u"a\xac\u1234\u20ac\U00008000",
u"\u00C1"
Expand All @@ -30,16 +41,6 @@ def setUp(self):
def tearDown(self):
pass

def test_asciidammit(self):
for s in self.mixed_strings:
utils.asciidammit(s)

def test_asciionly(self):
for s in self.mixed_strings:
# ascii only only runs on strings
s = utils.asciidammit(s)
utils.asciionly(s)

def test_fullProcess(self):
for s in self.mixed_strings:
utils.full_process(s)
Expand Down Expand Up @@ -138,16 +139,40 @@ def testIssueSeven(self):

def testWRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
s2 = u"ABCD"
score = fuzz.WRatio(s1, s2)
self.assertEqual(0, score)

# Cyrillic.
s1 = u"\u043f\u0441\u0438\u0445\u043e\u043b\u043e\u0433"
s2 = u"\u043f\u0441\u0438\u0445\u043e\u0442\u0435\u0440\u0430\u043f\u0435\u0432\u0442"
score = fuzz.WRatio(s1, s2)
self.assertNotEqual(0, score)

# Chinese.
s1 = u"\u6211\u4e86\u89e3\u6570\u5b66"
s2 = u"\u6211\u5b66\u6570\u5b66"
score = fuzz.WRatio(s1, s2)
self.assertNotEqual(0, score)

def testQRatioUnicodeString(self):
s1 = u"\u00C1"
s2 = "ABCD"
s2 = u"ABCD"
score = fuzz.QRatio(s1, s2)
self.assertEqual(0, score)

# Cyrillic.
s1 = u"\u043f\u0441\u0438\u0445\u043e\u043b\u043e\u0433"
s2 = u"\u043f\u0441\u0438\u0445\u043e\u0442\u0435\u0440\u0430\u043f\u0435\u0432\u0442"
score = fuzz.QRatio(s1, s2)
self.assertNotEqual(0, score)

# Chinese.
s1 = u"\u6211\u4e86\u89e3\u6570\u5b66"
s2 = u"\u6211\u5b66\u6570\u5b66"
score = fuzz.QRatio(s1, s2)
self.assertNotEqual(0, score)

# test processing methods
def testGetBestChoice1(self):
query = "new york mets at atlanta braves"
Expand Down Expand Up @@ -282,4 +307,4 @@ def testNullStrings(self):


if __name__ == '__main__':
unittest.main() # run all tests
unittest.main() # run all tests