Skip to content
This repository has been archived by the owner on Aug 26, 2024. It is now read-only.

Commit

Permalink
Merge pull request #23 from seatgeek/pr/20
Browse files Browse the repository at this point in the history
Pull Request #20 Augmented With force_ascii parameter
  • Loading branch information
Adam Cohen committed May 3, 2013
2 parents 6af3b31 + a52d149 commit b486605
Show file tree
Hide file tree
Showing 6 changed files with 234 additions and 89 deletions.
47 changes: 29 additions & 18 deletions benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

from timeit import timeit
from fuzzywuzzy import utils
import math

iterations=100000
iterations = 100000

cirque_strings = [
"cirque du soleil - zarkana - las vegas",
Expand All @@ -27,45 +28,55 @@
mixed_strings = [
"Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
"C\\'est la vie",
"Ça va?",
"Cães danados",
u"Ça va?",
u"Cães danados",
u"\xacCamarões assados",
u"a\xac\u1234\u20ac\U00008000"
]

common_setup = "from fuzzywuzzy import fuzz, utils; "
basic_setup = "from fuzzywuzzy.string_processing import StringProcessor;"

for s in choices:
print 'Test for string: "%s"' % s
# print 'Old: %f' % round(timeit('utils.validate_stringold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.validate_string(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
def print_result_from_timeit(stmt='pass', setup='pass', number=1000000):
    """
    Time a statement with timeit and print the total and average run time.

    stmt, setup and number are handed unchanged to timeit(). The average
    duration of one run is printed in the unit (s, ms, us or ns) that best
    fits its magnitude; values outside that range fall back to the nearest
    available unit instead of failing or picking a wrong one.
    """
    units = ["s", "ms", "us", "ns"]
    duration = timeit(stmt, setup, number=number)
    avg_duration = duration / float(number)

    if avg_duration > 0:
        thousands = int(math.floor(math.log(avg_duration, 1000)))
    else:
        # The timer could not resolve the run at all; log(0) is undefined,
        # so report the (zero) average in seconds.
        thousands = 0

    # thousands is 0 for seconds, -1 for ms, -2 for us, -3 for ns. Clamp the
    # derived index so averages >= 1000s stay in seconds (a negative index
    # would wrap around to "ns") and averages below 1ns stay in nanoseconds
    # (an index of 4 or more would raise IndexError).
    unit_index = min(max(-thousands, 0), len(units) - 1)

    # Blank separator line, then the report. Single-argument parenthesised
    # prints behave the same under the Python 2 print statement.
    print("")
    print("Total time: %fs. Average run: %.3f%s."
          % (duration, avg_duration * (1000 ** unit_index), units[unit_index]))

for s in mixed_strings:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.asciidammitold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.asciidammit(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
for s in choices:
print 'Test validate_string for: "%s"' % s
print_result_from_timeit('utils.validate_string(\'%s\')' \
% s, common_setup, number=iterations)

print

for s in mixed_strings+cirque_strings+choices:
print 'Test for string: "%s"' % s
#print 'Old: %f' % round(timeit('utils.full_processold(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'New: %f' % round(timeit('utils.full_process(\'%s\')' % s, "from fuzzywuzzy import utils",number=iterations),4)
print 'Test full_process for: "%s"' % s
print_result_from_timeit('utils.full_process(u\'%s\')' \
% s, common_setup + basic_setup, number=iterations)

### benchmarking the core matching methods...

for s in cirque_strings:
print 'Test fuzz.ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
print_result_from_timeit('fuzz.ratio(u\'cirque du soleil\', u\'%s\')' \
% s, common_setup + basic_setup, number=iterations/100)

for s in cirque_strings:
print 'Test fuzz.partial_ratio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.partial_ratio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
print_result_from_timeit('fuzz.partial_ratio(u\'cirque du soleil\', u\'%s\')' \
% s, common_setup + basic_setup, number=iterations/100)

for s in cirque_strings:
print 'Test fuzz.WRatio for string: "%s"' % s
print '-------------------------------'
print 'New: %f' % round(timeit('fuzz.WRatio(\'cirque du soleil\', \'%s\')' % s, "from fuzzywuzzy import fuzz",number=iterations/100),4)
print_result_from_timeit('fuzz.WRatio(u\'cirque du soleil\', u\'%s\')' \
% s, common_setup + basic_setup, number=iterations/100)
75 changes: 35 additions & 40 deletions fuzzywuzzy/fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

import sys
import os
import re
from utils import *

Expand All @@ -35,8 +33,6 @@
except:
from difflib import SequenceMatcher

REG_TOKEN = re.compile("[\w\d]+")

###########################
# Basic Scoring Functions #
###########################
Expand Down Expand Up @@ -90,14 +86,14 @@ def partial_ratio(s1, s2):
# find all alphanumeric tokens in the string
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
def _token_sort(s1, s2, partial=True):
def _token_sort(s1, s2, partial=True, force_ascii=True):

if s1 is None: raise TypeError("s1 is None")
if s2 is None: raise TypeError("s2 is None")

# pull tokens
tokens1 = REG_TOKEN.findall(s1)
tokens2 = REG_TOKEN.findall(s2)
tokens1 = full_process(s1, force_ascii=force_ascii).split()
tokens2 = full_process(s2, force_ascii=force_ascii).split()

# sort tokens and join
sorted1 = u" ".join(sorted(tokens1))
Expand All @@ -111,28 +107,32 @@ def _token_sort(s1, s2, partial=True):
else:
return ratio(sorted1, sorted2)

def token_sort_ratio(s1, s2):
return _token_sort(s1, s2, False)
def token_sort_ratio(s1, s2, force_ascii=True):
    """Return the _token_sort result for s1 and s2 with partial matching
    switched off; force_ascii is forwarded unchanged."""
    return _token_sort(s1, s2, force_ascii=force_ascii, partial=False)

def partial_token_sort_ratio(s1, s2):
return _token_sort(s1, s2, True)
def partial_token_sort_ratio(s1, s2, force_ascii=True):
    """Return the _token_sort result for s1 and s2 with partial matching
    switched on; force_ascii is forwarded unchanged."""
    return _token_sort(s1, s2, force_ascii=force_ascii, partial=True)

# Token Set
# find all alphanumeric tokens in each string...treat them as a set
# construct two strings of the form
# <sorted_intersection><sorted_remainder>
# take ratios of those two strings
# controls for unordered partial matches
def _token_set(s1, s2, partial=True):
def _token_set(s1, s2, partial=True, force_ascii=True):

if s1 is None: raise TypeError("s1 is None")
if s2 is None: raise TypeError("s2 is None")

if not (validate_string(s1) and validate_string(s2)): return 0
p1 = full_process(s1, force_ascii=force_ascii)
p2 = full_process(s2, force_ascii=force_ascii)

if not validate_string(p1): return 0
if not validate_string(p2): return 0

# pull tokens
tokens1 = set(REG_TOKEN.findall(s1))
tokens2 = set(REG_TOKEN.findall(s2))
tokens1 = set(full_process(p1).split())
tokens2 = set(full_process(p2).split())

intersection = tokens1.intersection(tokens2)
diff1to2 = tokens1.difference(tokens2)
Expand All @@ -157,19 +157,11 @@ def _token_set(s1, s2, partial=True):
]
return max(pairwise)

# if partial:
# # partial_token_set_ratio
#
# else:
# # token_set_ratio
# tsr = ratio(combined_1to2, combined_2to1)
# return tsr

def token_set_ratio(s1, s2):
return _token_set(s1, s2, False)
def token_set_ratio(s1, s2, force_ascii=True):
    """Return the _token_set result for s1 and s2 with partial matching
    switched off; force_ascii is forwarded unchanged."""
    return _token_set(s1, s2, force_ascii=force_ascii, partial=False)

def partial_token_set_ratio(s1, s2):
return _token_set(s1, s2, True)
def partial_token_set_ratio(s1, s2, force_ascii=True):
    """Return the _token_set result for s1 and s2 with partial matching
    switched on; force_ascii is forwarded unchanged."""
    return _token_set(s1, s2, force_ascii=force_ascii, partial=True)

# TODO: numerics

Expand All @@ -178,19 +170,22 @@ def partial_token_set_ratio(s1, s2):
###################

# q is for quick
def QRatio(s1, s2):
if not validate_string(s1): return 0
if not validate_string(s2): return 0
def QRatio(s1, s2, force_ascii=True):

p1 = full_process(s1)
p2 = full_process(s2)
p1 = full_process(s1, force_ascii=force_ascii)
p2 = full_process(s2, force_ascii=force_ascii)

if not validate_string(p1): return 0
if not validate_string(p2): return 0

return ratio(p1, p2)

# w is for weighted
def WRatio(s1, s2):
p1 = full_process(s1)
p2 = full_process(s2)
def WRatio(s1, s2, force_ascii=True):

p1 = full_process(s1, force_ascii=force_ascii)
p2 = full_process(s2, force_ascii=force_ascii)

if not validate_string(p1): return 0
if not validate_string(p2): return 0

Expand All @@ -209,14 +204,14 @@ def WRatio(s1, s2):
if len_ratio > 8: partial_scale = .6

if try_partial:
partial = partial_ratio(p1, p2) * partial_scale
ptsor = partial_token_sort_ratio(p1, p2) * unbase_scale * partial_scale
ptser = partial_token_set_ratio(p1, p2) * unbase_scale * partial_scale
partial = partial_ratio(p1, p2) * partial_scale
ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale * partial_scale
ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale * partial_scale

return int(max(base, partial, ptsor, ptser))
else:
tsor = token_sort_ratio(p1, p2) * unbase_scale
tser = token_set_ratio(p1, p2) * unbase_scale
tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale

return int(max(base, tsor, tser))

7 changes: 3 additions & 4 deletions fuzzywuzzy/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
"""
from fuzz import *

import sys, os
import itertools
import utils

Expand All @@ -47,9 +46,9 @@ def extract(query, choices, processor=None, scorer=None, limit=5):
if choices is None or len(choices) == 0:
return []

# default, turn whatever the choice is into a string
# default, turn whatever the choice is into a workable string
if processor is None:
processor = lambda x: utils.asciidammit(x)
processor = lambda x: utils.full_process(x)

# default: wratio
if scorer is None:
Expand Down Expand Up @@ -82,7 +81,7 @@ def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, li
return list(itertools.takewhile(lambda x: x[1] > score_cutoff, best_list))
else:
return []

##########################
# Find Single Best Match #
##########################
Expand Down
41 changes: 41 additions & 0 deletions fuzzywuzzy/string_processing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import re
import string
import unicodedata

class StringProcessor(object):
    """
    Collection of small string clean-up helpers exposed as class methods.

    The methods are written with unicode strings in mind for both input
    and output.
    """

    # Compiled once when the class is created rather than on every call.
    # The pattern text is unchanged: one or more consecutive characters that
    # are not word characters, matched with the Unicode and ignore-case
    # inline flags.
    _non_word_run = re.compile(r"(?ui)[\W]+")

    @classmethod
    def replace_non_lettters_non_numbers_with_whitespace(cls, a_string):
        """
        Replace every run of non-word characters with a single space.

        Letters and digits are kept. The underscore is also treated as a
        word character by the pattern, so it is left untouched. The
        misspelling in the method name is kept so existing callers continue
        to work.
        """
        return cls._non_word_run.sub(u" ", a_string)

    @classmethod
    def strip(cls, a_string):
        """
        Return a_string without leading and trailing white space.
        """
        return a_string.strip()

    @classmethod
    def to_lower_case(cls, a_string):
        """
        Return the lower-cased version of a_string.
        """
        return a_string.lower()

    @classmethod
    def to_upper_case(cls, a_string):
        """
        Return the upper-cased version of a_string.
        """
        return a_string.upper()

57 changes: 38 additions & 19 deletions fuzzywuzzy/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import string
from string_processing import StringProcessor

def validate_string(s):
    """
    Return True when s has a length greater than zero, False otherwise.

    Objects that do not support len() (None, numbers, ...) are reported as
    invalid instead of raising.
    """
    try:
        return len(s) > 0
    except TypeError:
        # len() raises TypeError for unsized objects. Catching only that
        # keeps KeyboardInterrupt/SystemExit from being swallowed, which the
        # previous bare except did.
        return False

# Every character with an ordinal from 128 to 255, i.e. the part of the 8-bit
# range above ASCII. Built with a single join so no loop variable is left
# behind in the module namespace (which other modules star-import).
bad_chars = ''.join(chr(code) for code in range(128, 256))
# Translation table that maps each punctuation character to a space and each
# ASCII uppercase letter to its lowercase counterpart.
# NOTE: string.maketrans is the Python 2 spelling of this API.
table_from=string.punctuation+string.ascii_uppercase
table_to=' '*len(string.punctuation)+string.ascii_lowercase
trans_table=string.maketrans(table_from, table_to)


def asciionly(s):
    """Return s with every character listed in bad_chars deleted.

    Uses the two-argument str.translate form (table=None, deletechars),
    which is the Python 2 byte-string API.
    """
    return s.translate(None, bad_chars)

# remove non-ASCII characters from strings
def asciidammit(s):
if type(s) is str:
return asciionly(s)
Expand All @@ -20,21 +24,36 @@ def asciidammit(s):
else:
return asciidammit(unicode(s))

def validate_string(s):
try:
if len(s)>0:
return True
else:
return False
except:
return False
def make_type_consistent(s1, s2):
if isinstance(s1, str) and isinstance(s2, str):
return s1, s2

elif isinstance(s1, unicode) and isinstance(s2, unicode):
return s1, s2

def full_process(s):
s = asciidammit(s)
return s.translate(trans_table, bad_chars).strip()
else:
return unicode(s1), unicode(s2)

def full_process(s, force_ascii=False):
    """Normalise a string for comparison.

    Steps, in order:
        -- when force_ascii is true, run the input through asciidammit
        -- replace runs of characters that are neither letters nor numbers
           with a single space
        -- force to lower case
        -- trim leading and trailing whitespace

    None is mapped to the empty unicode string."""
    if s is None:
        return u""

    text = asciidammit(s) if force_ascii else s

    # Keep only letters and numbers (see Unicode docs), then lower-case and
    # strip the surrounding whitespace.
    collapsed = StringProcessor.replace_non_lettters_non_numbers_with_whitespace(text)
    lowered = StringProcessor.to_lower_case(collapsed)
    return StringProcessor.strip(lowered)

def intr(n):
    '''Round n to the nearest whole number and return it as an int'''
    nearest = round(n)
    return int(nearest)


Loading

0 comments on commit b486605

Please sign in to comment.