-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlanguage_match.py
76 lines (68 loc) · 2.93 KB
/
language_match.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
""" A set of functions for analyzing groups of languages
Written by Colin Hamilton, May 2016
"""
import json
from language import Language
def match(unknown, known):
""" Compares the unknown language with all known languages.
Args:
unknown: A Language object to be compared.
known: A dict mapping language names to Language objects.
Returns:
A list of tuples of the form (language_name, score), sorted from
best to worst matches.
"""
results = [(name, unknown.compare(known[name])) for name in known]
return sorted(results, key=lambda x: -x[1])
def read_languages(file_dict, n_max, cachefiles={}):
""" Creates a set of Language objects with the given languages and files.
Args:
file_dict: A dict mapping language names to a list of filenames
n_max: The maximum length n-grams for the Languages to track
Returns:
A dict mapping language names to Language objects populated with the
contents of the files specified.
Notes:
If a file cannot be read, an error message will be printed, and an
exception will not be thrown.
"""
results = {}
for lang in file_dict:
results[lang] = Language(n_max)
cachefile = ".{}.ngramcache".format(lang)
try:
with open(cachefile, "r") as cache:
if (results[lang].read_cache(json.load(cache),
expected=file_dict[lang])):
print("Cache up to date for {}".format(lang))
continue # success
except Exception:
pass
results[lang] = Language(n_max) # Overwrite any old results
print("Cache not up to date for {}".format(lang))
# if cache does not exist, is not up to date, or is corrupted
for filename in file_dict[lang]:
try:
results[lang].add_file(filename)
except Exception:
print("Could not read file", filename)
with open(cachefile, "w") as cachefile:
cachefile.write(json.dumps(results[lang].to_cache(file_dict[lang])))
return results
def best_matches(filename, reference_langs, n_max, amt=None):
""" Finds the closest matches for a document from among a set of Languages.
Args:
filename: The name of the file of the document to classify.
reference_langs: A dict mapping language names to Language objects.
n_max: The length of n-grams to classify the unknown document on.
amt: The number of results to return (or None, to return all)
Returns:
A list of tuples of the form (language_name, score), sorted from
best to worst matches. Only the top amt are in the list.
"""
if amt is None:
amt = len(reference_langs)
unknown = Language(n_max)
unknown.add_file(filename)
comparisons = match(unknown, reference_langs)
return comparisons[: min(amt, len(comparisons))]