-
Notifications
You must be signed in to change notification settings - Fork 3
/
hebrew_spellcheck.py
169 lines (138 loc) · 5.56 KB
/
hebrew_spellcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# coding: utf-8
import codecs
import json
import re
dictionary_for_suffixes = {
u"כינוי": {
u"אני": u"אותי",
u"אתה": u"אותך",
u"את": u"אותך",
u"הוא": u"אותו",
u"היא": u"אותה",
u"אנו": u"אותנו",
u"אנחנו": u"אותנו",
u"אתם": u"אתכם",
u"אתן": u"אתכן",
u"הם": u"אותם",
u"הן": u"אותן",
},
u"של": {
u"אני": u"שלי",
u"אתה": u"שלך",
u"את": u"שלך",
u"הוא": u"שלו",
u"היא": u"שלה",
u"אנו": u"שלנו",
u"אנחנו": u"שלנו",
u"אתם": u"שלכם",
u"אתן": u"שלכן",
u"הם": u"שלהם",
u"הן": u"שלהן",
},
u"סמיכות": u"של"
}
def we_havent_added_prefix_or_suffix(word):
"""
Receives a modified word. This word should have been expanded into prefix - root - suffix.
If the modified word only has one word, that means there has been no modification.
:param word: The modified word
:return: Boolean Value determining if the word has not been modified
"""
return len(word.split()) == 1
def determine_beginning_stopword(beginning_stopword):
"""
Appends the approproate beginning stopword. There are three possibilites,
therefore the correct option needs to be determine
:param beginning_stopword: The modified word
:return: Appropriate beginning stopword
"""
word = beginning_stopword.group(1).strip(',')
if word == u"הווה,רבים":
return u"אנחנו"
elif word == u"הווה,יחיד":
return u"אני"
else:
return word
def get_stopword_at_beginning(conj):
"""
Get the beginning stopword
:param conj: conjugation information of word
:return: Stopword if known, otherwise None
"""
pronouns = '|'.join([u"אני", u"אתה", u"את", u"הוא", u"היא", u"אנו", u"אתם", u"אתן", u"הם", u"הן"])
stopword_beginning_pattern = re.compile(ur"(,({})(,|$)|(הווה,רבים|הווה,יחיד))".format(pronouns))
return stopword_beginning_pattern.search(conj.strip())
def get_stopword_at_end(conj):
"""
Get the end stopword
:param conj: conjugation information of word
:return: Stopword if known, otherwise None
"""
last_word = conj.split(',')[-1].strip()
if last_word == u"סמיכות":
return dictionary_for_suffixes[last_word]
elif u'/' in last_word:
shel_or_kinui = last_word.split(u'/')[0]
pronoun = last_word.split(u'/')[1]
return dictionary_for_suffixes[shel_or_kinui][pronoun]
else:
return None
def create_each_replacement(one_possible_conjugation):
"""
Replaces a word with its prefix-root-suffix
:param one_possible_conjugation: dict of a word and its meta-information
:return: prefix-root-suffix replacement
"""
full_replacement = {'root': one_possible_conjugation['root'].strip()}
conj = one_possible_conjugation['conjugation']
beginning_stopword = get_stopword_at_beginning(conj)
if beginning_stopword:
full_replacement['prefix'] = determine_beginning_stopword(beginning_stopword)
end_stopword = get_stopword_at_end(conj)
if end_stopword:
full_replacement['suffix'] = end_stopword
return full_replacement
def choose_one_replacement(replacements):
"""
Takes a list of possible replacements. Each replacement is the expansion of a word into its prefix-root-suffix.
Replacement is returned only when there is no ambiguity However, if there is any level of ambiguity, None is returned.
This methods tries to find the least common denominator of agreement. If all replacement options agree on the suffix but there is ambiguity in the prefix, the root-suffix is returned.
:param replacements: List of possible replacements to be selected
:return: Selected Replacement without Ambiguity. In case of ambiguity, None is returned
"""
if any(len(d) == 1 for d in replacements):
return None
if len(replacements) == 1:
d = replacements.pop()
full_replacement = u"{} {} {}".format(d.get('prefix', u''), d.get('root', u''), d.get('suffix', u''))
return full_replacement.strip()
shoreshim = set([x['root'] for x in replacements])
if len(shoreshim) > 1:
return None
full_replacement = shoreshim.pop()
if all('prefix' in d for d in replacements):
full_replacement = u"{} {}".format(replacements[0]['prefix'], full_replacement)
if all('suffix' in d for d in replacements):
full_replacement = u"{} {}".format(full_replacement, replacements[0]['suffix'])
if we_havent_added_prefix_or_suffix(full_replacement):
return None
return full_replacement
def read_in_json_file(filename):
"""
Return the entire content of a json file
:param filename: Name of file to return
:return: Dict which is the entire JSON file
"""
with codecs.open(filename, encoding='utf8') as hspell:
whole_file = json.load(hspell)
return whole_file
def split_word_into_root_and_suffix():
whole_file = read_in_json_file('./all_hebrew_inv.json')
word_expander = {}
for full_word, all_conjugations in whole_file.items():
replacements = [create_each_replacement(each_conj) for each_conj in all_conjugations]
replacement = choose_one_replacement(replacements)
if replacement:
word_expander[full_word] = replacement
return word_expander
word_expander = split_word_into_root_and_suffix()