-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdict_generate.py
58 lines (48 loc) · 1.69 KB
/
dict_generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def read_dict(dict_path):
    """Load the synonym dictionary as a list of word groups.

    Every line is read with its first 9 characters discarded (presumably a
    category code such as ``Aa01A01=`` plus one separating space -- confirm
    against the dictionary file); the remainder is split on single spaces.

    Args:
        dict_path: Path to the UTF-8 encoded dictionary file.

    Returns:
        A list of entries, each a non-empty list of non-empty word strings.
        Lines that contain no words after the prefix are skipped.
    """
    entries = []
    with open(dict_path, 'r', encoding='utf-8') as f_dict:
        for line in f_dict:
            # str.split(' ') never returns an empty list (''.split(' ') is
            # ['']), so empty tokens must be filtered explicitly; otherwise
            # blank lines would be stored as [''] and doubled spaces would
            # leave '' words inside an entry.
            words = [word for word in line[9:].strip().split(' ') if word]
            if words:
                entries.append(words)
    return entries
def read_eval_dataset(data_path):
    """Read the evaluation dataset of sentences and their difficult words.

    Each non-blank line is split on tabs: the first field is a sentence whose
    tokens are separated by spaces, the second field is the difficult word.
    Any further fields are ignored.

    Args:
        data_path: Path to the UTF-8 encoded, tab-separated dataset file.

    Returns:
        A ``(sentences, difficult_words)`` tuple of two parallel lists. Each
        sentence has its spaces removed.

    Raises:
        IndexError: If a non-blank line has fewer than two tab-separated
            fields.
    """
    sentences = []
    difficult_words = []
    with open(data_path, 'r', encoding='utf-8') as reader:
        for line in reader:
            record = line.strip()
            if not record:
                # A blank line (e.g. a trailing empty line at the end of the
                # file) carries no record; indexing its fields would raise.
                continue
            row = record.split('\t')
            sentence, difficult_word = row[0], row[1]
            sentences.append(sentence.replace(' ', ''))
            difficult_words.append(difficult_word)
    return sentences, difficult_words
def save_results(result, output_path):
    """Append one line of space-separated items to the output file.

    Args:
        result: Iterable of strings to write on a single line.
        output_path: Path of the UTF-8 file to append to; it is opened in
            append mode, so earlier contents are kept.
    """
    with open(output_path, mode='a', encoding='utf-8') as sink:
        # print() supplies the trailing newline that terminates the record.
        print(' '.join(result), file=sink)
def main():
    """Look up dictionary substitutes for every difficult word in the dataset.

    For each difficult word, the words of every dictionary entry that contains
    it are collected in dictionary order (the difficult word itself included).
    If no entry contains the word, the single placeholder ``'NULL'`` is used.
    Each resulting list is printed and appended as one line to the output
    file.
    """
    DICT_PATH = './dict/HIT-dict=.txt'
    DATA_PATH = './dataset/annotation_data.csv'
    OUTPUT_PATH = './data/dict_output.csv'

    entries = read_dict(DICT_PATH)
    # Only the difficult words are needed here; the sentences are unused.
    _, difficult_words = read_eval_dataset(DATA_PATH)

    for difficult_word in difficult_words:
        substitution_words = []
        for entry in entries:
            if difficult_word in entry:
                substitution_words.extend(entry)
        # A matching entry always contains at least the difficult word, so an
        # empty list means nothing was found.
        if not substitution_words:
            substitution_words.append('NULL')
        print(substitution_words)
        save_results(substitution_words, OUTPUT_PATH)
# Run the lookup only when executed as a script, not when imported.
if __name__ == '__main__':
    main()