-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path vector_generate.py
57 lines (48 loc) · 1.78 KB
/
vector_generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import gensim
def read_freq_dict(freq_dict_path):
    """Load a comma-separated frequency dictionary file.

    Each line of the file has the form ``word,frequency``. Values are kept
    as raw strings (no numeric conversion).

    :param freq_dict_path: path to the CSV-style frequency file
    :return: dict mapping word -> frequency string
    """
    with open(freq_dict_path, 'r', encoding='utf-8') as f_dict:
        pairs = (line.strip().split(',') for line in f_dict)
        return {word: freq for word, freq in pairs}
def read_dataset(path):
    """Read the annotation dataset (tab-separated).

    Each line has at least two tab-separated fields:
      field 0 — a sentence with tokens separated by spaces (spaces are removed
                to reconstruct the raw sentence, e.g. for Chinese text);
      field 1 — the difficult word annotated for that sentence.

    :param path: path to the TSV dataset file
    :return: tuple (sentences, difficult_words) of parallel lists
    """
    sentences = []
    difficult_words = []
    # NOTE: the original code declared an unused `mask_indexs` list; removed.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            row = line.strip().split('\t')
            # Re-join space-separated tokens into one contiguous sentence.
            sentences.append(''.join(row[0].split(' ')))
            difficult_words.append(row[1])
    return sentences, difficult_words
def read_dict(dict_path):
    """Load a comma-separated ``key,value`` file into a dictionary.

    Values are kept as raw strings (no numeric conversion).

    :param dict_path: path to the CSV-style dictionary file
    :return: dict mapping key -> value string
    """
    # Renamed local from `dict` to `mapping`: the original shadowed the
    # builtin `dict` type inside this function.
    mapping = {}
    with open(dict_path, 'r', encoding='utf-8') as f_dict:
        for line in f_dict:
            key, value = line.strip().split(',')
            mapping[key] = value
    return mapping
def save_results(sim_words, output_path):
    """Append one result line to *output_path*.

    The similar words are joined with single spaces and written as one
    newline-terminated line; the file is opened in append mode so repeated
    calls accumulate results.
    """
    line = ' '.join(sim_words)
    with open(output_path, 'a', encoding='utf-8') as f_result:
        f_result.write(line + '\n')
def main():
    """Generate candidate substitution words for each difficult word.

    Reads the annotated dataset, then for every difficult word queries a
    pretrained word2vec model for its most-similar words and appends them
    to the output CSV (one space-separated line per word). Words missing
    from the model's vocabulary produce the single placeholder ``NULL``.
    """
    EVAL_FILE = './dataset/annotation_data.csv'
    OUTPUT_PATH = './data/vector_output.csv'
    WORD_2_VECTOR_MODEL_DIR = './model/merge_sgns_bigram_char300.txt'
    eval_file = EVAL_FILE
    output_path = OUTPUT_PATH
    word_2_vector_model_dir = WORD_2_VECTOR_MODEL_DIR

    sentences, difficult_words = read_dataset(eval_file)

    # BUGFIX: the model was previously reloaded from disk on every loop
    # iteration — pretrained text-format embeddings are very large, so
    # loading once here is dramatically faster with identical results.
    model_word2vector = gensim.models.KeyedVectors.load_word2vec_format(
        word_2_vector_model_dir, binary=False)

    for difficult_word in difficult_words:
        sim_words = []
        try:
            # most_similar raises KeyError when the word is out-of-vocabulary.
            sim_words = model_word2vector.most_similar(difficult_word)
            sim_words = [item[0] for item in sim_words]
        except KeyError:
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt and real bugs.
            sim_words.append('NULL')
        print(sim_words)
        save_results(sim_words, output_path)
if __name__ == '__main__':
    main()