-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsememe.py
119 lines (103 loc) · 3.77 KB
/
sememe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#coding: utf-8
import random
class Sememe(object):
    '''Word-to-sememe lookup built from a HowNet-style annotation file.

    Attributes:
        labelToIdx: dict mapping a sememe string to its integer index.
        lower: the `lower` flag passed to the constructor (stored only;
            nothing in this class reads it).
        lemma_dict: dict mapping an inflected word form to its lemma.
        hownet: dict mapping a word to its list of sememe strings.
    '''

    def __init__(self, hownet_dir, sememe_dir, lemma_dir, filename=None,
                 lower=True, drop_rate=1, meaningless=False, wordnet=False):
        '''Load the lemma table and annotations, then optionally the vocabulary.

        Args:
            hownet_dir: path of the word/sememe annotation file.
            sememe_dir: accepted for interface compatibility; it is only
                reassigned when `wordnet` is set and is never read.
            lemma_dir: path of the lemmatization file.
            filename: path of the sememe vocabulary file handed to
                `loadFile`, or None to leave the vocabulary empty.
            lower: stored on the instance as `self.lower`.
            drop_rate: when below 1 (and `meaningless` is false), each word
                is kept with this probability and dropped otherwise.
            meaningless: when true, every word's sememes are replaced by
                1 to 4 sememes drawn at random from all known sememes.
            wordnet: when true, the hard-coded 'new_hownet.txt' and
                'new_sememe.txt' override `hownet_dir` and `filename`.
        '''
        self.labelToIdx = {}
        self.lower = lower
        if wordnet:
            hownet_dir = 'new_hownet.txt'
            sememe_dir = 'new_sememe.txt'
            filename = 'new_sememe.txt'
        self.lemma_dict = self.read_lemmatization(lemma_dir)
        hownet_ori = self.read_hownet(hownet_dir)

        if not meaningless:
            if drop_rate < 1:
                # random.random() yields a float in [0, 1); the previous
                # argument-less random.uniform() call raised TypeError.
                self.hownet = {
                    word: sememes
                    for word, sememes in hownet_ori.items()
                    if random.random() <= drop_rate
                }
            else:
                self.hownet = hownet_ori
        else:
            # Collect every distinct sememe in first-seen order; the set
            # keeps the membership test O(1).
            all_sememes = []
            seen = set()
            for word in hownet_ori:
                for sememe in hownet_ori[word]:
                    if sememe not in seen:
                        seen.add(sememe)
                        all_sememes.append(sememe)

            # Re-annotate each word with 1-4 random sememes (repeats are
            # allowed). random.choice covers the whole list, including
            # index 0, and works when only one sememe exists.
            counts = [1, 2, 3, 4]
            new_hownet = {}
            for word in hownet_ori:
                new_hownet[word] = [
                    random.choice(all_sememes)
                    for _ in range(random.choice(counts))
                ]
            self.hownet = new_hownet

        if filename is not None:
            self.loadFile(filename)

    def size(self):
        '''Return the number of sememes registered in `labelToIdx`.'''
        return len(self.labelToIdx)

    def loadFile(self, file_dir):
        '''Register the sememes listed in `file_dir` and print the total.

        Lines are consumed in pairs: the first line of each pair is
        skipped and the second is split on tabs, each new field receiving
        the next free index. If the file has an odd number of lines, the
        missing final line is read as an empty string, which registers
        the empty label ''.
        '''
        with open(file_dir, 'r') as f:
            line = f.readline()
            while line:
                line = f.readline()
                for item in line.strip().split('\t'):
                    if item not in self.labelToIdx:
                        self.labelToIdx[item] = len(self.labelToIdx)
                line = f.readline()
        print(len(self.labelToIdx))

    def getIndex(self, key):
        '''Return the index of sememe `key`, or None when it is unknown.'''
        return self.labelToIdx.get(key)

    def convertToIdx(self, labels):
        '''Map each sememe in `labels` to its index (None when unknown).'''
        return [self.getIndex(label) for label in labels]

    def read_sememe(self, labels):
        '''Return the sememe index list of every word in a sentence.

        input: a list of each word in a single sentence
        output: a list of each word's sememe list in a single sentence
        '''
        return [self.read_word_sememe(label) for label in labels]

    def read_word_sememe(self, word):
        '''Return the de-duplicated sememe indices of one word.

        input: a word
        output: a list of the word's sememe

        The word itself is looked up first; if it has no annotation, its
        lemma (when `lemma_dict` has one) is tried instead. Sememes that
        are missing from the vocabulary contribute a single None entry.
        '''
        if word in self.hownet:
            key = word
        elif word in self.lemma_dict and self.lemma_dict[word] in self.hownet:
            key = self.lemma_dict[word]
        else:
            return []

        labels = []
        for item in self.hownet[key]:
            idx = self.getIndex(item)
            if idx not in labels:
                labels.append(idx)
        return labels

    def read_hownet(self, hownet_dir):
        '''Parse the annotation file into a word -> sememe list dict.

        The file alternates a word line with a tab-separated sememe line.
        Sememes of a repeated word are merged without duplicates. If the
        file has an odd number of lines, the last word receives the empty
        sememe ''.
        '''
        dic_hownet = {}
        with open(hownet_dir, 'r') as f1:
            line = f1.readline()
            while line:
                word = line.strip()
                if word not in dic_hownet:
                    dic_hownet[word] = []
                line = f1.readline()
                for item in line.strip().split('\t'):
                    if item not in dic_hownet[word]:
                        dic_hownet[word].append(item)
                line = f1.readline()
        return dic_hownet

    def read_lemmatization(self, lemma_dir):
        '''Parse the lemma file into an inflected form -> lemma dict.

        Each line holds the lemma followed by the inflected form,
        separated by whitespace. Lines with fewer than two fields (for
        example blank lines) are skipped.
        '''
        dic_lemma = {}
        with open(lemma_dir) as f:
            for line in f:
                fields = line.strip().split()
                if len(fields) < 2:
                    continue
                dic_lemma[fields[1]] = fields[0]
        return dic_lemma