w2v_utils.py
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

from util import tprint  # local helper: print with a timestamp


def read_pretrained_w2v(pretrained_w2v_fname, is_glove):
    """
    Read pretrained word embeddings from the given file.

    is_glove: if True, the file is assumed to be a GloVe text file
        ("word v1 v2 ... vd" per line); otherwise, a word2vec binary file.

    Returns:
        w2v: in the GloVe case, a dict mapping each word to its vector;
            in the word2vec case, a gensim KeyedVectors instance.
        mean_vec: the mean of all vectors (can be used for <unk>).
    """
tprint("reading file: {}".format(pretrained_w2v_fname))
if is_glove:
w2vec = pd.read_csv(pretrained_w2v_fname, header=None, sep=' ', quoting=3, encoding="ISO-8859-1")
tprint("done")
w2v_words = w2vec.iloc[:, 0].values
w2v_vectors = w2vec.iloc[:, 1:].values
num_words, dim = w2v_vectors.shape
mean_vec = np.mean(w2v_vectors, 0)
w2v = {}
for word_i, word in enumerate(w2v_words):
w2v[word] = w2v_vectors[word_i, :]
else:
w2v = KeyedVectors.load_word2vec_format(pretrained_w2v_fname, binary=True)
tprint("done")
num_words = len(w2v.vocab)
dim = w2v.vector_size
mean_vec = np.mean(w2v.syn0, 0)
print("dim: {}".format(dim))
print("num_words: {}".format(num_words))
return w2v, mean_vec
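

# A minimal usage sketch (assumption: the embedding file names below are
# hypothetical placeholders, not files shipped with this repo). It shows the
# two supported formats and how mean_vec can serve as the fallback <unk>
# vector in the GloVe/dict case.
if __name__ == "__main__":
    # GloVe text format: returns a plain dict of word -> np.ndarray
    w2v, mean_vec = read_pretrained_w2v("glove.6B.100d.txt", is_glove=True)
    vec = w2v.get("hello", mean_vec)  # fall back to mean_vec for OOV words

    # word2vec binary format: returns a gensim KeyedVectors instance
    # kv, kv_mean = read_pretrained_w2v("GoogleNews-vectors-negative300.bin",
    #                                   is_glove=False)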