-
Notifications
You must be signed in to change notification settings - Fork 16
/
fasttext_cos.py
46 lines (45 loc) · 1.66 KB
/
fasttext_cos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import pandas as pd
import numpy as np
import random as rn
from tqdm import tqdm, tqdm_notebook
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
import gc
import time
from gensim.models import Word2Vec
import fasttext
from gensim.models import Word2Vec
import scipy.spatial.distance as ssd
# Hook tqdm into pandas so progress-aware methods (e.g. ``progress_apply``)
# are available; nothing below depends on it, but it is kept as a
# module-level side effect.
tqdm.pandas()

# Data locations. Each directory string ends with '/', because the rest of
# the script builds file paths by plain string concatenation.
input_path = "/home/kesci/input/bytedance/"   # test CSV is read from here
out_work_path = '/home/kesci/work/zhifeng/'   # fastText model + output pickles
out_path = '/home/kesci/zhifeng/'             # train / validation CSVs

# NOTE: despite the name ``w2v`` this is a fastText model, not a gensim
# Word2Vec one; it is used below via ``get_sentence_vector``.
w2v = fasttext.load_model(
    os.path.join(out_work_path, 'corpus.fasttext.model'))
def _cosine_distances(csv_path, model, n_fields):
    """Compute one fastText cosine distance per line of a comma-separated file.

    Each line is stripped and split on ','. The texts at field positions
    1 and 3 (0-based; called ``q`` and ``a`` in the original script,
    presumably the query and the candidate text -- confirm against the data)
    are embedded with ``model.get_sentence_vector`` and compared with
    ``scipy.spatial.distance.cosine``.

    Note that the stored value is the cosine *distance* (1 - cosine
    similarity), so smaller means more similar. If either sentence vector is
    all zeros the distance is undefined and scipy typically yields ``nan``;
    such values are kept as-is so the output stays row-aligned with the input.

    Args:
        csv_path: Path of the input file; one sample per line, no quoting.
        model: Loaded fastText model exposing ``get_sentence_vector``.
        n_fields: Exact number of comma-separated fields expected per line
            (5 for the labelled files, 4 for the test file).

    Returns:
        A list of floats, one per input line, in file order.

    Raises:
        ValueError: If a line does not contain exactly ``n_fields`` fields.
            The original tuple-unpacking raised ValueError as well, but
            without saying which file or line was at fault.
    """
    distances = []
    with open(csv_path, 'r') as fin:
        for line_no, line in enumerate(tqdm(fin), start=1):
            fields = line.strip().split(',')
            if len(fields) != n_fields:
                raise ValueError(
                    '{}:{}: expected {} comma-separated fields, got {}'.format(
                        csv_path, line_no, n_fields, len(fields)))
            vec_q = model.get_sentence_vector(fields[1])
            vec_a = model.get_sentence_vector(fields[3])
            distances.append(ssd.cosine(vec_q, vec_a))
    return distances


# Training split: 5 fields per row (the last one is not used here).
train_cosine_list = _cosine_distances(out_path+'train.smaller.csv', w2v, 5)
pd.to_pickle(np.array(train_cosine_list),
             out_work_path+'train.cosine.fasttext.pkl')

# Validation split: same 5-field layout as the training file.
val_cosine_list = _cosine_distances(out_path+'val.csv', w2v, 5)
pd.to_pickle(np.array(val_cosine_list),
             out_work_path+'val.cosine.fasttext.pkl')

# Test split: only 4 fields per row, read from the input directory.
test_cosine_list = _cosine_distances(input_path+'test_final_part1.csv', w2v, 4)
pd.to_pickle(np.array(test_cosine_list),
             out_work_path+'test.cosine.fasttext.pkl')