# feature_extraction.py
#%%
import os
import pandas as pd
from nltk.corpus import stopwords
from pre_processing import preprocess
from fuzzywuzzy import fuzz
import distance  # the "distance" package, used here for lcsubstrings
from nltk.metrics import jaccard_distance
SAFE_DIV = 0.0001  # small constant that guards against division by zero
STOP_WORDS = set(stopwords.words("english"))  # set membership checks are O(1)
def get_token_features(q1, q2):
    """Compute ten token-level similarity features for a question pair."""
    token_features = [0.0] * 10
    # Split each question into tokens
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features
    # Non-stopword tokens in each question
    q1_words = set(word for word in q1_tokens if word not in STOP_WORDS)
    q2_words = set(word for word in q2_tokens if word not in STOP_WORDS)
    # Stopword tokens in each question
    q1_stops = set(word for word in q1_tokens if word in STOP_WORDS)
    q2_stops = set(word for word in q2_tokens if word in STOP_WORDS)
    # Counts of shared non-stopwords, stopwords, and tokens
    common_word_count = len(q1_words.intersection(q2_words))
    common_stop_count = len(q1_stops.intersection(q2_stops))
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    # Shared counts normalized by the smaller and larger set sizes
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    # Whether the last and first words of the two questions match
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    # Absolute difference in token counts
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    # Average token count of the two questions
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2
    return token_features
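# Illustrative example (the sample questions are made up, and the exact floats
# depend on NLTK's English stopword list, so values are approximate):
#   >>> get_token_features("how do i learn python", "how can i learn python fast")
#   [~1.0, ~0.67, ~0.67, ~0.67, ~0.8, ~0.67, 0, 1, 1, 5.5]
# "learn"/"python" are the shared non-stopwords and "how"/"i" the shared
# stopwords; the first words match (1) and the last words do not (0).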
def process_and_extract_features(file_path, rows_to_train):
    """Load the raw question pairs, engineer all features, and cache them."""
    # Reuse the cached feature file if it has already been written
    if os.path.isfile(file_path):
        return pd.read_csv(file_path, encoding='latin-1')
    data = pd.read_csv("data/train.csv")
    data = data[:rows_to_train]
    data.dropna(subset=['question1', 'question2'], inplace=True)
    # How often each question id occurs in the dataset
    data['freq_qid1'] = data.groupby('qid1')['qid1'].transform('count')
    data['freq_qid2'] = data.groupby('qid2')['qid2'].transform('count')
    # Character and word counts per question
    data['q1len'] = data['question1'].str.len()
    data['q2len'] = data['question2'].str.len()
    data['q1_n_words'] = data['question1'].apply(lambda q: len(q.split(" ")))
    data['q2_n_words'] = data['question2'].apply(lambda q: len(q.split(" ")))
    # Number of unique lowercased words shared by the two questions
    def word_common(row):
        w1 = set(word.lower().strip() for word in row['question1'].split(" "))
        w2 = set(word.lower().strip() for word in row['question2'].split(" "))
        return 1.0 * len(w1 & w2)
    data['word_Common'] = data.apply(word_common, axis=1)
    # Total number of unique words across both questions
    def word_total(row):
        w1 = set(word.lower().strip() for word in row['question1'].split(" "))
        w2 = set(word.lower().strip() for word in row['question2'].split(" "))
        return 1.0 * (len(w1) + len(w2))
    data['word_Total'] = data.apply(word_total, axis=1)
    # Shared words as a fraction of the combined vocabulary
    def word_share(row):
        w1 = set(word.lower().strip() for word in row['question1'].split(" "))
        w2 = set(word.lower().strip() for word in row['question2'].split(" "))
        return 1.0 * len(w1 & w2) / (len(w1) + len(w2))
    data['word_share'] = data.apply(word_share, axis=1)
    data['freq_q1+q2'] = data['freq_qid1'] + data['freq_qid2']
    data['freq_q1-q2'] = abs(data['freq_qid1'] - data['freq_qid2'])
    # Preprocess each question before computing token-level features
    data["question1"] = data["question1"].fillna("").apply(preprocess)
    data["question2"] = data["question2"].fillna("").apply(preprocess)
    print("token features...")
    # Unpack the ten token features into one column each
    token_features = data.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    data["cwc_min"] = list(map(lambda x: x[0], token_features))
    data["cwc_max"] = list(map(lambda x: x[1], token_features))
    data["csc_min"] = list(map(lambda x: x[2], token_features))
    data["csc_max"] = list(map(lambda x: x[3], token_features))
    data["ctc_min"] = list(map(lambda x: x[4], token_features))
    data["ctc_max"] = list(map(lambda x: x[5], token_features))
    data["last_word_eq"] = list(map(lambda x: x[6], token_features))
    data["first_word_eq"] = list(map(lambda x: x[7], token_features))
    data["abs_len_diff"] = list(map(lambda x: x[8], token_features))
    data["mean_len"] = list(map(lambda x: x[9], token_features))
    print("fuzzy features...")
    data["token_set_ratio"] = data.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    data["token_sort_ratio"] = data.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    data["fuzz_ratio"] = data.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    data["fuzz_partial_ratio"] = data.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    data["longest_substr_ratio"] = data.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    print("adding additional features...")
    data['ratio_q_lengths'] = data.apply(lambda row: ratio_of_question_lengths(row['question1'], row['question2']), axis=1)
    data['common_prefix'] = data.apply(lambda row: common_prefix(row['question1'], row['question2']), axis=1)
    data['common_suffix'] = data.apply(lambda row: common_suffix(row['question1'], row['question2']), axis=1)
    data['diff_words'] = data.apply(lambda row: abs(row['q1_n_words'] - row['q2_n_words']), axis=1)
    data['diff_chars'] = data.apply(lambda row: abs(len(str(row['question1'])) - len(str(row['question2']))), axis=1)
    data['jaccard_similarity'] = data.apply(lambda row: jaccard_similarity(row['question1'], row['question2']), axis=1)
    data['longest_common_subsequence'] = data.apply(lambda row: longest_common_subsequence(row['question1'], row['question2']), axis=1)
    # Cache the engineered features so future runs can skip extraction
    data.to_csv(file_path, index=False)
    return data
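# Example usage (a sketch; the cache path and row count below are assumptions,
# not fixed anywhere in this module):
#   features = process_and_extract_features("data/nlp_features_train.csv", 5000)
#   print(features[["cwc_min", "ctc_max", "token_set_ratio", "word_share"]].head())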
def ratio_of_question_lengths(q1, q2):
    """Ratio of the character lengths of the two questions."""
    len_q1 = len(str(q1))
    len_q2 = len(str(q2))
    if len_q2 == 0:
        return 0.0
    return len_q1 / len_q2
def common_prefix(q1, q2):
    """Length of the longest common prefix, in characters."""
    i = 0
    while i < min(len(q1), len(q2)) and q1[i] == q2[i]:
        i += 1
    return i
def common_suffix(q1, q2):
    """Length of the longest common suffix, in characters."""
    i, j = len(q1) - 1, len(q2) - 1
    while i >= 0 and j >= 0 and q1[i] == q2[j]:
        i -= 1
        j -= 1
    return len(q1) - i - 1
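# Illustrative examples (made-up strings; both functions count characters):
#   >>> common_prefix("what is data science", "what is deep learning")
#   9   # "what is d"
#   >>> common_suffix("machine learning", "deep learning")
#   9   # " learning"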
def jaccard_similarity(q1, q2):
    """Jaccard similarity between the token sets of the two questions."""
    q1_tokens = set(q1.split())
    q2_tokens = set(q2.split())
    if not q1_tokens or not q2_tokens:
        return 0.0
    # nltk returns a distance, so convert it to a similarity
    return 1.0 - jaccard_distance(q1_tokens, q2_tokens)
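# Illustrative example (made-up strings): the token sets share 2 of 4 unique
# words, so the similarity is 2/4:
#   >>> jaccard_similarity("learn python fast", "learn python well")
#   0.5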
def longest_common_subsequence(q1, q2):
    """Length of the longest common character subsequence (dynamic programming)."""
    m, n = len(q1), len(q2)
    # dp[i][j] holds the LCS length of q1[:i] and q2[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if q1[i - 1] == q2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[m][n]
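# Illustrative example (made-up strings): the LCS here is character-level,
# e.g. "ace" is the longest subsequence common to both strings:
#   >>> longest_common_subsequence("abcde", "ace")
#   3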
def get_longest_substr_ratio(a, b):
    """Longest common substring length, scaled by the shorter question length."""
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    # lcsubstrings returns every longest common substring; all share one length,
    # and the +1 in the denominator avoids division by zero on empty strings
    return len(strs[0]) / (min(len(a), len(b)) + 1)
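# Illustrative example (made-up strings; requires the `distance` package): the
# longest common substring of the pair below is "apple " (6 chars) and the
# shorter string has 9 chars, so the ratio is 6 / (9 + 1) = 0.6:
#   >>> get_longest_substr_ratio("apple pie", "apple tart")
#   0.6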
#%%