# -*- coding: utf-8 -*-
"""Contact preprocessing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1tqs1WFeRybwTIjTiAzqk6pUTBlImsvtM
"""
#libraries and initial settings
!pip install tqdm
from tqdm import tqdm
import os
from collections import defaultdict
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
""" needed sent_tokenize too because word2vec wants to be trained with a list of sentences. it also accepts sentences as list so sent_tokenize splits each sentence as a sublist"""
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('turkish'))
!pip install stanza
import stanza
stanza.download('tr')
""" Stanza's turkish model was extremely helpful for this task to get the PoS-tag and lemma of each word; however, it appears to that it is kinda slow since the preprocessing took 5 hours."""
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
""" to train the model """
#!gdown 1N27pZ9u70Sqc6O0OVkAUq7YLu3IApm_8 # downloads the corpus from Google Drive; this only works with access to the private drive.
"""
The code below reads the corpus and saves it as the string 'wiki_tr_corpus'. tqdm and os are used to show the loading progress.
"""
wiki_tr = "/content/wiki.tr.txt"
file_size = os.path.getsize(wiki_tr)
with tqdm(total=file_size, unit='B', unit_scale=True, desc='Loading the corpus') as pbar:
    with open(wiki_tr, 'r') as f:
        wiki_tr_corpus = f.read()
        pbar.update(len(wiki_tr_corpus.encode('utf-8')))
def is_verb(pos):
    """
    Checks whether a word is a verb: returns True if its UPOS tag starts with 'V' (VERB), otherwise False.
    """
    return pos.startswith('V')
def find_last_vowel(lemma):
    """
    Finds the last vowel of a lemma. Returns None if the lemma contains no vowel.
    """
    vowels = ['a', 'e', 'ı', 'i', 'o', 'ö', 'u', 'ü']
    last_vowel = None  # avoids an undefined variable if the lemma has no vowel
    for i in range(len(lemma)-1, -1, -1):
        if lemma[i] in vowels:
            last_vowel = lemma[i]
            break  # exit the loop after finding the last vowel
    return last_vowel
def add_infinitival_marker(lemma, last_vowel):
    """
    Takes a lemma and its last vowel and, following vowel harmony, appends the Turkish infinitival marker (-mak/-mek) to verbal lemmas.
    """
    front_vowels = ['e', 'i', 'ü', 'ö']
    back_vowels = ['a', 'ı', 'u', 'o']
    if last_vowel in back_vowels:
        return lemma + 'mak'
    elif last_vowel in front_vowels:
        return lemma + 'mek'
    else:
        return lemma
nlp = stanza.Pipeline('tr', processors='tokenize,mwt,pos,lemma', use_gpu=False) # stanza pipeline for Turkish
def preprocessing(corpus):
    """
    Takes the corpus and returns sentences as lists of lemmas, with infinitival markers added to verbs.
    """
    sentences = []
    raw_sentences = sent_tokenize(corpus)  # splits the corpus at sentence boundaries
    for raw_sentence in raw_sentences:
        tokens = nltk.word_tokenize(raw_sentence)  # tokenizes each sentence
        cleaned_tokens = [token.lower() for token in tokens if token.lower() not in stop_words]  # lowercases the tokens and removes stop words
        cleaned_words = [token for token in cleaned_tokens if len(token) > 1 and (token.isalpha() or token.isnumeric())]  # keeps only tokens longer than one character that are purely alphabetic or numeric
        doc = nlp(' '.join(cleaned_words))  # applies the Stanza pipeline to the cleaned words
        lemma_sentence = []  # stores the lemmatized sentence
        for sentence in doc.sentences:
            for word in sentence.words:
                if is_verb(word.upos):
                    last_vowel = find_last_vowel(word.lemma)  # finds the last vowel of the verbal lemma
                    if last_vowel:
                        lemma_with_marker = add_infinitival_marker(word.lemma, last_vowel)
                        lemma_sentence.append(lemma_with_marker)
                else:
                    lemma_sentence.append(word.lemma)
        if lemma_sentence:
            sentences.append(lemma_sentence)
    return sentences
#control tests
word_list = ["deneme", "yapalım", "test", "türkiye", "ölürüm"]
verb_lemma_list = ["yap", "öl"]
for word in word_list:
    doc = nlp(word)
    upos = doc.sentences[0].words[0].upos
    result = is_verb(upos)
    print(f"{word}: {result}")
for word in word_list:
    print(f"{word} ---> {find_last_vowel(word)}")
for lemma in verb_lemma_list:
    print(f"{lemma} ---> {add_infinitival_marker(lemma, find_last_vowel(lemma))}")
preprocessing("deneme yapalım bakalım. test deneme türkiye ölürüm sana. vazgeçtim belki de ölmem")
"""
The code below looks somewhat complex, but it provides a useful backup system in case Colab crashes during processing.
"""
import pickle
batch_size = 100000
batches = [wiki_tr_corpus[i:i+batch_size] for i in range(0, len(wiki_tr_corpus), batch_size)]
normal_corpus = []
start_from_batch = 0  # index of the first unprocessed batch
saved_batches = []    # lemmatized sentences processed so far
try:
    # Load the checkpoint if available; it stores the next batch index together with the processed sentences
    with open("saved_batches.pkl", "rb") as f:
        start_from_batch, saved_batches = pickle.load(f)
    normal_corpus.extend(saved_batches)
except FileNotFoundError:
    pass
for i in tqdm(range(start_from_batch, len(batches)), desc='Processing Batches'):
    batch = batches[i]
    batch_lemmas = preprocessing(batch)
    normal_corpus.extend(batch_lemmas)
    # Save a checkpoint after each processed batch
    saved_batches.extend(batch_lemmas)
    with open("saved_batches.pkl", "wb") as f:
        pickle.dump((i + 1, saved_batches), f)
# Clean up the checkpoint file after all processing is complete
if os.path.exists("saved_batches.pkl"):
    os.remove("saved_batches.pkl")
# Postprocessing
""" Postprocessing is needed because suffixes after apostrophes were not removed and ended up as separate tokens.
Some English words also remain in the corpus; they are removed by checking each word against NLTK's English word list. """
""" The following lines check for non-words and unintended tokens such as pronouns.
The resulting list is then used to filter them out of the corpus. """
tokens = [token for sentence in normal_corpus for token in sentence if len(token) > 1] # necessary since normal_corpus has many sublists
token_count = Counter(tokens) # counts the occurrences of each token
sorted_tokens = sorted(token_count.items(), key=lambda x: x[1], reverse=True) # sorts the tokens by frequency
sorted_tokens = [token for token in sorted_tokens if token[1] >= 5] # removes the tokens whose count is below 5
suspected_affixes = []
for token, count in sorted_tokens:
    if len(token) < 4:
        suspected_affixes.append((token, count))
suspected_non_words = []
for token, count in suspected_affixes:
    doc = nlp(token)
    if doc.sentences[0].words[0].upos not in ["NOUN", "VERB", "ADV", "ADJ", "DET", "NUM"]:
        suspected_non_words.append(token)
# to finally remove unintended and pseudo words
sorted_tokens = [(token, count) for token, count in sorted_tokens if token not in suspected_non_words]
# to remove english words
nltk.download('words')
english_words = set(nltk.corpus.words.words())
sorted_tokens = [(token, count) for token, count in sorted_tokens if token not in english_words]
sorted_tokens[100:200] # inspects a slice of the frequency list (displayed as cell output in Colab)
""" I realized I need to filter the same words from the corpus too :D """
normal_corpus = [
    [token for token in sentence if token not in suspected_non_words]
    for sentence in normal_corpus
]
normal_corpus = [
    [token for token in sentence if token.lower() not in english_words]
    for sentence in normal_corpus
]
""" This code saves the lemma_corpus as pickle file. I preferred pickle way over built-in one since pickle keeps the sublist hierarchy. """
output_file = '/content/lemma_corpus.pkl'
with open(output_file, 'wb') as file:
    pickle.dump(normal_corpus, file)
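""" A minimal sketch (not part of the original pipeline) of reloading the saved corpus in a later session;
it assumes the file is still available at '/content/lemma_corpus.pkl'. """
with open('/content/lemma_corpus.pkl', 'rb') as file:
    reloaded_corpus = pickle.load(file)  # restores the list-of-sentence-sublists structure
print(len(reloaded_corpus) == len(normal_corpus))  # sanity check: should print True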
""" This code saves the sorted_lemmas """
output_file = '/content/sorted_lemmas.pkl'
with open(output_file, "wb") as file:
pickle.dump(sorted_tokens, file)
"""
Parameters for the word2vec model
"""
import multiprocessing
# Set values for various parameters
num_features = 300 # Word vector dimensionality
min_word_count = 5 # Minimum word count
num_workers = multiprocessing.cpu_count() # Number of threads to run in parallel
context = 5 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
# Initialize and train the model (this will take some time)
print("Training model...")
model_300 = word2vec.Word2Vec(normal_corpus,
workers = num_workers,
vector_size = num_features,
min_count = min_word_count,
window = context,
sample = downsampling
)
model_300.save('wiki_tr.embedding_300') # to save the model
# An example of the model's performance
word = "inşa"
similar_words = model_300.wv.most_similar(word)
print(f"Words most similar to '{word}':")
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity}")
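""" A minimal sketch (an addition, not in the original notebook) of reloading the saved model in a later
evaluation session; gensim's Word2Vec.load restores the full model from the file written above. """
reloaded_model = word2vec.Word2Vec.load('wiki_tr.embedding_300')
print(reloaded_model.wv.most_similar("inşa", topn=3))  # should match the results from model_300 above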
"""The following two lines compare the tests of the model with a turkish model trained on the whole wikipedia."""
" Scores from my model"
model_300.wv.most_similar(positive=['gotik'], topn = 5)
""" Scores from the pre-trained turkish model """
#word_vectors.most_similar(positive=['gotik'], topn = 5)
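""" The pre-trained Turkish model is not included here, so 'word_vectors' above is left undefined. A hedged
sketch of how it might be loaded with gensim, assuming a word2vec-format vector file (the path
'pretrained_tr_wiki.vec' is a placeholder, not the actual file used): """
# from gensim.models import KeyedVectors
# word_vectors = KeyedVectors.load_word2vec_format('pretrained_tr_wiki.vec', binary=False)
# word_vectors.most_similar(positive=['gotik'], topn=5)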