# -*- coding: utf-8 -*-
"""Contact preprocessing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1tqs1WFeRybwTIjTiAzqk6pUTBlImsvtM
"""
#libraries and initial settings
!pip install tqdm
from tqdm import tqdm
import os
from collections import defaultdict
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
""" needed sent_tokenize too because word2vec wants to be trained with a list of sentences. it also accepts sentences as list so sent_tokenize splits each sentence as a sublist"""
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('turkish'))
!pip install stanza
import stanza
stanza.download('tr')
""" Stanza's turkish model was extremely helpful for this task to get the PoS-tag and lemma of each word; however, it appears to that it is kinda slow since the preprocessing took 5 hours."""
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
""" to train the model """
#!gdown 1N27pZ9u70Sqc6O0OVkAUq7YLu3IApm_8 # downloads the corpus from Google Drive; this only works with access to the private drive.
"""
The code below reads the corpus and saves it as the string 'wiki_tr_corpus'. tqdm and os are used to show the loading progress.
"""
wiki_tr = "/content/wiki.tr.txt"
file_size = os.path.getsize(wiki_tr)
with tqdm(total=file_size, unit='B', unit_scale=True, desc='Loading the corpus') as pbar:
    with open(wiki_tr, 'r') as f:
        wiki_tr_corpus = f.read()
        pbar.update(len(wiki_tr_corpus.encode('utf-8')))
def is_verb(pos):
    """
    Checks whether a word is a verb: returns True if its UPOS tag starts with 'V' (VERB), otherwise False.
    """
    return pos.startswith('V')
def find_last_vowel(lemma):
    """
    Finds the last vowel of a lemma. Returns None if the lemma contains no vowel.
    """
    vowels = ['a', 'e', 'ı', 'i', 'o', 'ö', 'u', 'ü']
    last_vowel = None  # avoids an undefined variable if the lemma has no vowel
    for i in range(len(lemma)-1, -1, -1):
        if lemma[i] in vowels:
            last_vowel = lemma[i]
            break  # exit the loop after finding the last vowel
    return last_vowel
def add_infinitival_marker(lemma, last_vowel):
    """
    Takes a lemma and its last vowel and, following vowel harmony, appends the Turkish infinitival marker (-mak/-mek) to verbal lemmas.
    """
    front_vowels = ['e', 'i', 'ü', 'ö']
    back_vowels = ['a', 'ı', 'u', 'o']
    if last_vowel in back_vowels:
        return lemma + 'mak'
    elif last_vowel in front_vowels:
        return lemma + 'mek'
    else:
        return lemma
nlp = stanza.Pipeline('tr', processors='tokenize,mwt,pos,lemma', use_gpu=False) # stanza pipeline for Turkish
def preprocessing(corpus):
    """
    Takes the corpus and returns sentences as lists of lemmas, with infinitival markers added to verbs.
    """
    sentences = []
    raw_sentences = sent_tokenize(corpus)  # splits the corpus at sentence boundaries
    for raw_sentence in raw_sentences:
        tokens = nltk.word_tokenize(raw_sentence)  # tokenizes each sentence
        cleaned_tokens = [token.lower() for token in tokens if token.lower() not in stop_words]  # lowercases the tokens and removes stop words
        cleaned_words = [token for token in cleaned_tokens if len(token) > 1 and (token.isalpha() or token.isnumeric())]  # keeps only tokens longer than one character that are purely alphabetic or numeric
        doc = nlp(' '.join(cleaned_words))  # applies the Stanza pipeline to the cleaned words
        lemma_sentence = []  # stores the lemmatized sentence
        for sentence in doc.sentences:
            for word in sentence.words:
                if is_verb(word.upos):
                    last_vowel = find_last_vowel(word.lemma)  # finds the last vowel of the verbal lemma
                    if last_vowel:
                        lemma_with_marker = add_infinitival_marker(word.lemma, last_vowel)
                        lemma_sentence.append(lemma_with_marker)
                else:
                    lemma_sentence.append(word.lemma)
        if lemma_sentence:
            sentences.append(lemma_sentence)
    return sentences
#control tests
word_list = ["deneme", "yapalım", "test", "türkiye", "ölürüm"]
verb_lemma_list = ["yap", "öl"]
for word in word_list:
    doc = nlp(word)
    upos = doc.sentences[0].words[0].upos
    result = is_verb(upos)
    print(f"{word}: {result}")
for word in word_list:
    print(f"{word} ---> {find_last_vowel(word)}")
for lemma in verb_lemma_list:
    print(f"{lemma} ---> {add_infinitival_marker(lemma, find_last_vowel(lemma))}")
preprocessing("deneme yapalım bakalım. test deneme türkiye ölürüm sana. vazgeçtim belki de ölmem")
"""
The code below looks somewhat complex, but it provides a useful backup system in case Colab crashes during processing.
"""
import pickle
batch_size = 100000
batches = [wiki_tr_corpus[i:i+batch_size] for i in range(0, len(wiki_tr_corpus), batch_size)]
normal_corpus = []
start_from_batch = 0  # index of the first unprocessed batch
saved_batches = []    # lemmatized sentences processed so far
try:
    # Load the checkpoint if available; it stores the next batch index together with the processed sentences
    with open("saved_batches.pkl", "rb") as f:
        start_from_batch, saved_batches = pickle.load(f)
    normal_corpus.extend(saved_batches)
except FileNotFoundError:
    pass
for i in tqdm(range(start_from_batch, len(batches)), desc='Processing Batches'):
    batch = batches[i]
    batch_lemmas = preprocessing(batch)
    normal_corpus.extend(batch_lemmas)
    # Save a checkpoint after each processed batch
    saved_batches.extend(batch_lemmas)
    with open("saved_batches.pkl", "wb") as f:
        pickle.dump((i + 1, saved_batches), f)
# Clean up the checkpoint file after all processing is complete
if os.path.exists("saved_batches.pkl"):
    os.remove("saved_batches.pkl")
# Postprocessing
""" Postprocessing is needed because suffixes after apostrophes were not removed and ended up as separate tokens.
Some English words also remain in the corpus; they are removed by checking each word against NLTK's English word list. """
""" The following lines check for non-words and unintended tokens such as pronouns.
The resulting list is then used to filter them out of the corpus. """
tokens = [token for sentence in normal_corpus for token in sentence if len(token) > 1] # necessary since normal_corpus has many sublists
token_count = Counter(tokens) # counts the occurrences of each token
sorted_tokens = sorted(token_count.items(), key=lambda x: x[1], reverse=True) # sorts the tokens by frequency
sorted_tokens = [token for token in sorted_tokens if token[1] >= 5] # removes the tokens whose count is below 5
suspected_affixes = []
for token, count in sorted_tokens:
    if len(token) < 4:
        suspected_affixes.append((token, count))
suspected_non_words = []
for token, count in suspected_affixes:
    doc = nlp(token)
    if doc.sentences[0].words[0].upos not in ["NOUN", "VERB", "ADV", "ADJ", "DET", "NUM"]:
        suspected_non_words.append(token)
# to finally remove unintended and pseudo words
sorted_tokens = [(token, count) for token, count in sorted_tokens if token not in suspected_non_words]
# to remove english words
nltk.download('words')
english_words = set(nltk.corpus.words.words())
sorted_tokens = [(token, count) for token, count in sorted_tokens if token not in english_words]
sorted_tokens[100:200] # inspects a slice of the frequency list (displayed as cell output in Colab)
""" I realized I need to filter the same words from the corpus too :D """
normal_corpus = [
    [token for token in sentence if token not in suspected_non_words]
    for sentence in normal_corpus
]
normal_corpus = [
    [token for token in sentence if token.lower() not in english_words]
    for sentence in normal_corpus
]
""" This code saves the lemma_corpus as pickle file. I preferred pickle way over built-in one since pickle keeps the sublist hierarchy. """
output_file = '/content/lemma_corpus.pkl'
with open(output_file, 'wb') as file:
    pickle.dump(normal_corpus, file)
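""" A minimal sketch (not part of the original pipeline) of reloading the saved corpus in a later session;
it assumes the file is still available at '/content/lemma_corpus.pkl'. """
with open('/content/lemma_corpus.pkl', 'rb') as file:
    reloaded_corpus = pickle.load(file)  # restores the list-of-sentence-sublists structure
print(len(reloaded_corpus) == len(normal_corpus))  # sanity check: should print True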
""" This code saves the sorted_lemmas """
output_file = '/content/sorted_lemmas.pkl'
with open(output_file, "wb") as file:
pickle.dump(sorted_tokens, file)
"""
Parameters for the word2vec model
"""
import multiprocessing
# Set values for various parameters
num_features = 300 # Word vector dimensionality
min_word_count = 5 # Minimum word count
num_workers = multiprocessing.cpu_count() # Number of threads to run in parallel
context = 5 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
# Initialize and train the model (this will take some time)
print("Training model...")
model_300 = word2vec.Word2Vec(normal_corpus,
workers = num_workers,
vector_size = num_features,
min_count = min_word_count,
window = context,
sample = downsampling
)
model_300.save('wiki_tr.embedding_300') # to save the model
# An example of the model's performance
word = "inşa"
similar_words = model_300.wv.most_similar(word)
print(f"Words most similar to '{word}':")
for similar_word, similarity in similar_words:
    print(f"{similar_word}: {similarity}")
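""" A minimal sketch (an addition, not in the original notebook) of reloading the saved model in a later
evaluation session; gensim's Word2Vec.load restores the full model from the file written above. """
reloaded_model = word2vec.Word2Vec.load('wiki_tr.embedding_300')
print(reloaded_model.wv.most_similar("inşa", topn=3))  # should match the results from model_300 above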
"""The following two lines compare the tests of the model with a turkish model trained on the whole wikipedia."""
" Scores from my model"
model_300.wv.most_similar(positive=['gotik'], topn = 5)
""" Scores from the pre-trained turkish model """
#word_vectors.most_similar(positive=['gotik'], topn = 5)
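""" The pre-trained Turkish model is not included here, so 'word_vectors' above is left undefined. A hedged
sketch of how it might be loaded with gensim, assuming a word2vec-format vector file (the path
'pretrained_tr_wiki.vec' is a placeholder, not the actual file used): """
# from gensim.models import KeyedVectors
# word_vectors = KeyedVectors.load_word2vec_format('pretrained_tr_wiki.vec', binary=False)
# word_vectors.most_similar(positive=['gotik'], topn=5)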