-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess_books.py
42 lines (32 loc) · 1.28 KB
/
preprocess_books.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import json
import re
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
import random
import torch
def load_book_data(filepath):
    """Load a JSON file of book records and concatenate their English text.

    Parameters
    ----------
    filepath : str or path-like
        Path to a JSON file containing a list of objects, each with an
        'EN' key holding that book's English text.

    Returns
    -------
    str
        All 'EN' fields joined by single spaces, in file order.

    Raises
    ------
    KeyError
        If any record lacks an 'EN' key.
    """
    # Fix: pin the encoding so reads do not depend on the platform's
    # locale default (which breaks on non-UTF-8 Windows setups).
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return ' '.join(book['EN'] for book in data)
def preprocess_text(text, min_freq=50):
    """Clean raw text into a token list with rare words mapped to '<unk>'.

    Lowercases the text, replaces punctuation with spaces, removes NLTK
    English stopwords, and replaces any word seen at most ``min_freq``
    times with the '<unk>' token.

    Parameters
    ----------
    text : str
        Raw input text.
    min_freq : int
        A word must occur strictly more than this many times to be kept.

    Returns
    -------
    tuple
        (processed token list, Counter over the processed tokens,
        dict mapping each vocabulary word to an integer index).
    """
    # Lowercase, then turn punctuation into whitespace before splitting.
    tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()

    # Filter out English stopwords.
    stop_set = set(stopwords.words('english'))
    tokens = [tok for tok in tokens if tok not in stop_set]

    # Words occurring strictly more than min_freq times survive as-is.
    token_counts = Counter(tokens)
    kept = {tok for tok, n in token_counts.items() if n > min_freq}

    # Everything outside the kept set collapses to the '<unk>' token.
    processed = [tok if tok in kept else '<unk>' for tok in tokens]
    final_counts = Counter(processed)
    # Indices follow first-appearance order in the processed stream.
    vocab = {tok: idx for idx, tok in enumerate(final_counts)}
    return processed, final_counts, vocab
def prepare_training_data(words, window_size):
    """Pack a token sequence into fixed-length training windows.

    Parameters
    ----------
    words : sequence
        Token stream to split into windows.
    window_size : int
        Number of tokens per window (must be positive).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_windows, window_size); trailing tokens that
        do not fill a complete window are discarded.
    """
    arr = np.asarray(words)
    # Keep only the largest prefix that divides evenly into windows.
    usable = (len(arr) // window_size) * window_size
    return arr[:usable].reshape(-1, window_size)