#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import codecs
import regex as re
import os
import csv
import local_settings  # imported for its side effects (presumably configures Django settings before django.setup())
import django
from Constants import ALL_CLEAN_DOCS_FILENAME, DICTA_HEBREW_WIKI_FILENAME, DICTA_SEFARIA_FILENAME, HEBREW_WIKI
django.setup()
from sefaria.model import *
from sefaria.system.exceptions import InputError
import hebrew_spellcheck
word_expander = hebrew_spellcheck.word_expander
stopwords = codecs.open('./hebrew_stopwords.txt', encoding='utf8').read().strip().split('\n')
# match a whole stopword, preceded by whitespace/start and followed by whitespace/end
stopwords_regex = re.compile(u"(?:\s|^)({})(?=\s|$)".format(u"|".join(stopwords)))
phrases = codecs.open('./select_phrases.txt', encoding='utf8').read().strip().split('\n')
# capture the leading whitespace (group 1) and the phrase itself (group 2)
phrases_regex = re.compile(u"(\s|^)({})(?=\s|$)".format(u"|".join(phrases)))
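# Hypothetical illustration of the stopword regex: if hebrew_stopwords.txt
# contained the word "foo", remove_stopwords(u"bar foo baz") would drop the
# "foo" token (any leftover whitespace is collapsed later in the pipeline).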
def get_talmud_topic_ranged():
"""
    Reads every file in the sugyot directory.
    Each file contains semantically meaningful ranged refs for the Talmud
    :return: List of all Talmudic ranged refs
"""
all_gemara_topic_ranges = []
for filename in os.listdir('./sugyot/'):
with codecs.open('./sugyot/' + filename, 'rb', encoding='utf8') as csvfile:
reader = csv.reader(csvfile)
            reader.next()  # skip the header row
all_gemara_topic_ranges += [row[0] for row in reader]
return all_gemara_topic_ranges
def create_list_of_talmud_books():
    """
    Returns the set of Talmud tractate titles used by this script:
    a hardcoded, modified subset of Sefaria's "Bavli" category.
    :return: Set of all Talmud book titles
    """
talmud_books_modified = {u'Avodah Zarah', u'Bava Batra', u'Bava Kamma',
u'Bava Metzia', u'Beitzah', u'Berakhot', u'Chagigah', u'Eruvin', u'Gittin', u'Horayot',
u'Ketubot', u'Kiddushin', u'Makkot', u'Megillah', u'Menachot', u'Moed Katan',
u'Nazir', u'Nedarim', u'Pesachim', u'Rosh Hashanah', u'Sanhedrin', u'Shabbat', u'Shevuot',
u'Sotah', u'Sukkah', u'Taanit', u'Yevamot', u'Yoma', u'Zevachim'}
return talmud_books_modified
def create_list_of_tanakh_books():
    """
    Returns the set of Tanakh book titles used by this script:
    a hardcoded, modified subset of Sefaria's "Tanakh" category.
    :return: Set of all Tanakh book titles
    """
tanakh_books_modified = set([u'Judges', u'Deuteronomy', u'Genesis', u'Exodus',
u'Leviticus', u'II Kings', u'Joshua',
u'I Samuel', u'Numbers', u'I Kings', u'II Samuel'])
return tanakh_books_modified
def get_tanakh_topic_ranges():
"""
Creates semantically meaningful ranged refs for Tanakh using Herzog's breakdown
:return: List of all Tanakh Ranged Refs
"""
with codecs.open('level_3_wo_overlaps.json', 'r', encoding='utf8') as the_file:
segs = json.load(the_file, encoding='utf8')
tanakh_topic_ranged_refs = []
for seg in segs:
        b_ref = seg['b_ref']
        e_ref = seg['e_ref']
        book, begin = b_ref.rsplit(' ', 1)
        b_chapter, b_verse = begin.split(':')
        e_chapter, e_verse = e_ref.rsplit(' ', 1)[1].split(':')
verses = "{}:{}-{}".format(b_chapter, b_verse, e_verse) if b_chapter == e_chapter else "{}:{}-{}:{}".format(
b_chapter, b_verse, e_chapter, e_verse)
tanakh_topic_ranged_refs.append("{} {}".format(book, verses))
return tanakh_topic_ranged_refs
def segment_range_dicts(topic_ranged_refs):
"""
Creates two dictionaries. These are mappings between ranged refs and segments refs.
This receives a list of ranged refs. It iterates over every ranged ref and creates a mapping between the ranged refs and the segment refs.
:param topic_ranged_refs: list of ranged refs
:return: tuple of dicts. The first is dict mapping ranged refs to all segment refs. The second is a dict mapping each segment ref to its corresponding ranged ref.
"""
ranged_to_segment = {}
segment_to_ranged = {}
for topic_ranged_ref in topic_ranged_refs:
ranged_to_segment[topic_ranged_ref] = {}
topic_seg_refs = Ref(topic_ranged_ref).range_list()
for seg_ref in topic_seg_refs:
ranged_to_segment[topic_ranged_ref][seg_ref.normal()] = ""
segment_to_ranged[seg_ref.normal()] = topic_ranged_ref
return ranged_to_segment, segment_to_ranged
def is_from_category(ref, books_in_category):
"""
Checks to see if a tref belongs to a list of book within a particular category
For Example: Is Genesis 12:3 within Tanakh
:param ref: tref to be examined
:param books_in_category: List of books in category
:return: Boolean Value determining if Tref is within category
"""
try:
if Ref(ref).index.title in books_in_category:
return True
else:
return False
except InputError:
return False
def pull_out_suffix(string):
"""
Receives a string of Hebrew Text. Iterates over every hebrew word and splits each
word into its root and suffix.
:param string: String of Hebrew Text
:return: String with Hebrew compound words split into Root and Suffix
"""
string = string.split()
string = ' '.join([word_expander.get(word, word) for word in string])
return string
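# Sketch of the expected word_expander behavior (hypothetical entry): if
# word_expander contained {u'abc': u'ab c'}, then pull_out_suffix(u'abc xyz')
# would return u'ab c xyz'; words with no entry pass through unchanged.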
def remove_stopwords(string):
"""
    Replaces each stopword (and its leading whitespace) with a single space
:param string: String of Hebrew Text
:return: String without stopwords
"""
return re.sub(stopwords_regex, u' ', string)
def remove_dicta_prefix(string, marker):
    """
    Removes prefixes that were detected by Dicta. Dicta attaches a detected
    prefix to the following word with a marker character; this strips the
    prefix letters together with the marker.
    :param string: String of Hebrew text
    :param marker: Regex snippet matching the marker that separates a prefix from its word
    :return: String without prefixes
    """
    return re.sub(ur'[\u05d0-\u05ea]+{}'.format(marker), u'', string)
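# e.g. remove_dicta_prefix(u'ה┉שמים', u'┉') == u'שמים': the Hebrew prefix
# letters and the marker are stripped, leaving the bare word.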
def remove_punctuation(data):
    """
    Removes various punctuation and markup from Hebrew text.
    :param data: String of Hebrew text
    :return: String without punctuation
    """
    data = re.sub(ur'־', u' ', data)                       # maqaf (Hebrew hyphen)
    data = re.sub(ur'\([^)]+\)', u' ', data)               # parenthesized text
    data = re.sub(ur'<[^>]+>', u' ', data)                 # HTML tags
    data = re.sub(ur'\[[^\]]+\]', u' ', data)              # bracketed text
    data = re.sub(ur'[^ \u05d0-\u05ea"\'״׳]', u' ', data)  # anything that is not a space, Hebrew letter, or quote mark
    data = re.sub(ur'(^|\s)["\'״׳]+', u' ', data)          # quote marks at the start of a word
    data = re.sub(ur'["\'״׳]+(\s|$)', u' ', data)          # quote marks at the end of a word (mid-word quotes mark abbreviations and are kept)
    return data
def strip_stopwords_and_remove_punctuation(data):
"""
This method takes a string of hebrew text and does all necessary cleaning for Word2Vec.
:param data: String of Hebrew Text
:return: String ready for Word2Vec model
"""
data = data.strip().split(u'~~')[1]
data = remove_dicta_prefix(data, u"┉")
data = remove_punctuation(data)
data = pull_out_suffix(data)
data = remove_stopwords(data)
data = u' '.join(data.split())
return data
def create_multiple_word_phrases(data):
"""
Combines selected multiple word phrases with underscore.
For example: New York ----> New_York
This allows Word2Vec to handle multiple word phrases
:param data: String of Hebrew Text
:return: String with connected multiple word phrases
"""
def connect_with_underscore(matchobj):
return matchobj.group(1) + matchobj.group(2).replace(u" ", u"_")
return re.sub(phrases_regex, connect_with_underscore, data)
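# Hypothetical illustration: if select_phrases.txt contained the line "new york",
# create_multiple_word_phrases(u"in new york today") would return u"in new_york today".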
def this_is_a_bad_line(data):
"""
Checks to see if a line from the file should be included in the Word2Vec model.
There are some lines within the Dicta Prefix file that we cannot or do not want to include.
:param data: line of hebrew text from Dicta File
:return: Boolean Value determining if this line valid
"""
if u'~~' not in data:
return True
if data.strip().split(u'~~')[1].strip().startswith(u"<br><br><big><strong>הדרן עלך"):
return True
return False
def extract_reference(data):
"""
Extract Sefaria Ref from a line in the Dicta Prefix file
:param data: A line from the dicta file
:return: The Corresponding tref
"""
return data.split(u'~')[0]
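# e.g. extract_reference(u"Genesis 1:5~~<Hebrew text>") == u"Genesis 1:5";
# splitting on a single u'~' also handles the u'~~' separator, since the ref
# is everything before the first tilde.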
def concatenate_semantically_linked_segments(topic_ranged_refs, ranged_to_segment):
    """
    Combines multiple Sefaria segments into one larger segment based on semantic meaning
    :param topic_ranged_refs: List of ranged trefs that define the semantic separation
    :param ranged_to_segment: Nested dict. The outer layer maps each ranged ref to its sub-segment refs; the inner dict maps each sub-segment ref to its text
    :return: Dict mapping each semantically defined ranged ref to its concatenated text
    """
semantic_linked_segments = {}
for text_ranged_ref in topic_ranged_refs:
object_ref = Ref(text_ranged_ref)
all_text_subrefs = [seg_ref.normal() for seg_ref in object_ref.range_list()]
all_verses = [ranged_to_segment[text_ranged_ref][seg_ref] for seg_ref in all_text_subrefs]
semantic_linked_segments[text_ranged_ref] = u' '.join(all_verses)
return semantic_linked_segments
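# Illustration with hypothetical texts: if ranged_to_segment[u'Genesis 1:1-2']
# were {u'Genesis 1:1': u'text1', u'Genesis 1:2': u'text2'}, the result would
# map u'Genesis 1:1-2' to u'text1 text2' (joined in range order).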
def get_segments(filename):
"""
Combs through the entire Sefarias Hebrew Library and cleans the text for Doc2Vec.
Creates a dict:
Key: Ref
Value: The text of that ref cleaned and ready for Doc2Vec
:param filename: Dicta Prefix Filename
:return: Dict for Doc2Vec
"""
    all_tanakh_books = create_list_of_tanakh_books()
    all_talmud_books = create_list_of_talmud_books()
tanakh_topic_ranged_refs = get_tanakh_topic_ranges()
talmud_topic_ranged_refs = get_talmud_topic_ranged()
tanakh_ranged_to_segment, tanakh_segment_to_ranged = segment_range_dicts(tanakh_topic_ranged_refs)
talmud_ranged_to_segment, talmud_segment_to_ranged = segment_range_dicts(talmud_topic_ranged_refs)
all_data = {}
for index, data in enumerate(codecs.open(filename, encoding='utf8')):
if this_is_a_bad_line(data):
continue
ref = extract_reference(data)
data = strip_stopwords_and_remove_punctuation(data)
data = create_multiple_word_phrases(data)
        if index % 100000 == 0:
            print index  # progress indicator
        if is_from_category(ref, all_tanakh_books):
            # stash the cleaned segment under its ranged ref for later concatenation
            this_ranged_ref = tanakh_segment_to_ranged[ref]
            tanakh_ranged_to_segment[this_ranged_ref][ref] = data
        elif is_from_category(ref, all_talmud_books):
            this_ranged_ref = talmud_segment_to_ranged[ref]
            talmud_ranged_to_segment[this_ranged_ref][ref] = data
        else:
            all_data[ref] = data
    all_data.update(concatenate_semantically_linked_segments(tanakh_topic_ranged_refs, tanakh_ranged_to_segment))
    all_data.update(concatenate_semantically_linked_segments(talmud_topic_ranged_refs, talmud_ranged_to_segment))
return all_data
def get_wiki_segs(filename):
    """
    Cleans the Dicta Hebrew Wikipedia file for Doc2Vec.
    Wikipedia lines have no Sefaria ref, so each line is keyed by a synthetic
    "Random {index}" ref.
    :param filename: Dicta Hebrew Wikipedia filename
    :return: Dict mapping synthetic refs to cleaned text
    """
    all_data = {}
    for index, data in enumerate(codecs.open(filename, encoding='utf8')):
        ref = u"Random {}".format(index)
        data = remove_dicta_prefix(data, u"\|")  # the wiki file marks prefixes with '|'
data = remove_punctuation(data)
data = pull_out_suffix(data)
data = remove_stopwords(data)
data = u' '.join(data.split())
data = create_multiple_word_phrases(data)
if index % 100000 == 0:
print index
all_data[ref] = data
return all_data
if __name__ == "__main__":
segments = get_segments(DICTA_SEFARIA_FILENAME)
if HEBREW_WIKI:
wiki_segs = get_wiki_segs(DICTA_HEBREW_WIKI_FILENAME)
with codecs.open(ALL_CLEAN_DOCS_FILENAME, 'wb', encoding='utf8') as the_file:
for k, v in segments.items():
the_file.write(u""+k+u"||||"+v+u"\n")
if HEBREW_WIKI:
for k, v in wiki_segs.items():
the_file.write(u""+k+u"||||"+v+u"\n")