-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathit.py
57 lines (46 loc) · 1.92 KB
/
it.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import re
import unicodedata
from languages.lang import Lang
class It(Lang):
    """Italian-specific dialog extraction and line cleaning rules.

    NOTE(review): the methods read ``self.cfg.dialog_gap``,
    ``self.cfg.min_double_delim`` and ``self.dialogs``; these are presumably
    set up by the ``Lang`` base class -- confirm against its definition.
    """

    def delimiters(self):
        """Return a mapping from dialog delimiter to its counting function.

        Each function takes ``(delimiter, line)`` and returns an integer.
        For ``'--'`` that integer is twice the number of occurrences of the
        delimiter in ``line``.
        """

        def d1(delimiter, line):
            # NOTE(review): the factor of 2 looks like a weighting applied
            # by the caller when comparing delimiters -- confirm in Lang.
            return line.count(delimiter) * 2

        return {'--': d1}

    def process_file(self, paragraph_list, delimiter):
        """Collect dialogs from one file and add them to ``self.dialogs``.

        A paragraph longer than one character is treated as an utterance when
        ``delimiter`` occurs within its first two characters. Consecutive
        utterances are grouped into one dialog until more than
        ``self.cfg.dialog_gap`` characters of non-dialog text separate them.

        The collected dialogs are only kept when the number of utterance
        paragraphs containing the delimiter more than once, per 10000 words
        of the file, exceeds ``self.cfg.min_double_delim``.

        Args:
            paragraph_list: Sequence of paragraph strings. It is iterated
                twice, so it must not be a one-shot iterator.
            delimiter: The dialog delimiter string (e.g. ``'--'``).
        """
        dialogs = []
        count_doubles = 0

        # Start above the gap so the first utterance always opens a dialog.
        chars_since_dialog = self.cfg.dialog_gap + 1
        for p in paragraph_list:
            # Paragraphs of at most one character cannot contain dialog.
            if len(p) <= 1:
                continue

            if delimiter not in p[:2]:
                # No dialog here: the whole paragraph counts towards the gap.
                chars_since_dialog += len(p)
                continue

            # After the opening delimiter, keep every other segment; the
            # segments in between are skipped.
            kept_segments = p.split(delimiter)[1::2]
            # Joining and re-splitting collapses all runs of whitespace.
            text = ' '.join(' '.join(kept_segments).split())

            # Extra filtering.
            if p.count(delimiter) > 1:
                count_doubles += 1

            # If max chars exceeded start new dialog.
            if chars_since_dialog > self.cfg.dialog_gap:
                dialogs.append([])

            dialogs[-1].append(text)
            chars_since_dialog = 0

        num_words = sum(len(p.split()) for p in paragraph_list)
        if not num_words:
            # Nothing to rate (empty file); avoids dividing by zero below.
            return

        if count_doubles / num_words * 10000 > self.cfg.min_double_delim:
            self.dialogs.extend(dialogs)

    def clean_line(self, line):
        """Return ``line`` normalized, with punctuation padded by spaces.

        Steps, in order: a space-delimited apostrophe (``" ' "``) is
        collapsed to a bare apostrophe, the text is NFKD-normalized, and each
        of ``. ? ! - " : ,`` is surrounded by single spaces.
        """
        line = re.sub(r" ' ", "'", line)
        line = unicodedata.normalize('NFKD', line)
        # One pass over all padded characters; the replacement only adds
        # spaces, so this matches substituting each character in turn.
        return re.sub(r'([.?!\-":,])', r' \1 ', line)