-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy path51.sent_extract.py
executable file
·140 lines (112 loc) · 3.76 KB
/
51.sent_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
#
# (C) Copyright 2017 Hojin Choi <[email protected]>
#
from __future__ import absolute_import, print_function, unicode_literals
from io import open
from bs4 import BeautifulSoup
import sys
import os
import re
# Most recently written chunk; extract() echoes it to stderr as a diagnostic.
lastchunk = ''


def analyze_chunk(outf, text):
    """Write one cleaned-up chunk of text to ``outf`` as a single line.

    Every ``::`` marker is removed from ``text`` and the result is written
    followed by a newline. On success the module-level ``lastchunk`` is set
    to the text that was written, so the caller can report the last chunk
    that was processed.

    Args:
        outf: Writable text stream (any object with a ``write`` method).
        text: The chunk to clean and write.

    Returns:
        1 if the chunk was written, 0 if cleaning or writing failed.
    """
    global lastchunk
    try:
        cleaned = text.replace("::", "")
        outf.write('%s\n' % cleaned)
    except Exception as err:
        # Best effort: a single bad chunk must not abort the whole run, but
        # the failure is reported instead of being silently discarded.
        print("Skipping chunk: %r" % (err,), file=sys.stderr)
        return 0

    lastchunk = cleaned
    # In python3, type(u'') is str
    # In python2, type(u'') is unicode
    # print() with sys.stderr function requires 'bytes' in python2, so encode() needed.
    # print() in python3, keep it not touched
    if not isinstance(lastchunk, str):
        lastchunk = lastchunk.encode('utf-8')
    return 1
def analyze_type1(outf, text):
    """Collect the second tab-separated field of each line and write it out.

    ``text`` is split on newlines. Only lines that contain at least two tabs
    (i.e. three fields when split at most twice) are used; all other lines
    are ignored. The second field of every accepted line, each followed by a
    single space, is concatenated into one chunk and passed to
    ``analyze_chunk``.

    Args:
        outf: Writable text stream forwarded to ``analyze_chunk``.
        text: Multi-line string to scan.

    Returns:
        The count returned by ``analyze_chunk`` for the assembled chunk.
    """
    picked = []
    for row in text.split('\n'):
        fields = row.split('\t', 2)
        # Screening only tab contained lines
        if len(fields) != 3:
            continue
        picked.append(fields[1])
    # Each field keeps its trailing space (including the last one) so the
    # written line is byte-for-byte what incremental concatenation produced.
    content = ''.join(field + ' ' for field in picked)
    return analyze_chunk(outf, content)
def analyze_type2(outf, text):
    """Collect the second tab-separated field of each line, minus ``<phon>`` spans.

    ``text`` is split on newlines. From every line, the span matching
    ``<phon>.*</phon>`` is removed first; the remainder is then split on tabs.
    Only lines that still yield three fields are used. The second fields are
    joined with single spaces and passed to ``analyze_chunk`` as one chunk.

    Args:
        outf: Writable text stream forwarded to ``analyze_chunk``.
        text: Multi-line string to scan.

    Returns:
        The count returned by ``analyze_chunk`` for the assembled chunk.
    """
    # The match is greedy: everything from the first "<phon>" to the last
    # "</phon>" on a line is dropped, exactly as before.
    phon_span = re.compile('<phon>.*</phon>')
    picked = []
    for row in text.split('\n'):
        fields = phon_span.sub('', row).split('\t', 2)
        # Screening only tab contained lines
        if len(fields) == 3:
            picked.append(fields[1])
    return analyze_chunk(outf, ' '.join(picked))
def extract(outfile, path, idx, total):
    """Append the sentences found in one corpus file to ``outfile``.

    The file at ``path`` is parsed with BeautifulSoup's ``html.parser``.
    Elements matching ``text body p, text p`` are handled by
    ``analyze_type1`` and elements matching ``text s`` by ``analyze_type2``;
    both receive the element's plain text. A progress line is printed to
    stdout, and when stderr is not a terminal the same line plus the last
    processed chunk is printed to stderr as well.

    Args:
        outfile: Path of the output file; opened in append mode (UTF-8).
        path: Path of the corpus file to read (UTF-8).
        idx: Position of this file in the run, used only in the progress line.
        total: Number of files in the run, used only in the progress line.
    """
    count = 0
    # Context managers guarantee both handles are closed, and the output
    # flushed, before the progress line is reported.
    with open(outfile, mode='at', encoding='utf-8') as outf:
        with open(path, mode='rt', encoding='utf-8') as inf:
            content = inf.read()
        doc = BeautifulSoup(content, 'html.parser')

        for node in doc.select('text body p, text p'):
            count += analyze_type1(outf, node.get_text())

        for node in doc.select('text s'):
            count += analyze_type2(outf, node.get_text())

    message = "(%d/%d) Extract %s >> %s %d sentences" % (idx, total, path, outfile, count)
    print(message)
    if not sys.stderr.isatty():
        # stderr is redirected (e.g. to a log): mirror the progress line and
        # include the last chunk to help locate problems.
        print(message, file=sys.stderr)
        print("Last chunk", file=sys.stderr)
        print(lastchunk, file=sys.stderr)
if __name__ == '__main__':
    # Command line: one or more corpus files followed by the output file,
    # so at least three argv entries (script, corpus, output) are required.
    if len(sys.argv) < 3:
        print("Usage: %s <corpus file> [<corpus file>...] <sentence extract file>" % sys.argv[0])
        sys.exit(0)

    files = sys.argv[1:-1]
    outfile = sys.argv[-1]

    try:
        # Truncate
        with open(outfile, mode='w'):
            pass

        total = len(files)
        count = 0
        for p in files:
            count += 1
            # When both TOTAL and COUNT are set in the environment they
            # override the locally computed progress numbers.
            if 'TOTAL' in os.environ and 'COUNT' in os.environ:
                count = int(os.environ['COUNT'] or 0)
                total = int(os.environ['TOTAL'] or 0)
            extract(outfile, p, count, total)
    except KeyboardInterrupt:
        print("\nOk, take a rest!")
# vim: ts=4 noexpandtab sw=4 sts=4