-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy path41.extract.py
executable file
·126 lines (109 loc) · 3.01 KB
/
41.extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
#
# (C) Copyright 2017 Hojin Choi <[email protected]>
#
from __future__ import absolute_import, print_function, unicode_literals
from io import open
from bs4 import BeautifulSoup
import sys
import os
import re
# Tab-stripped text of the most recent non-empty chunk processed by
# analyze_chunk(); extract() echoes it to stderr when stderr is not a terminal.
lastchunk = ''
# Tokens inside one analysis are joined by '+', optionally padded with a
# single space on either side.  Raw string on purpose: '\+' inside a plain
# literal is an invalid escape sequence.
_MORPHEME_SEPARATOR = re.compile(r' ?\+ ?')


def _iter_analyses(text):
    """Yield the part after the first tab of each line in *text*.

    Iteration ends for good at the first line that contains no tab: the
    remainder of that string and every later string are skipped.
    """
    for piece in text:
        for line in piece.replace("::", "").split('\n'):
            _, tab, analysis = line.partition('\t')
            if not tab:
                return
            yield analysis


def analyze_chunk(outf, text):
    """Write the '/'-bearing tokens of a chunk to *outf*, one per line.

    Args:
        outf: Writable text stream; receives one token per line.
        text: Iterable of strings.  "::" is removed from each string, which
            is then split into lines; for each line only the part after the
            first tab is analysed.

    Returns:
        The number of tokens written.

    Processing stops at the first line without a tab (this is also how the
    trailing empty line of a newline-terminated string ends the run).
    Errors raised by ``outf.write`` propagate to the caller instead of being
    silently discarded.

    Side effect: when at least one line was analysed, the module-level
    ``lastchunk`` is replaced by those analyses joined with newlines.
    """
    global lastchunk
    samples = []
    count = 0
    for analysis in _iter_analyses(text):
        samples.append(analysis)
        for token in _MORPHEME_SEPARATOR.split(analysis):
            token = token.strip()
            # Only tokens carrying a '/' are emitted.
            if '/' not in token:
                continue
            outf.write('%s\n' % token)
            count += 1

    if samples:
        lastchunk = u'\n'.join(samples)
        # Python 3: type(u'') is str, so the value is kept untouched.
        # Python 2: type(u'') is unicode, and print() to sys.stderr needs
        # bytes, so the value is encoded here.
        if not isinstance(lastchunk, str):
            lastchunk = lastchunk.encode('utf-8')
    return count
def analyze_type1(outf, text):
    """Analyse *text* as one newline-joined chunk and return the token count.

    Lines without a tab are dropped.  For every other line the part before
    the first tab is discarded and the remainder, terminated by a newline,
    is appended to a single string that is passed to analyze_chunk() as a
    one-element list.
    """
    kept = []
    for row in text.split('\n'):
        pieces = row.split('\t', 1)
        # Screening: only lines that actually contained a tab are kept.
        if len(pieces) == 2:
            kept.append(pieces[1] + '\n')
    return analyze_chunk(outf, [''.join(kept)])
def analyze_type2(outf, text):
    """Analyse *text* line by line and return the token count.

    Each line first has any ``<phon>...</phon>`` span removed.  Lines that
    then contain no tab are dropped; for the others the part before the
    first tab is discarded.  The surviving remainders are passed to
    analyze_chunk() as a list with one entry per line.
    """
    cleaned = (re.sub('<phon>.*</phon>', '', row) for row in text.split('\n'))
    # Screening: only lines that still contain a tab are kept.
    content = [row.split('\t', 1)[1] for row in cleaned if '\t' in row]
    return analyze_chunk(outf, content)
def extract(outfile, path, idx, total):
    """Append the tokens extracted from one corpus file to *outfile*.

    Args:
        outfile: Path of the output file; opened in append mode as UTF-8.
        path: Path of the corpus file; read as UTF-8 and parsed with
            BeautifulSoup's 'html.parser'.
        idx: Position of this file, used only in the progress message.
        total: Number of files, used only in the progress message.

    Elements matching 'text body p, text p' are handled by analyze_type1()
    and elements matching 'text s' by analyze_type2().  A one-line summary
    is printed to stdout; when stderr is not a terminal the summary and the
    module-level ``lastchunk`` are echoed there as well.
    """
    count = 0
    # Both handles are managed by ``with`` so they are flushed and closed
    # deterministically, even when parsing or analysis raises.
    with open(outfile, mode='at', encoding='utf-8') as outf:
        with open(path, mode='rt', encoding='utf-8') as inf:
            doc = BeautifulSoup(inf.read(), 'html.parser')

        for node in doc.select('text body p, text p'):
            count += analyze_type1(outf, node.get_text())
        for node in doc.select('text s'):
            count += analyze_type2(outf, node.get_text())

    summary = "(%d/%d) Extract %s >> %s %d morphemes" % (idx, total, path, outfile, count)
    print(summary)
    if not sys.stderr.isatty():
        print(summary, file=sys.stderr)
        print("Last chunk", file=sys.stderr)
        print(lastchunk, file=sys.stderr)
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.  Expected form:
            ``<prog> <corpus file> [<corpus file>...] <morpheme extract file>``.

    Returns:
        2 when too few arguments are given (usage is printed to stderr),
        otherwise 0, including after a keyboard interrupt.

    The output file is truncated first, then extract() is called once per
    corpus file.  When both TOTAL and COUNT are present in the environment
    they replace the computed progress numbers (presumably set by an outer
    driver that invokes the script per file -- confirm against the caller).
    """
    if argv is None:
        argv = sys.argv
    # At least one corpus file plus the output file is required; any number
    # of corpus files is accepted, as the usage text states.
    if len(argv) < 3:
        print("Usage: %s <corpus file> [<corpus file>...] <morpheme extract file>" % argv[0],
              file=sys.stderr)
        return 2

    files = argv[1:-1]
    outfile = argv[-1]
    try:
        # Truncate
        with open(outfile, mode='w'):
            pass

        override = 'TOTAL' in os.environ and 'COUNT' in os.environ
        total = len(files)
        for count, path in enumerate(files, 1):
            if override:
                count = int(os.environ['COUNT'] or 0)
                total = int(os.environ['TOTAL'] or 0)
            extract(outfile, path, count, total)
    except KeyboardInterrupt:
        print("\nOk, take a rest!")
    return 0


if __name__ == '__main__':
    sys.exit(main())
# vim: ts=4 noexpandtab sw=4 sts=4