-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathsspostproc.py
133 lines (100 loc) · 4.88 KB
/
sspostproc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
# from brat (http://brat.nlplab.org), MIT licensed.
# Python version of geniass-postproc.pl. Originally developed as a
# heuristic postprocessor for the geniass sentence splitter, drawing
# in part on Yoshimasa Tsuruoka's medss.pl.
import re
# Text encodings used by the command-line entry point when reading input
# files and when encoding the output.
INPUT_ENCODING = "UTF-8"
OUTPUT_ENCODING = "UTF-8"

# When true, refine_split() checks that its result differs from its input
# only by space<->newline substitutions and otherwise returns the input.
DEBUG_SS_POSTPROCESSING = False

# TODO: some cases that heuristics could be improved on
# - no split inside matched quotes
# - "quoted." New sentence
# - 1 mg .\nkg(-1) .

# (compiled pattern, replacement) pairs that refine_split() applies once
# each, in this order, before any of the other rule lists.
__initial = [
    # breaks sometimes missing after "?", "safe" cases
    (re.compile(r'\b([a-z]+\?) ([A-Z][a-z]+)\b'), r'\1\n\2'),
    # breaks sometimes missing after "." separated with extra space,
    # "safe" cases
    (re.compile(r'\b([a-z]+ \.) ([A-Z][a-z]+)\b'), r'\1\n\2'),
    # join breaks creating lines that only contain sentence-ending
    # punctuation
    (re.compile(r'\n([.!?]+)\n'), r' \1\n'),
]
# No breaks inside parens/brackets. (To protect against cases where a
# pair of locally mismatched parentheses in different parts of a large
# document happen to match, the size of the intervening context is
# limited. As this is not an issue when there are no intervening
# brackets, an unlimited-length match is allowed in those cases.)
#
# (compiled pattern, replacement) pairs; refine_split() re-applies each
# rule until it no longer changes the text.
__repeated = [
    # unlimited length for no intervening parens/brackets
    (re.compile(r'(\([^\[\]\(\)]*)\n([^\[\]\(\)]*\))'), r'\1 \2'),
    (re.compile(r'(\[[^\[\]\(\)]*)\n([^\[\]\(\)]*\])'), r'\1 \2'),
    # standard mismatched with possible intervening
    (re.compile(r'(\([^\(\)]{0,250})\n([^\(\)]{0,250}\))'), r'\1 \2'),
    (re.compile(r'(\[[^\[\]]{0,250})\n([^\[\]]{0,250}\])'), r'\1 \2'),
    # nesting to depth one
    (
        re.compile(r'(\((?:[^\(\)]|\([^\(\)]*\)){0,250})\n((?:[^\(\)]|\([^\(\)]*\)){0,250}\))'),
        r'\1 \2',
    ),
    (
        re.compile(r'(\[(?:[^\[\]]|\[[^\[\]]*\]){0,250})\n((?:[^\[\]]|\[[^\[\]]*\]){0,250}\])'),
        r'\1 \2',
    ),
]
# (compiled pattern, replacement) pairs that refine_split() applies once
# each, in this order, after the other rule lists. All of them remove a
# break by turning a newline into a space.
__final = [
    # no break after periods followed by a non-uppercase "normal word"
    # (i.e. token with only lowercase alpha and dashes, with a minimum
    # length of initial lowercase alpha).
    (re.compile(r'\.\n([a-z]{3}[a-z-]{0,}[ \.\:\,\;])'), r'. \1'),
    # no break in likely species names with abbreviated genus (e.g.
    # "S. cerevisiae"). Differs from above in being more liberal about
    # separation from following text.
    (re.compile(r'\b([A-Z]\.)\n([a-z]{3,})\b'), r'\1 \2'),
    # no break in likely person names with abbreviated middle name
    # (e.g. "Anton P. Chekhov", "A. P. Chekhov"). Note: Won't do
    # "A. Chekhov" as it yields too many false positives.
    (
        re.compile(r'\b((?:[A-Z]\.|[A-Z][a-z]{3,}) [A-Z]\.)\n([A-Z][a-z]{3,})\b'),
        r'\1 \2',
    ),
    # no break before CC ..
    (re.compile(r'\n((?:and|or|but|nor|yet) )'), r' \1'),
    # or IN. (this is nothing like a "complete" list...)
    (
        re.compile(r'\n((?:of|in|by|as|on|at|to|via|for|with|that|than|from|into|upon|after|while|during|within|through|between|whereas|whether) )'),
        r' \1',
    ),
    # no sentence breaks in the middle of specific abbreviations
    (re.compile(r'\b(e\.)\n(g\.)'), r'\1 \2'),
    (re.compile(r'\b(i\.)\n(e\.)'), r'\1 \2'),
    (re.compile(r'\b(i\.)\n(v\.)'), r'\1 \2'),
    # no sentence break after specific abbreviations
    (
        re.compile(r'\b(e\. ?g\.|i\. ?e\.|i\. ?v\.|vs\.|cf\.|Dr\.|Mr\.|Ms\.|Mrs\.)\n'),
        r'\1 ',
    ),
    # or others taking a number after the abbrev
    (re.compile(r'\b([Aa]pprox\.|[Nn]o\.)\n(\d+)'), r'\1 \2'),
]
def refine_split(s):
    """
    Given a string with sentence splits as newlines, attempts to
    heuristically improve the splitting. Heuristics tuned for geniass
    sentence splitting errors.

    The rules in ``__initial`` are applied once each, the rules in
    ``__repeated`` are each re-applied until the text stops changing,
    and the rules in ``__final`` are then applied once each.

    Args:
        s: text in which each newline marks a sentence boundary.

    Returns:
        The text with refined sentence boundaries. When
        ``DEBUG_SS_POSTPROCESSING`` is true and the result differs from
        the input by more than space<->newline substitutions, an error
        is printed to stderr and the input is returned unchanged.
    """
    # Read the flag once so that `orig` and the final check always agree,
    # even if the module-level flag is changed while this call is running.
    debug = DEBUG_SS_POSTPROCESSING
    orig = s

    for r, t in __initial:
        s = r.sub(t, s)

    for r, t in __repeated:
        # Joining one broken line can expose another match for the same
        # rule, so iterate to a fixed point.
        while True:
            n = r.sub(t, s)
            if n == s:
                break
            s = n

    for r, t in __final:
        s = r.sub(t, s)

    # Only do final comparison in debug mode.
    if debug:
        # revised must match original when differences in space<->newline
        # substitutions are ignored
        r1 = orig.replace('\n', ' ')
        r2 = s.replace('\n', ' ')
        if r1 != r2:
            # `sys` is otherwise only imported under the __main__ guard, so
            # import it here to keep this branch usable from importers.
            import sys
            print("refine_split(): error: text mismatch (returning original):\nORIG: '%s'\nNEW: '%s'" % (orig, s), file=sys.stderr)
            s = orig

    return s
if __name__ == "__main__":
    # Command-line driver: refine the sentence splitting of each file named
    # on the command line and write the result to standard output.
    import sys
    import codecs

    # for testing, read stdin if no args
    paths = sys.argv[1:] or ['/dev/stdin']

    for fn in paths:
        # Keep the try body limited to reading so that only genuine read
        # or decode failures are reported as "Failed to read".
        try:
            with codecs.open(fn, encoding=INPUT_ENCODING) as f:
                s = f.read()
        except (OSError, UnicodeError) as e:
            print("Failed to read", fn, ":", e, file=sys.stderr)
            continue

        # sys.stdout is a text stream and rejects bytes; the explicitly
        # encoded output has to go to its underlying binary buffer.
        sys.stdout.buffer.write(refine_split(s).encode(OUTPUT_ENCODING))