-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhilda_wrapper.py
executable file
·53 lines (37 loc) · 1.41 KB
/
hilda_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python2
"""
Simple wrapper around the original hilda.py (Hernault et al. 2010) that
outputs parse tree in a string format that can easily be parsed into
nltk.tree.ParentedTree objects.
Original output:
(Contrast[S][N]
Although they did n't like it ,
they accepted the offer .)
What we actually want:
ParseTree('Contrast[S][N]', ["Although they did n't like it ,", 'they accepted the offer .'])
"""
import sys
import tempfile
from hilda import main
import nltk
SENT_DETECTOR = nltk.data.load('tokenizers/punkt/english.pickle')
def parse_file(input_filepath):
"""Takes plain text input, splits it into sentences, marks them in
HILDA's input format, calls HILDA on the preprocessed input and returns
a string representation of the parse tree.
"""
with open(input_filepath) as input_file:
input_text = input_file.read()
with tempfile.NamedTemporaryFile() as tokenized_input_file:
for sent in SENT_DETECTOR.tokenize(input_text):
tokenized_input_file.write(u"{0}<s>\n".format(sent.rstrip()))
tokenized_input_file.flush()
# sys.argv[1] is where the original hilda.py expects the input file
sys.argv = ['hilda.py', tokenized_input_file.name]
parse_tree = main()
return parse_tree.__repr__()
if __name__ == "__main__":
INPUT = sys.argv[-1]
OUTPUT = parse_file(INPUT)
sys.stdout.write(OUTPUT)
sys.stdout.write("\n")