-
Notifications
You must be signed in to change notification settings - Fork 6
/
volume.py
190 lines (166 loc) · 6.01 KB
/
volume.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
from xml.etree import ElementTree as xml
import re, helpers, copy
class Volume():
    ''' Each volume gets its own Akoma Ntoso file.

    Typical use: fill ``full_text`` with a list of ``{"text": ..., "pos": ...}``
    line dicts, then call remove_cover_pages(), get_speakers(),
    get_speeches(), fix_indented_qna_speeches(), remove_pos(),
    debateSection(heading) and build_speeches() to populate the XML tree
    rooted at ``akoma_ntoso``.
    '''

    # Every cover page ends with this phone number.
    COVER_PAGE_END_MARKER = "(314) 615-2600"
    # A paragraph whose pos (first three characters, as an int) exceeds this
    # is treated as a big indent, i.e. the start of a question.
    BIG_INDENT_POS = 300
    # Marker stripped from every line before looking for a speaker.
    INDENT_MARKER = "(at new indentation)"

    # "MR. SMITH: rest of line" / "MS JONES: rest of line"
    _NAME_RE = re.compile(r"(M[RS]\.* [A-Z]+):(.*)")
    # "Q rest of line" / "A rest of line" (only at the start of the line)
    _QNA_RE = re.compile(r"([AQ]) (.*)")

    def __init__(self):
        ''' build the empty Akoma Ntoso skeleton for one volume '''
        self.akoma_ntoso = xml.Element("akomaNtoso")
        self.debate = xml.SubElement(self.akoma_ntoso, "debate")
        self.meta = xml.SubElement(self.debate, "meta")
        self.references = xml.SubElement(self.meta, "references")
        self.debateBody = xml.SubElement(self.debate, "debateBody")
        # list of {"text": str, "pos": str} dicts, filled in by the caller
        self.full_text = []
        # list of {"speaker": str, "speech": [...]} dicts, see get_speeches()
        self.speeches = []

    def debateSection(self, heading):
        ''' create the debateSection element and label it with heading

        NOTE: this rebinds ``self.debateSection`` on the instance to the new
        element (build_speeches() reads it from there), so the method can
        only be called once per Volume.
        '''
        section = xml.SubElement(self.debateBody, "debateSection")
        self.heading = xml.SubElement(section, "heading")
        self.heading.text = heading
        self.debateSection = section

    def remove_cover_pages(self):
        ''' id and remove cover pages

        Drops everything up to and including the first line that contains
        the cover page phone number, plus the newline line after it. When no
        line contains the number there is no cover page and the text is left
        untouched.
        '''
        marker_index = next(
            (i for i, line in enumerate(self.full_text)
             if self.COVER_PAGE_END_MARKER in line["text"]),
            None)
        if marker_index is None:
            return
        # Include the line with the number and the newline after it
        self.full_text = self.full_text[marker_index + 2:]

    @classmethod
    def _clean_text(cls, text):
        ''' strip the indentation marker and surrounding whitespace '''
        return text.replace(cls.INDENT_MARKER, "").strip()

    @classmethod
    def _match_speaker(cls, text):
        ''' return (speaker, rest_of_line) if the cleaned line starts a
        speech, else None. A named speaker wins over a Q/A prefix. '''
        name = cls._NAME_RE.search(text)
        if name:
            return name.group(1), name.group(2)
        qna = cls._QNA_RE.match(text)
        if qna:
            return qna.group(1), qna.group(2)
        return None

    def get_speakers(self):
        ''' Find all the speakers

        Adds one TLCPerson element to ``references`` for every distinct
        speaker, using the same detection as get_speeches() so that both
        agree on who is speaking. ``full_text`` is not modified.
        '''
        print("Getting speakers")
        # Speakers that are already referenced must not be added twice.
        known = set(child.attrib["showAs"] for child in self.references)
        for line in self.full_text:
            found = self._match_speaker(self._clean_text(line["text"]))
            if found is None:
                continue
            showAs = found[0]
            if showAs in known:
                continue
            known.add(showAs)
            person_id = showAs.lower().replace(".", "").replace(" ", "-")
            attr = {
                "href": "/ontology/person/ferguson.sayit.mysociety.org/" + person_id,
                "id": person_id,
                "showAs": showAs
            }
            self.TLCPerson = xml.SubElement(self.references, "TLCPerson", attr)

    def get_speeches(self):
        ''' Find all the sections and speeches

        A line that names a speaker (or starts with Q / A) opens a new
        speech holding the rest of that line; every following line belongs
        to that speech until the next speaker is found. The result is stored
        in ``self.speeches``. Each line's "text" in ``full_text`` is cleaned
        in place.
        '''
        print("Getting speeches")
        speeches = []
        # Lines before the first speaker are collected here and never stored.
        current = {"speech": []}
        for line in self.full_text:
            line["text"] = self._clean_text(line["text"])
            found = self._match_speaker(line["text"])
            if found is None:
                # Add line to current speech
                current["speech"].append(line)
                continue
            speaker, remainder = found
            current = {
                "speaker": speaker,
                "speech": [{
                    "text": remainder,
                    "pos": line["pos"]
                }]
            }
            speeches.append(current)
        self.speeches = speeches

    def _is_big_indent(self, paragraph):
        ''' True when the paragraph's pos marks a big indent '''
        pos = paragraph["pos"]
        return bool(pos) and int(pos[0:3]) > self.BIG_INDENT_POS

    def fix_indented_qna_speeches(self):
        ''' some q speeches are found by a large indent
        use the pos to find these

        Every "A" speech is split at its first big-indent paragraph: the
        paragraphs before it stay in the answer, that paragraph and all
        following ones become a new "Q" speech placed right after the
        answer. ``self.speeches`` is updated in place.
        '''
        fixed = []
        for speech in self.speeches:
            fixed.append(speech)
            if speech["speaker"] != "A":
                continue
            paragraphs = speech["speech"]
            split_at = next(
                (k for k, paragraph in enumerate(paragraphs)
                 if self._is_big_indent(paragraph)),
                None)
            if split_at is None:
                continue
            # remove the question lines from the answer ...
            speech["speech"] = paragraphs[:split_at]
            # ... and give them their own speech right after it
            fixed.append({
                "speaker": "Q",
                "speech": paragraphs[split_at:]
            })
        self.speeches[:] = fixed

    def remove_pos(self):
        ''' only needed pos for the qna formatting

        Replaces every speech's list of {"text", "pos"} dicts with the plain
        list of their text strings.
        '''
        for speech in self.speeches:
            speech["speech"] = [paragraph["text"] for paragraph in speech["speech"]]

    def build_speeches(self):
        ''' build speech xml elements

        debateSection() must have been called first, since the speech
        elements are attached to the element it stores, and remove_pos()
        too, since add_paragraphs() joins plain strings.
        '''
        for speech in self.speeches:
            attr = {
                "by": "#" + speech["speaker"].lower().replace(" ", "-").replace(".", "")
            }
            self.speech = xml.SubElement(self.debateSection, "speech", attr)
            self.add_paragraphs(self.speech, speech)

    def add_paragraphs(self, elem, speech):
        ''' build paragraphs

        Joins the speech's text lines with spaces, splits the result on the
        paragraph marker and adds one <p> child to elem per non-empty
        paragraph. ``speech["speech"]`` is replaced by the list of raw
        paragraph strings.
        '''
        joined = " ".join(speech["speech"])
        speech["speech"] = joined.split("--++ new paragraph!")
        for paragraph in speech["speech"]:
            # drop every newline, then surrounding whitespace
            paragraph = paragraph.replace("\n", "").strip()
            if paragraph:
                p = xml.SubElement(elem, "p")
                p.text = paragraph

    def indent(self, elem, level=0):
        ''' pretty printing for xml

        Recursively fills in whitespace-only text/tail values so that the
        serialized tree puts every element on its own indented line.
        '''
        def blank(value):
            return not value or not value.strip()

        i = "\n" + level*" "
        if len(elem):
            if blank(elem.text):
                elem.text = i + " "
            if blank(elem.tail):
                elem.tail = i
            child = None
            for child in elem:
                self.indent(child, level+1)
            # the last child closes the parent, so it gets the parent's indent
            if blank(child.tail):
                child.tail = i
        else:
            if level and blank(elem.tail):
                elem.tail = i