#!/usr/bin/env python
# coding: utf-8
# Source: extractTextFromHTML.py, forked from ZhangChengX/PDFBoT.
from bs4 import BeautifulSoup
import cssutils
from copy import deepcopy
from tool.tool import isValidParagraph, isValidtext
# check if text is a valid css text
def isCSStext(text):
    """Return True when at least one line of ``text`` starts with ``.``.

    Lines come from ``str.splitlines``; leading whitespace is not stripped,
    so an indented ``.rule`` line does not count as a match.
    """
    return any(line.startswith('.') for line in text.splitlines())
# return a list of css
def getCSStext(path):
    """Return the ``<style>`` elements of an HTML file that contain class rules.

    The file at ``path`` is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``. Every ``<style>`` element is rendered back to a string
    (tags included) and kept only when ``isCSStext`` accepts it, i.e. when
    one of its lines starts with ``.``.

    :param path: path of the HTML file to read.
    :return: list of ``<style>...</style>`` strings, in document order.
    :raises OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as htmlfile:
        markup = htmlfile.read()
    styleTags = BeautifulSoup(markup, 'html.parser').find_all(name='style')
    renderedStyles = (str(tag) for tag in styleTags)
    return [css for css in renderedStyles if isCSStext(css)]
def isCSStext(text):
    """Tell whether ``text`` has a line whose first character is ``.``.

    NOTE: this re-declares ``isCSStext``. The file already defines a function
    with the same name and behaviour just above ``getCSStext``; this later
    definition is the one bound once the module has finished loading.
    """
    for line in text.splitlines():
        if line[:1] == '.':
            return True
    return False
def getValueOfClass(cssTexts):
    """Map positional/size CSS class names to their numeric values.

    Each entry of ``cssTexts`` is stripped of the literal
    ``<style type="text/css">`` / ``</style>`` wrappers and parsed with
    ``cssutils``. For every rule whose selector starts with one of the
    prefixes below, the matching property is read, ``'px'`` is removed from
    its value and the result is stored as a float under the selector text
    minus its first character (the leading dot):

    * ``.x``  -> ``left``
    * ``.h``  -> ``height``
    * ``.y``  -> ``bottom``
    * ``.fs`` -> ``font-size``

    Rules that expose no ``selectorText``/``style`` attribute, or whose value
    is not a plain number once ``'px'`` is removed, are skipped.

    :param cssTexts: iterable of CSS strings (optionally wrapped in a
        ``<style>`` element).
    :return: dict of class name -> float value.
    """
    # (selector prefix, CSS property recorded for selectors with that prefix)
    wantedProperties = (
        ('.x', 'left'),
        ('.h', 'height'),
        ('.y', 'bottom'),
        ('.fs', 'font-size'),
    )
    classDict = {}
    for cssText in cssTexts:
        rawCss = cssText.replace('<style type="text/css">', '').replace('</style>', '')
        for rule in cssutils.parseString(rawCss):
            try:
                selector = rule.selectorText
                for prefix, propertyName in wantedProperties:
                    if not selector.startswith(prefix):
                        continue
                    for cssProperty in rule.style:
                        if cssProperty.name == propertyName:
                            classDict[selector[1:]] = float(cssProperty.value.replace('px', ''))
            except (AttributeError, ValueError):
                # Best effort: ignore rules that cannot be interpreted, but no
                # longer swallow unrelated errors such as KeyboardInterrupt.
                continue
    return classDict
# return a list of page div
def getSoup(path):
    """Parse an HTML file and return its ``<div id="page-container">`` element.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``.

    :param path: path of the HTML file to read.
    :return: the result of ``find(name='div', id='page-container')``.
        NOTE(review): BeautifulSoup's ``find`` yields ``None`` when nothing
        matches; callers in this file do not check for that.
    :raises OSError: if the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding='utf-8') as htmlfile:
        markup = htmlfile.read()
    return BeautifulSoup(markup, 'html.parser').find(name='div', id='page-container')
#calculate how many children tag have
def numOfTextTag(soup):
    """Count the children of ``soup`` that carry exactly 11 class names.

    Every item yielded by iterating ``soup`` must support ``item['class']``.
    NOTE(review): the 11-class test is how this file recognises a text div;
    presumably it matches the converter's text-line markup - confirm.
    """
    return sum(1 for child in soup if len(child['class']) == 11)
# Store each page as a collection of per-div records.
# input: soup (the page-container element)
# return: a list with one entry per page that contains text divs; each entry is a dictionary of that page's div records
def _isTextDiv(tag):
    """Return True for a div with exactly 11 classes and more than one visible character."""
    return len(tag['class']) == 11 and len(tag.getText().strip()) > 1


def _buildDivRecord(tag, pageID, totalPages, classDict):
    """Build the record (text, page info, font size, x, y) for one text div."""
    classes = tag['class']
    return {
        'text': tag.getText(),
        'pageID': pageID,
        'totalPages': totalPages,
        'fs': classDict[getClassName(classes, 'fs')],
        'x': classDict[getClassName(classes, 'x')],
        'y': classDict[getClassName(classes, 'y')],
    }


def getPagesDivClass(inputSoup, classDict):
    """Turn the page container into one dictionary of div records per page.

    Each direct child of ``inputSoup`` is treated as a page (its
    ``data-page-no`` attribute is the page id). Inside a page, children with
    at least 4 classes are inspected: if such a child directly holds text divs
    (see ``numOfTextTag``) those are collected, otherwise its own children are
    searched one level deeper. Each collected div becomes a record with the
    keys ``text``, ``pageID``, ``totalPages``, ``fs``, ``x`` and ``y``.

    The records of a page are then passed through ``sortDivInPage``,
    ``calculatePreSpace`` and ``calFreFSandLineSpace``. Pages that end up with
    no records are dropped.

    :param inputSoup: element whose direct children are the page elements.
    :param classDict: class name -> numeric value, as built by
        ``getValueOfClass``.
    :return: list of page dictionaries (position -> div record).
    :raises KeyError: if a text div has no ``fs``/``x``/``y`` class present in
        ``classDict``.
    """
    pageTags = inputSoup.find_all(recursive=False)
    totalPages = len(pageTags)
    pagesDivClasses = []
    for pageTag in pageTags:
        pageID = pageTag['data-page-no']
        print('page id = '+str(pageID))
        pageDivClasses = []  # records of the current page only
        for subItem in pageTag:
            if len(subItem['class']) < 4:
                continue
            directTextDivs = numOfTextTag(subItem)
            print('num of text div = '+str(directTextDivs))
            if directTextDivs > 0:
                # Text divs are direct children of this element.
                for candidate in subItem:
                    if _isTextDiv(candidate):
                        pageDivClasses.append(
                            _buildDivRecord(candidate, pageID, totalPages, classDict))
            else:
                # No direct text divs: look inside each child that has some.
                for container in subItem:
                    if numOfTextTag(container) == 0:
                        continue
                    for candidate in container:
                        if _isTextDiv(candidate):
                            record = _buildDivRecord(candidate, pageID, totalPages, classDict)
                            print('div dict test = '+str(record))
                            pageDivClasses.append(record)
        for record in pageDivClasses:
            print('divDictitem = '+str(record))
        pagesDivClasses.append(
            calFreFSandLineSpace(calculatePreSpace(sortDivInPage(pageDivClasses, classDict))))
    # Remove empty pages, i.e. pages without any text div.
    return [page for page in pagesDivClasses if len(page) > 0]
# calculate the space between the div and this div's previous div.
def calculatePreSpace(page):
    """Annotate every div record with the vertical gap to the previous one.

    ``page`` maps a position to a div record holding a ``'y'`` value. Records
    are visited in the dictionary's iteration order. The record stored under
    key ``0`` receives the previously seen ``y`` unchanged (``0`` when it is
    visited first); every other record receives
    ``abs(previous_y - own_y)``.

    The ``'preSpace'`` value is written into the records of ``page`` itself,
    and the returned dictionary holds deep copies of those records.

    :param page: dict of position -> div record.
    :return: new dict of position -> copied record including ``'preSpace'``.
    """
    spacedPage = {}
    previousY = 0
    for position, record in page.items():
        gap = previousY if position == 0 else abs(previousY - record['y'])
        previousY = record['y']
        record['preSpace'] = gap
        spacedPage[position] = deepcopy(record)
    return spacedPage
# calculate most frequent font-size and lineSpace
def calFreFSandLineSpace(page):
    """Tag every div record with the page's most frequent line gap and font size.

    ``getMostFreValueOfKey`` is evaluated once for ``'preSpace'`` and once for
    ``'fs'``; the results are stored on each record as ``'lineSpace'`` and
    ``'freqFS'``. The records of ``page`` are modified in place and the
    returned dictionary holds deep copies of them.

    :param page: dict of position -> div record with ``'preSpace'`` and
        ``'fs'`` values.
    :return: new dict of position -> copied, annotated record.
    """
    if not page:
        return {}
    # Neither statistic depends on the keys written below, so compute them
    # once instead of once per record.
    lineSpace = getMostFreValueOfKey(page, 'preSpace')
    freqFS = getMostFreValueOfKey(page, 'fs')
    newPage = {}
    for key, divDict in page.items():
        divDict['lineSpace'] = lineSpace
        divDict['freqFS'] = freqFS
        newPage[key] = deepcopy(divDict)
    return newPage
# calculate base/macro font-size and base line space
def calBaseFSandLineSpace(pages):
    """Derive document-wide font size and line gap and store them on every div.

    Each page contributes the ``'freqFS'`` and ``'lineSpace'`` of its record
    under key ``0``. The value contributed by the most pages becomes
    ``'macroFS'`` / ``'macroLineSpace'`` on every record of every page (on a
    tie, the value seen first wins). The records in ``pages`` are modified in
    place and deep copies of the pages are returned.

    :param pages: list of page dictionaries (position -> div record).
    :return: list of copied pages; an empty list when ``pages`` is empty.
    :raises KeyError: if a page has no record under key ``0``.
    """
    if not pages:
        # Nothing to vote on; max() over an empty tally would raise.
        return []
    countFS = {}
    countLineSpace = {}
    for page in pages:
        firstDiv = page[0]
        fs = firstDiv['freqFS']
        ls = firstDiv['lineSpace']
        countFS[fs] = countFS.get(fs, 0) + 1
        countLineSpace[ls] = countLineSpace.get(ls, 0) + 1
    baseLineSpace = max(countLineSpace, key=countLineSpace.get)
    baseFS = max(countFS, key=countFS.get)
    newPages = []
    for page in pages:
        for divDict in page.values():
            divDict['macroFS'] = baseFS
            divDict['macroLineSpace'] = baseLineSpace
        newPages.append(deepcopy(page))
    return newPages
# input page is a list of divDict
# return a newPage, a dictionary keyed by the 0-based position after sorting:
# newPage{ 0: divDict01,
#          1: divDict02,
#          2: divDict03,
#          ...........
# }
def sortDivInPage(page, classDict):
    """Order the div records of a page by descending ``'y'``.

    Records with equal ``'y'`` keep their original relative order. As a side
    effect the input list is emptied, so callers must not reuse its contents
    afterwards.

    :param page: list of div records, each with a ``'y'`` value; emptied in
        place.
    :param classDict: unused; kept so existing callers keep working.
    :return: dict mapping 0-based position -> deep copy of the record.
    """
    # sorted() is stable even with reverse=True, so ties stay in input order.
    ordered = sorted(page, key=lambda divDict: divDict['y'], reverse=True)
    del page[:]
    return {position: deepcopy(divDict) for position, divDict in enumerate(ordered)}
# page is a dictionary of value is divdict
def getMostFreValueOfKey(page, key):
    """Return the most frequent value stored under ``key`` on a page.

    * An empty page yields ``0``.
    * A page with a single record yields that record's value (looked up under
      position ``0``), without any filtering.
    * Otherwise values are tallied and the most frequent one is returned; on a
      tie the value seen first wins. When ``key`` is ``'preSpace'``, values
      below 1 are left out of the tally.
    * If the tally ends up empty (every ``'preSpace'`` is below 1), ``0`` is
      returned, matching the empty-page result, instead of raising.

    :param page: dict of position -> div record.
    :param key: name of the record field to examine.
    :return: the most frequent value, or ``0`` when there is nothing to count.
    """
    if len(page) == 0:
        return 0
    if len(page) == 1:
        return page[0][key]
    countKey = {}
    for divDict in page.values():
        value = divDict[key]
        if key == 'preSpace' and value < 1:
            continue
        countKey[value] = countKey.get(value, 0) + 1
    if not countKey:
        # e.g. all divs share one baseline, so every gap was filtered out.
        return 0
    return max(countKey, key=countKey.get)
# get text x(left of new paragraph) of page
# page is a list of class of the div in that page
def getSecondMostX(page):
    """Return the second most frequent ``x`` class name among the page's divs.

    For each record the first class starting with ``'x'`` (as found by
    ``getClassName`` in ``record['class']``) is tallied. The most frequent
    name is discarded and the most frequent of the remainder is returned; on
    ties the name seen first wins.

    :param page: iterable of records that each expose a ``'class'`` list.
    :return: the runner-up ``x`` class name.
    :raises ValueError: if fewer than two distinct names were tallied
        (``max`` of an empty tally).
    """
    tally = {}
    for divDict in page:
        xClass = getClassName(divDict['class'], 'x')
        tally[xClass] = tally.get(xClass, 0) + 1
    mostCommon = max(tally, key=tally.get)
    tally.pop(mostCommon)
    return max(tally, key=tally.get)
# get the full name of specific class by the first few letters
# if you want get the fs(font-size), pass in symbol = fs
def getClassName(listOfClass, symbol):
    """Return the first class name that starts with ``symbol``, or ``None``.

    Pass ``symbol='fs'`` to obtain the font-size class, ``'x'`` for the left
    offset class, and so on.
    """
    matches = (name for name in listOfClass if name.startswith(symbol))
    return next(matches, None)
# get selection string
def getSelectStr(divDict, tag):
    """Build a selector string: ``tag`` followed by ``.class`` for each class.

    :param divDict: mapping with a ``'class'`` iterable.
    :param tag: tag name placed at the front of the selector.
    :return: e.g. ``'div.t.m0'`` for classes ``['t', 'm0']``.
    """
    classSuffix = ''.join('.' + str(className) for className in divDict['class'])
    return tag + classSuffix
def _newParagraph(divDict):
    """Start a paragraph record (text, fs, macroFS) from one div record."""
    return {
        'text': divDict['text'],
        'fs': divDict['fs'],
        'macroFS': divDict['macroFS'],
    }


def _extendParagraph(para, divText):
    """Append a line to ``para``; a trailing ``-`` is dropped and the line joined without a space."""
    if para['text'].endswith('-'):
        para['text'] = para['text'][:-1] + divText
    else:
        para['text'] = para['text'] + ' ' + divText


def getTextFromHTML(htmlFilePath):
    """Extract paragraphs from a converted HTML file and write them to disk.

    Steps:

    1. Read the CSS class values and the page container from the file and
       build per-page div records (``getPagesDivClass`` followed by
       ``calBaseFSandLineSpace``).
    2. Walk the divs page by page, skipping any div whose font size is more
       than 2 below the page's ``macroFS`` or whose stripped text is at most
       one character long. Each remaining div either starts a new paragraph
       or is joined to the current one:

       * first kept div of a page: new paragraph if its font size exceeds
         ``macroFS + 5``, if the previous kept div's stripped text has at
         most 2 characters, or if that text does not end with a letter;
         otherwise it continues the paragraph from the previous page;
       * any later div: new paragraph if its ``preSpace`` exceeds
         ``macroLineSpace + 2`` while its font size is within 5 of
         ``macroFS``, or if its font size differs from the previous kept div
         by more than 5; otherwise it is joined.
    3. Classify the paragraphs: those rejected by ``isValidtext`` are dropped;
       those whose font size differs from ``macroFS`` by more than 6 are kept
       wrapped in ``<title>...</title>``; those rejected by
       ``isValidParagraph`` are dropped; the rest are kept unchanged.
       NOTE(review): ``isValidtext`` / ``isValidParagraph`` come from
       ``tool.tool``; their exact criteria are not visible here.
    4. Write every kept paragraph to ``output/outputFile.txt`` followed by a
       ``' -------------'`` separator line. The file is written as UTF-8 so
       non-ASCII text does not depend on the platform's default encoding.

    Progress information is printed to stdout along the way.

    :param htmlFilePath: path of the HTML file to process.
    :return: dict mapping 0-based index -> kept paragraph text.
    :raises OSError: if the input cannot be read or the output file cannot be
        created (the ``output`` directory must already exist).
    """
    text = []
    para = {}
    # get the css in html
    cssText = getCSStext(htmlFilePath)
    # parse css text to dictionary
    classDict = getValueOfClass(cssText)
    soup = getSoup(htmlFilePath)
    print('length of soup = ' + str(len(soup)))
    # store pages in the form of class of each div
    pagesDivClasses = getPagesDivClass(soup, classDict)
    pagesDivClasses = calBaseFSandLineSpace(pagesDivClasses)

    preDivFS = 0
    preDivText = ''
    for index, page in enumerate(pagesDivClasses):
        print('---------------------------------'+"index = "+str(index)+'-----------------------------------------')
        pageFS = page[0]['macroFS']
        lineSpace = page[0]['macroLineSpace']
        print('font size = '+str(pageFS))
        print('line space = '+str(lineSpace))
        firstLineOfPage = True
        for divDict in page.values():
            if not (divDict['fs'] >= pageFS-2 and len(divDict['text'].strip()) > 1):
                # Too small or effectively empty: ignored entirely, so it
                # does not become the "previous div" either.
                continue
            divText = divDict['text']
            if firstLineOfPage:
                firstLineOfPage = False
                print('i ====== 0')
                if divDict['fs'] > pageFS+5:
                    text.append(para)
                    print('para 01 = '+str(para))
                    para = _newParagraph(divDict)
                elif len(preDivText.strip()) <= 2:
                    print('len = '+str(len(preDivText.strip())))
                    text.append(para)
                    print('para 02 = ' + str(para))
                    para = _newParagraph(divDict)
                elif preDivText.strip()[-1].isalpha():
                    # Previous page ended mid-sentence: keep the paragraph going.
                    print('last letter = '+str(preDivText.strip()[-1]))
                    _extendParagraph(para, divText)
                else:
                    text.append(para)
                    para = _newParagraph(divDict)
            else:
                if (divDict['preSpace'] > divDict['macroLineSpace'] + 2
                        and abs(divDict['fs'] - divDict['macroFS']) < 5):
                    text.append(para)
                    print('para 04 = ' + str(para))
                    para = _newParagraph(divDict)
                elif abs(divDict['fs'] - preDivFS) > 5:
                    text.append(para)
                    print('para 05 = ' + str(para))
                    para = _newParagraph(divDict)
                else:
                    _extendParagraph(para, divText)
            preDivFS = divDict['fs']
            preDivText = divDict['text']
    # Flush the paragraph that was still being built.
    text.append(para)

    newText = {}
    i = 0
    for pg in text:
        if 'text' not in pg:
            # The initial empty paragraph placeholder.
            continue
        if not isValidtext(pg['text']):
            continue  # noise
        if abs(pg['fs']-pg['macroFS']) > 6:
            titlePara = {
                'text': '<title>'+pg['text']+'</title>',
                'fs': pg['fs'],
                'macroFS': pg['macroFS'],
                'type': 'title',
            }
            print('pg title = '+str(titlePara))
            newText[i] = titlePara['text']
            i = i + 1
        elif not isValidParagraph(pg['text']):
            continue  # noise
        else:
            newText[i] = pg['text']
            i = i + 1

    # The context manager closes the file even if a write fails.
    with open('output/outputFile.txt', 'w', encoding='utf-8') as outputfile:
        for paragraph in newText.values():
            outputfile.write(paragraph + '\n')
            outputfile.write(' -------------\n')
    return newText
if __name__ == '__main__':
    # Running this module as a script currently does nothing; import it and
    # call getTextFromHTML(path) instead.
    pass