-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
126 lines (109 loc) · 4.16 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import math
import os
import re
from collections import *
from os import path

from bs4 import BeautifulSoup
from bs4.element import NavigableString
from bs4.element import Tag

from character import Character
def is_all_spaces(s):
    """Return True if *s* holds no text other than an optional parenthetical.

    After leading whitespace is removed, the line counts as "all spaces" when
    nothing is left, or when what is left begins with an opening parenthesis
    (for example ``"   (beat)"``).
    """
    stripped = s.lstrip()
    # startswith() only inspects the first character, unlike find(), which
    # scans the whole line when there is no parenthesis at all.
    return not stripped or stripped.startswith('(')
def num_leading_spaces(s):
    """Return the number of whitespace characters at the start of *s*.

    For an empty or whitespace-only string this is the full length of *s*.
    """
    without_indent = s.lstrip()
    return len(s) - len(without_indent)
def is_number(s):
    """Return True if *s* can be parsed as a finite number.

    Parsing is delegated to ``float()``. Two cases that ``float()`` alone
    handles unhelpfully are treated as "not a number":

    * ``"nan"``, ``"inf"`` and ``"infinity"`` (any case, optional sign) parse
      successfully but are ordinary words, so they are rejected here.
    * Values ``float()`` cannot accept at all (e.g. ``None``) return False
      instead of raising ``TypeError``.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    return math.isfinite(value)
def format_char_name(s):
    """Return the cleaned-up character name contained in *s*, or ``''``.

    Everything from the first opening parenthesis onward is discarded
    (``"JOHN (V.O.)"`` becomes ``"JOHN"``) and surrounding whitespace is
    stripped. An empty string is returned when nothing is left or when the
    remaining text is just a number according to ``is_number``.
    """
    paren_index = s.find('(')
    # find() returns -1 when there is no parenthesis; index 0 is a real match
    # and means the text is nothing but a parenthetical such as "(V.O.)".
    name = s if paren_index == -1 else s[:paren_index]
    name = name.strip()
    # Check for emptiness first so the numeric test only sees real text, and
    # apply the numeric test whether or not a parenthetical was removed.
    if not name or is_number(name):
        return ''
    return name
def get_dialogue_leading_spaces(script, min_occurrences=100):
    """Return the indentation width assumed to mark dialogue lines.

    Every non-``Tag`` child in ``script.contents`` is split into lines, and
    the leading-whitespace width of each line is tallied. Lines for which
    ``is_all_spaces`` is true are left out of the tally. The result is the
    largest width that occurs more than *min_occurrences* times, or 0 when no
    width is that frequent.

    Parameters:
        script: an object with a ``contents`` sequence whose non-``Tag``
            elements are strings.
        min_occurrences: a width must occur strictly more often than this to
            be considered. The frequency requirement keeps rare, deeply
            indented text (such as right-aligned text at the beginning of a
            script) from being picked.
    """
    indent_counts = Counter()
    for item in script.contents:
        # Tags are not text; only string children are measured.
        if isinstance(item, Tag):
            continue
        indent_counts.update(
            num_leading_spaces(line)
            for line in item.split('\n')
            if not is_all_spaces(line)
        )

    frequent_widths = [
        width for width, count in indent_counts.items()
        if count > min_occurrences
    ]
    return max(frequent_widths, default=0)
def all_words_in(bloblist):
    """Return the set of all words found across the blobs in *bloblist*.

    Each element is read through its ``words`` attribute; an empty
    *bloblist* gives an empty set.
    """
    vocabulary = set()
    for blob in bloblist:
        vocabulary.update(blob.words)
    return vocabulary
def print_cos_sim(characters):
    """Print, for each character, every character ranked by similarity.

    First calls ``gen_tf_idf_vec(bloblist, all_words)`` on every character,
    printing each name as it is processed. Then, for each character, prints a
    header followed by one numbered line per character in *characters*,
    ordered from highest to lowest ``cosine_sim`` value and formatted to four
    decimal places.

    Parameters:
        characters: sequence of objects providing ``name``, ``blob``,
            ``gen_tf_idf_vec`` and ``cosine_sim``.
    """
    bloblist = [char.blob for char in characters]
    all_words = all_words_in(bloblist)
    for char in characters:
        char.gen_tf_idf_vec(bloblist, all_words)
        print(char.name)

    for c1 in characters:
        print('\n========== ' + c1.name + ' ==========')
        # Keep each score next to its character object instead of in a dict
        # keyed by name: two characters that share a name would otherwise
        # overwrite each other and both be listed with the same score.
        scored = [(c1.cosine_sim(c2), c2) for c2 in characters]
        # Stable sort on the negated score: highest first, ties keep the
        # order they have in `characters`.
        scored.sort(key=lambda pair: -pair[0])
        for rank, (similarity, c2) in enumerate(scored, start=1):
            label = str(rank) + '. ' + c2.name
            print(label.ljust(20) + '{:.4f}'.format(similarity))
def scrape_characters(filepath):
    """Extract the speaking characters from one HTML script file.

    The file is parsed with BeautifulSoup (``html5lib``) and the innermost
    ``<pre>`` element is taken as the script body. Its children are walked in
    order: a ``Tag`` child whose text yields a non-empty name from
    ``format_char_name`` sets the current speaker; a string child contributes
    one block of dialogue, built from those of its lines whose indentation
    equals the width returned by ``get_dialogue_leading_spaces``.

    Parameters:
        filepath: path of the HTML file to read.

    Returns:
        A list of ``Character(name.title(), blocks)`` objects, one for every
        speaker with more than 10 dialogue blocks. The list is empty when the
        document has no ``<pre>`` element.
    """
    # errors='replace' keeps one undecodable byte from aborting the whole
    # scrape; files that already decoded cleanly are read exactly as before.
    with open(filepath, 'r', errors='replace') as f:
        soup = BeautifulSoup(f.read(), 'html5lib')

    # Descend through nested <pre> tags until the deepest one is reached.
    script = soup.find('pre')
    if script is None:
        return []
    inner = script.find('pre')
    while inner is not None:
        script = inner
        inner = script.find('pre')

    nls_dialogue = get_dialogue_leading_spaces(script)
    char_lines = {}
    current_person = ''
    for item in script.contents:
        if isinstance(item, Tag):
            # A tag names the speaker of the text that follows it; tags that
            # yield no usable name leave the current speaker unchanged.
            char_name = format_char_name(item.text)
            if char_name:
                current_person = char_name
            continue

        # Text seen before any speaker has been identified belongs to no one.
        if not current_person:
            continue

        dialogue = ' '.join(
            line.strip()
            for line in item.split('\n')
            if num_leading_spaces(line) == nls_dialogue
        )
        # Remove extraneous spaces
        dialogue = re.sub(' +', ' ', dialogue).strip()
        if dialogue:
            char_lines.setdefault(current_person, []).append(dialogue)

    return [
        Character(name.title(), lines)
        for name, lines in char_lines.items()
        if len(lines) > 10
    ]
# Scrape every regular file in ./scripts/ and report how many characters
# were collected in total. Names are sorted because os.listdir() returns
# entries in arbitrary order, and directories are skipped because
# scrape_characters() opens each path as a file.
files = sorted(
    name for name in os.listdir('./scripts/')
    if os.path.isfile(os.path.join('./scripts/', name))
)
characters = []
for filename in files:
    characters.extend(scrape_characters(os.path.join('./scripts/', filename)))
print(len(characters))
# Uncomment to print the pairwise similarity rankings as well.
#print_cos_sim(characters)