address_spider.py
from scrapy import Spider, Request
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
from extract2 import parsing_functions_addresses
from ner import recognize_m
from urllib.parse import urlparse
import pandas as pd

# Pages scraped so far, keyed by domain (shared across all responses).
domain_counts = {}


class AddressSpider(Spider):
    '''
    AddressSpider crawls the given list of URLs and extracts the addresses,
    similarly to extract2.py but much faster.

    max_urls is the number of URLs taken from the parquet file.

    Run with:
        scrapy runspider address_spider.py
    '''
    name = 'quote-spider'
    max_urls = 50
    # Maximum link depth to follow from the start URLs.
    depth = 3
    parsing_fct = parsing_functions_addresses

    # Build the start URLs from the 'domain' column of list.parquet.
    parquet_urls = pd.read_parquet('list.parquet', engine='pyarrow')
    start_urls = []
    for _, row in parquet_urls.head(max_urls).iterrows():
        start_urls.append(f"https://{row['domain']}")

    with open("spider_logs/TEST.txt", 'w') as log:
        print(start_urls, file=log)
    # Start each run with an empty address log.
    open("spider_logs/ADDRESSES.txt", 'w').close()

    def parse(self, response):
        '''
        Parsing callback, similar to ex(...) from extract2.
        '''
        links = LinkExtractor(allow=()).extract_links(response)
        try:
            soup = BeautifulSoup(response.body, 'html.parser')
        except Exception:
            self.logger.warning(f"Could not parse (possibly non-UTF chars): {response.url}")
            return

        # Tokenize the page and run the recognizer over the tokens.
        keyed_tokens = parsing_functions_addresses(soup=soup)
        data = recognize_m(keyed_tokens)

        # Limit the crawl to 10 pages per domain.
        domain = urlparse(response.url).netloc
        self.logger.debug(f"Parsed page on {domain}")
        domain_counts[domain] = domain_counts.get(domain, 0) + 1
        if domain_counts[domain] > 10:
            return

        # Index of this domain in start_urls, or "outside" for off-site pages.
        try:
            index = self.start_urls.index(f"https://{domain}")
        except ValueError:
            index = "outside"

        # Log the tokenized page and the recognized addresses.
        with open("spider_logs/ADDRESSES.txt", 'a') as log:
            print(f"> [{index}] {response.url}", file=log)
            print("\t\ttokenized::recognized", file=log)
            for el in data:
                # el = [x for x in el if x[1] != "STATE_FULL"]
                print(f"\t\t{keyed_tokens}::{el}", file=log)
            print("\n", file=log)

        # Follow the extracted links while still within the configured crawl
        # depth (Scrapy's DepthMiddleware records the depth in response.meta).
        if response.meta.get('depth', 0) < self.depth:
            for link in links:
                yield Request(link.url, callback=self.parse)
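

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original spider: the class above only
# assumes that list.parquet has a 'domain' column and that a spider_logs/
# directory exists. A tiny fixture for a local test run could be produced like
# this (the domains below are placeholders):
#
#     import os
#     import pandas as pd
#
#     os.makedirs("spider_logs", exist_ok=True)
#     pd.DataFrame({"domain": ["example.com", "example.org"]}).to_parquet(
#         "list.parquet", engine="pyarrow")
#
# After that, `scrapy runspider address_spider.py` runs the crawl.
# ---------------------------------------------------------------------------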