urls.py
import random

try:
    from .exceptions import (
        NoUrlException,
        WaitBeforeRetryException,
        InvalidURLException,
    )
except ImportError:
    # Fallback for running the module outside of its package.
    from exceptions import (
        NoUrlException,
        WaitBeforeRetryException,
        InvalidURLException,
    )


def get_protocol_and_domain_from_url(url: str) -> tuple[str, str]:
    """
    Splits a URL into its protocol (scheme plus trailing colon) and its domain.
    :return: Tuple of (protocol, domain), e.g. ("https:", "example.com").
    """
    logger_url_str = f"\"{url[:60]}{'...' if len(url) > 60 else ''}\""
    if "//" not in url:
        raise InvalidURLException(f"{logger_url_str} is not a supported url.")
    protocol, _url = url.split("//", 1)  # "https:", "example.com/test"
    if "?" in _url and "/" in _url and _url.index("?") < _url.index("/"):
        # The query string starts before any path, e.g. "example.com?q=1/2".
        domain = _url.split("?", 1)[0]
    elif "?" in _url and "/" not in _url:
        # A query string but no path, e.g. "example.com?q=1".
        domain = _url.split("?", 1)[0]
    elif "/" in _url:
        # A path and no earlier query string, e.g. "example.com/test".
        domain = _url.split("/", 1)[0]
    else:
        # Bare domain, e.g. "example.com".
        domain = _url
    return protocol, domain
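
# A minimal usage sketch (not part of the original module): the URL below is an
# illustrative assumption showing the expected (protocol, domain) split.
# >>> get_protocol_and_domain_from_url("https://example.com/path?q=1")
# ('https:', 'example.com')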


class URLManager:
    """Tracks URLs that have already been enqueued and URLs still to crawl, grouped by domain."""

    def __init__(
        self,
        seed_url: str,
        crawled: set[str] | None = None,
        to_crawl: dict[str, list[str]] | None = None,
    ):
        self.seed_url: str | None = seed_url or None
        # All URLs that have been crawled or are already queued to crawl.
        self.enqueued: set[str] = crawled or set()
        # URLs still to crawl, keyed by domain.
        self.to_crawl: dict[str, list[str]] = {}
        self.url_count = 0
        if to_crawl:
            self.to_crawl = to_crawl
        else:
            _, domain = get_protocol_and_domain_from_url(seed_url)
            self.to_crawl[domain] = [seed_url]

    def get_next_url(self) -> str:
        """
        Gets the next URL to crawl and updates URLManager.enqueued.
        :return: Next URL to crawl.
        """
        # Check that we haven't crawled everything.
        if len(self.to_crawl) == 0:
            raise NoUrlException()
        # Pick a random domain, then a random URL within that domain.
        domain_choice = random.choice(list(self.to_crawl.keys()))
        current_url = random.choice(self.to_crawl[domain_choice])
        self.to_crawl[domain_choice].remove(current_url)
        # Drop the domain once its queue is empty.
        if len(self.to_crawl[domain_choice]) == 0:
            del self.to_crawl[domain_choice]
        if current_url is None:
            raise NoUrlException()
        self.enqueued.add(current_url)
        return current_url

    def add_to_to_crawl_queue(self, url: str, domain: str | None = None):
        """Adds a single absolute http(s) URL to the to-crawl queue under its domain."""
        # Ignore anything that is not an absolute http(s) URL.
        if not url.lower().startswith(("http://", "https://")):
            return
        if domain is None:
            _, domain = get_protocol_and_domain_from_url(url)
        if domain in self.to_crawl:
            self.to_crawl[domain].append(url)
        else:
            self.to_crawl[domain] = [url]

    def add_many_to_to_crawl_queue(self, urls: set[str]):
        """Adds every URL that has not already been enqueued or crawled."""
        urls_to_add = urls - self.enqueued
        for url in urls_to_add:
            self.add_to_to_crawl_queue(url)
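

# A minimal usage sketch (an assumption, not part of the original module): the seed
# URL and discovered links below are hypothetical and only illustrate the API.
if __name__ == "__main__":
    manager = URLManager("https://example.com")
    manager.add_many_to_to_crawl_queue(
        {"https://example.com/about", "https://other.example.org/"}
    )
    # Drain the queue, printing each URL as the crawler would receive it.
    while True:
        try:
            print(manager.get_next_url())
        except NoUrlException:
            break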