Added more URL sanitization and fixed parsing domain/protocol of URL
quintindunn committed Jun 28, 2024
1 parent 1de98ac commit ee7cce1
Showing 6 changed files with 51 additions and 20 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -161,4 +161,6 @@ cython_debug/
#.idea/

src/dbs/pages.db
to_crawl.json
to_crawl.json
to_crawl.json.old
launch.bat
8 changes: 3 additions & 5 deletions src/crawler/crawler.py
@@ -19,7 +19,7 @@
from .page import Page # noqa
from .robots import does_page_follow_robots_rules # noqa
from .url_checker import check_url_compliance # noqa
from .urls import URLManager # noqa
from .urls import URLManager, get_protocol_and_domain_from_url # noqa

from database import db, page_checker # noqa (Ignore import error)

@@ -115,8 +115,7 @@ def get_page(self, url: str) -> Page | None:
"""
# Perform any checks.

protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]
protocol, domain = get_protocol_and_domain_from_url(url)

domain_model = self.get_domain(domain)

@@ -181,8 +180,7 @@ def step(self) -> Page | None:
url = self.url_manager.get_next_url()

# Check if domain is in domain table.
protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]
protocol, domain = get_protocol_and_domain_from_url(url)

domain_model = (
self.db_session.query(db.DomainModel)
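For context, a minimal sketch of the failure modes the shared helper avoids. old_parse reproduces the two lines this commit removes from get_page() and step(); the sample URLs are illustrative, not taken from the crawler's seed list.

def old_parse(url: str):
    # The inline logic removed above: split off the scheme, then take
    # everything before the first "/" as the domain.
    protocol, _url = url.split("//", 1)
    domain = _url.split("/", 1)[0]
    return protocol, domain

# A query string that starts before the first "/" leaks into the domain:
print(old_parse("https://example.com?page=2/view"))  # ('https:', 'example.com?page=2')

# A URL without "//" crashes with an unpacking error instead of a clear message:
# old_parse("mailto:someone@example.com")  # ValueError: not enough values to unpack

# get_protocol_and_domain_from_url() returns ('https:', 'example.com') for the
# first URL and raises InvalidURLException for the second.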
9 changes: 9 additions & 0 deletions src/crawler/exceptions.py
@@ -16,3 +16,12 @@ def __init__(self, msg: str = ""):

def __str__(self):
return self.msg or "Cannot crawl page, try again later."


class InvalidURLException(Exception):
def __init__(self, msg: str = ""):
super().__init__()
self.msg = msg

def __str__(self):
return self.msg or "Invalid url"
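A quick illustration of how the new exception behaves in hypothetical caller code (the import path assumes src/ is on sys.path):

from crawler.exceptions import InvalidURLException

try:
    raise InvalidURLException()
except InvalidURLException as e:
    print(e)  # "Invalid url" (the default message from __str__)

try:
    raise InvalidURLException('"ftp:example" is not a supported url.')
except InvalidURLException as e:
    print(e)  # the kind of message get_protocol_and_domain_from_url() passes in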
39 changes: 31 additions & 8 deletions src/crawler/urls.py
@@ -1,11 +1,31 @@
try:
from .exceptions import NoUrlException, WaitBeforeRetryException
except ImportError as e:
from exceptions import NoUrlException, WaitBeforeRetryException
from .exceptions import (
NoUrlException,
WaitBeforeRetryException,
InvalidURLException,
)
except ImportError as _:
from exceptions import NoUrlException, WaitBeforeRetryException, InvalidURLException

import random


def get_protocol_and_domain_from_url(url: str):
logger_url_str = f"\"{url[:60]}{'...' if len(url) > 60 else ''}\""
if "//" not in url:
raise InvalidURLException(f"{logger_url_str} is not a supported url.")

protocol, _url = url.split("//", 1)

if "?" in _url and "/" in _url and _url.index("?") < _url.index("/"):
domain = _url.split("?", 1)[0]
elif "/" in _url:
domain = _url.split("/", 1)[0]
else:
domain = _url
return protocol, domain


class URLManager:
def __init__(
self,
@@ -24,8 +44,7 @@ def __init__(
if to_crawl:
self.to_crawl = to_crawl
else:
protocol, _url = seed_url.split("//", 1)
domain = _url.split("/", 1)[0]
_, domain = get_protocol_and_domain_from_url(seed_url)
self.to_crawl[domain] = [seed_url]

def get_next_url(self) -> str:
@@ -36,7 +55,6 @@ def get_next_url(self) -> str:
# Check that we haven't crawled everything.
if len(self.to_crawl) == 0:
raise NoUrlException()

domain_choice = random.choice(list(self.to_crawl.keys()))
current_url = random.choice(self.to_crawl[domain_choice])

@@ -49,12 +67,17 @@ def get_next_url(self) -> str:
raise NoUrlException()

self.enqueued.add(current_url)

return current_url

def add_to_to_crawl_queue(self, url: str, domain: str | None = None):
if not url.lower().startswith("http://") and not url.lower().startswith(
"https://"
):
return

if domain is None:
_url = url.split("//", 1)[1]
domain = _url.split("/", 1)[0]
_, domain = get_protocol_and_domain_from_url(url)

if domain in self.to_crawl.keys():
self.to_crawl[domain].append(url)
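A sketch of what the new extraction yields for a few URL shapes (illustrative URLs; note that the protocol is returned with its trailing colon, and the import assumes src/ is on sys.path):

from crawler.urls import get_protocol_and_domain_from_url

print(get_protocol_and_domain_from_url("https://example.com/a/b"))       # ('https:', 'example.com')
print(get_protocol_and_domain_from_url("https://example.com?p=1/view"))  # ('https:', 'example.com')
print(get_protocol_and_domain_from_url("https://example.com"))           # ('https:', 'example.com')

# With a query but no "/" after the host, the query currently stays attached:
print(get_protocol_and_domain_from_url("https://example.com?p=1"))       # ('https:', 'example.com?p=1')

The new guard in add_to_to_crawl_queue() additionally skips anything that does not start with http:// or https://, so javascript: and mailto: links are dropped silently rather than queued.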
7 changes: 5 additions & 2 deletions src/main.py
@@ -1,6 +1,7 @@
import json
import os.path
import random
import shutil

from crawler import Crawler

@@ -10,7 +11,7 @@
if __name__ == "__main__":
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

with open("seeds.txt", "r") as f:
with open("./seeds.txt", "r") as f:
seeds = [i.strip() for i in f.readlines()]

seed_url = random.choice(seeds)
@@ -29,5 +30,7 @@

to_crawl = crawler.url_manager.to_crawl

with open("to_crawl.json", "w") as f:
if os.path.isfile("./to_crawl.json"):
shutil.move("./to_crawl.json", "./to_crawl.json.old")
with open("./to_crawl.json", "w") as f:
json.dump(to_crawl, f)
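The save step now rotates the previous queue dump instead of overwriting it; a standalone sketch of the same pattern (queue contents are placeholder data):

import json
import os.path
import shutil

to_crawl = {"example.com": ["https://example.com/"]}  # placeholder queue data

# Keep one generation of backup: the previous dump becomes to_crawl.json.old
# (both files are newly listed in .gitignore in this commit). shutil.move
# replaces an existing .old file, so only the most recent backup survives.
if os.path.isfile("./to_crawl.json"):
    shutil.move("./to_crawl.json", "./to_crawl.json.old")
with open("./to_crawl.json", "w") as f:
    json.dump(to_crawl, f)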
4 changes: 0 additions & 4 deletions src/seeds.txt
@@ -69,7 +69,6 @@ https://aka.ms
https://developer.microsoft.com
https://learn.microsoft.com
https://support.microsoft.com
https://www.onenote.com
https://azure.microsoft.com
https://techcommunity.microsoft.com
https://azuremarketplace.microsoft.com
@@ -114,7 +113,6 @@ https://jobs.opera.com
https://support.scribd.com
https://breezy-gallery.imgix.net
https://auth.opera.com
https://www.google.com
https://refit.sourceforge.net
https://github.blog
https://psychcentral.com
@@ -126,7 +124,6 @@ https://phandroid.com
https://www.ip-adress.com
https://www.textbroker.de
https://scholar.google.com
https://olduli.nli.org.il
https://codeql.com
https://itc.ua
https://www.ynet.co.il
@@ -157,7 +154,6 @@ https://partners.elastic.co
https://portal.acm.org
https://www.aljazeera.com
https://www.ebizmba.com
https://login.elastic.co
https://cloud.elastic.co
https://techcrunch.com
https://bambots.brucemyers.com
