Added more URL sanitization and fixed parsing domain/protocol of URL
quintindunn committed Jun 28, 2024
1 parent 1de98ac commit ee7cce1
Showing 6 changed files with 51 additions and 20 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -161,4 +161,6 @@ cython_debug/
#.idea/

src/dbs/pages.db
to_crawl.json
to_crawl.json
to_crawl.json.old
launch.bat
8 changes: 3 additions & 5 deletions src/crawler/crawler.py
@@ -19,7 +19,7 @@
from .page import Page # noqa
from .robots import does_page_follow_robots_rules # noqa
from .url_checker import check_url_compliance # noqa
from .urls import URLManager # noqa
from .urls import URLManager, get_protocol_and_domain_from_url # noqa

from database import db, page_checker # noqa (Ignore import error)

@@ -115,8 +115,7 @@ def get_page(self, url: str) -> Page | None:
"""
# Perform any checks.

protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]
protocol, domain = get_protocol_and_domain_from_url(url)

domain_model = self.get_domain(domain)

@@ -181,8 +180,7 @@ def step(self) -> Page | None:
url = self.url_manager.get_next_url()

# Check if domain is in domain table.
protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]
protocol, domain = get_protocol_and_domain_from_url(url)

domain_model = (
self.db_session.query(db.DomainModel)
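For context, a minimal sketch of the failure modes the shared helper avoids. old_parse reproduces the two lines this commit removes from get_page() and step(); the sample URLs are illustrative, not taken from the crawler's seed list.

def old_parse(url: str):
    # The inline logic removed above: split off the scheme, then take
    # everything before the first "/" as the domain.
    protocol, _url = url.split("//", 1)
    domain = _url.split("/", 1)[0]
    return protocol, domain

# A query string that starts before the first "/" leaks into the domain:
print(old_parse("https://example.com?page=2/view"))  # ('https:', 'example.com?page=2')

# A URL without "//" crashes with an unpacking error instead of a clear message:
# old_parse("mailto:someone@example.com")  # ValueError: not enough values to unpack

# get_protocol_and_domain_from_url() returns ('https:', 'example.com') for the
# first URL and raises InvalidURLException for the second.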
9 changes: 9 additions & 0 deletions src/crawler/exceptions.py
@@ -16,3 +16,12 @@ def __init__(self, msg: str = ""):

def __str__(self):
return self.msg or "Cannot crawl page, try again later."


class InvalidURLException(Exception):
def __init__(self, msg: str = ""):
super().__init__()
self.msg = msg

def __str__(self):
return self.msg or "Invalid url"
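A quick illustration of how the new exception behaves in hypothetical caller code (the import path assumes src/ is on sys.path):

from crawler.exceptions import InvalidURLException

try:
    raise InvalidURLException()
except InvalidURLException as e:
    print(e)  # "Invalid url" (the default message from __str__)

try:
    raise InvalidURLException('"ftp:example" is not a supported url.')
except InvalidURLException as e:
    print(e)  # the kind of message get_protocol_and_domain_from_url() passes in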
39 changes: 31 additions & 8 deletions src/crawler/urls.py
@@ -1,11 +1,31 @@
try:
from .exceptions import NoUrlException, WaitBeforeRetryException
except ImportError as e:
from exceptions import NoUrlException, WaitBeforeRetryException
from .exceptions import (
NoUrlException,
WaitBeforeRetryException,
InvalidURLException,
)
except ImportError as _:
from exceptions import NoUrlException, WaitBeforeRetryException, InvalidURLException

import random


def get_protocol_and_domain_from_url(url: str):
logger_url_str = f"\"{url[:60]}{'...' if len(url) > 60 else ''}\""
if "//" not in url:
raise InvalidURLException(f"{logger_url_str} is not a supported url.")

protocol, _url = url.split("//", 1)

if "?" in _url and "/" in _url and _url.index("?") < _url.index("/"):
domain = _url.split("?", 1)[0]
elif "/" in _url:
domain = _url.split("/", 1)[0]
else:
domain = _url
return protocol, domain


class URLManager:
def __init__(
self,
@@ -24,8 +44,7 @@ def __init__(
if to_crawl:
self.to_crawl = to_crawl
else:
protocol, _url = seed_url.split("//", 1)
domain = _url.split("/", 1)[0]
_, domain = get_protocol_and_domain_from_url(seed_url)
self.to_crawl[domain] = [seed_url]

def get_next_url(self) -> str:
@@ -36,7 +55,6 @@ def get_next_url(self) -> str:
# Check that we haven't crawled everything.
if len(self.to_crawl) == 0:
raise NoUrlException()

domain_choice = random.choice(list(self.to_crawl.keys()))
current_url = random.choice(self.to_crawl[domain_choice])

@@ -49,12 +67,17 @@ def get_next_url(self) -> str:
raise NoUrlException()

self.enqueued.add(current_url)

return current_url

def add_to_to_crawl_queue(self, url: str, domain: str | None = None):
if not url.lower().startswith("http://") and not url.lower().startswith(
"https://"
):
return

if domain is None:
_url = url.split("//", 1)[1]
domain = _url.split("/", 1)[0]
_, domain = get_protocol_and_domain_from_url(url)

if domain in self.to_crawl.keys():
self.to_crawl[domain].append(url)
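A sketch of what the new extraction yields for a few URL shapes (illustrative URLs; note that the protocol is returned with its trailing colon, and the import assumes src/ is on sys.path):

from crawler.urls import get_protocol_and_domain_from_url

print(get_protocol_and_domain_from_url("https://example.com/a/b"))       # ('https:', 'example.com')
print(get_protocol_and_domain_from_url("https://example.com?p=1/view"))  # ('https:', 'example.com')
print(get_protocol_and_domain_from_url("https://example.com"))           # ('https:', 'example.com')

# With a query but no "/" after the host, the query currently stays attached:
print(get_protocol_and_domain_from_url("https://example.com?p=1"))       # ('https:', 'example.com?p=1')

The new guard in add_to_to_crawl_queue() additionally skips anything that does not start with http:// or https://, so javascript: and mailto: links are dropped silently rather than queued.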
7 changes: 5 additions & 2 deletions src/main.py
@@ -1,6 +1,7 @@
import json
import os.path
import random
import shutil

from crawler import Crawler

@@ -10,7 +11,7 @@
if __name__ == "__main__":
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

with open("seeds.txt", "r") as f:
with open("./seeds.txt", "r") as f:
seeds = [i.strip() for i in f.readlines()]

seed_url = random.choice(seeds)
@@ -29,5 +30,7 @@

to_crawl = crawler.url_manager.to_crawl

with open("to_crawl.json", "w") as f:
if os.path.isfile("./to_crawl.json"):
shutil.move("./to_crawl.json", "./to_crawl.json.old")
with open("./to_crawl.json", "w") as f:
json.dump(to_crawl, f)
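The save step now rotates the previous queue dump instead of overwriting it; a standalone sketch of the same pattern (queue contents are placeholder data):

import json
import os.path
import shutil

to_crawl = {"example.com": ["https://example.com/"]}  # placeholder queue data

# Keep one generation of backup: the previous dump becomes to_crawl.json.old
# (both files are newly listed in .gitignore in this commit). shutil.move
# replaces an existing .old file, so only the most recent backup survives.
if os.path.isfile("./to_crawl.json"):
    shutil.move("./to_crawl.json", "./to_crawl.json.old")
with open("./to_crawl.json", "w") as f:
    json.dump(to_crawl, f)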
4 changes: 0 additions & 4 deletions src/seeds.txt
@@ -69,7 +69,6 @@ https://aka.ms
https://developer.microsoft.com
https://learn.microsoft.com
https://support.microsoft.com
https://www.onenote.com
https://azure.microsoft.com
https://techcommunity.microsoft.com
https://azuremarketplace.microsoft.com
@@ -114,7 +113,6 @@ https://jobs.opera.com
https://support.scribd.com
https://breezy-gallery.imgix.net
https://auth.opera.com
https://www.google.com
https://refit.sourceforge.net
https://github.blog
https://psychcentral.com
@@ -126,7 +124,6 @@ https://phandroid.com
https://www.ip-adress.com
https://www.textbroker.de
https://scholar.google.com
https://olduli.nli.org.il
https://codeql.com
https://itc.ua
https://www.ynet.co.il
@@ -157,7 +154,6 @@ https://partners.elastic.co
https://portal.acm.org
https://www.aljazeera.com
https://www.ebizmba.com
https://login.elastic.co
https://cloud.elastic.co
https://techcrunch.com
https://bambots.brucemyers.com
