Merge pull request #6 from quintindunn/crawler
Merge Crawler branch to main
quintindunn authored Jun 28, 2024
2 parents cace368 + 35b1319 commit 9b5869e
Showing 10 changed files with 164 additions and 27 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -160,5 +160,9 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/dbs/pages.db-journal
src/dbs/pages.db
to_crawl.json

to_crawl.json
to_crawl.json.old
launch.bat
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
requests
lxml
SQLAlchemy
SQLAlchemy
psutil
8 changes: 3 additions & 5 deletions src/crawler/crawler.py
@@ -19,7 +19,7 @@
from .page import Page # noqa
from .robots import does_page_follow_robots_rules # noqa
from .url_checker import check_url_compliance # noqa
from .urls import URLManager # noqa
from .urls import URLManager, get_protocol_and_domain_from_url # noqa

from database import db, page_checker # noqa (Ignore import error)

@@ -115,8 +115,7 @@ def get_page(self, url: str) -> Page | None:
"""
# Perform any checks.

protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]
protocol, domain = get_protocol_and_domain_from_url(url)

domain_model = self.get_domain(domain)

@@ -181,8 +180,7 @@ def step(self) -> Page | None:
        url = self.url_manager.get_next_url()

        # Check if domain is in domain table.
        protocol, _url = url.split("//", 1)
        domain = _url.split("/", 1)[0]
        protocol, domain = get_protocol_and_domain_from_url(url)

        domain_model = (
            self.db_session.query(db.DomainModel)
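
For context, both hunks above swap the inline url.split("//", 1) parsing for the get_protocol_and_domain_from_url helper introduced later in this diff (src/crawler/urls.py). A minimal sketch of why the shared helper matters, using a hypothetical URL whose query string contains a slash; the import path is assumed from the package layout in this PR:

# Sketch only; "https://example.com?q=a/b" is a made-up test URL.
url = "https://example.com?q=a/b"

# Old inline logic from crawler.py: the query string leaks into the "domain".
_protocol, _rest = url.split("//", 1)
old_domain = _rest.split("/", 1)[0]  # "example.com?q=a"

# New shared helper (defined in src/crawler/urls.py below):
# from crawler.urls import get_protocol_and_domain_from_url
# protocol, domain = get_protocol_and_domain_from_url(url)  # ("https:", "example.com")
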
7 changes: 5 additions & 2 deletions src/crawler/crawleroptions.py
@@ -27,11 +27,14 @@ def __init__(self):


class DefaultCrawlerOptions(BaseCrawlerOptions):
    def __init__(self):
    def __init__(
        self,
        ignored_file_extensions_path: str = "./configs/ignored_file_extensions.txt",
    ):
        super().__init__()

        self.ua: str = "OWS-CRAWLER/0.1-DEV (https://github.com/quintindunn/OWS)"

        with open("./configs/ignored_file_extensions.txt", "r") as f:
        with open(ignored_file_extensions_path, "r") as f:
            extensions = f.readlines()[1:]
            self.ignored_url_endings = set(extensions)
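
The constructor above now takes the ignored-extensions file as a parameter instead of hard-coding ./configs/ignored_file_extensions.txt. A hedged usage sketch, assuming the script runs from src/ so the package and default config path resolve; the fixture path is hypothetical:

# Assumes execution from src/; the fixture path below is made up for illustration.
from crawler.crawleroptions import DefaultCrawlerOptions

# Default behaviour is unchanged: reads ./configs/ignored_file_extensions.txt.
default_options = DefaultCrawlerOptions()

# Callers (e.g. tests) can now supply their own extension list.
test_options = DefaultCrawlerOptions(
    ignored_file_extensions_path="./tests/fixtures/ignored_file_extensions.txt"
)
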
18 changes: 18 additions & 0 deletions src/crawler/exceptions.py
@@ -16,3 +16,21 @@ def __init__(self, msg: str = ""):

    def __str__(self):
        return self.msg or "Cannot crawl page, try again later."


class InvalidURLException(Exception):
    def __init__(self, msg: str = ""):
        super().__init__()
        self.msg = msg

    def __str__(self):
        return self.msg or "Invalid url"


class CouldntFindNetworkInfoException(Exception):
    def __init__(self, msg: str = ""):
        super().__init__()
        self.msg = msg

    def __str__(self):
        return self.msg or "Couldn't retrieve private network information."
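
Both new exceptions follow the pattern of the existing one: an optional message with a default fallback in __str__. A small sketch of how InvalidURLException surfaces from the URL helper added below in src/crawler/urls.py; the import paths are assumed:

# Illustrative only; assumes the package is importable from src/.
from crawler.exceptions import InvalidURLException
from crawler.urls import get_protocol_and_domain_from_url

try:
    get_protocol_and_domain_from_url("not-a-url")  # no "//", so it is rejected
except InvalidURLException as exc:
    print(exc)  # '"not-a-url" is not a supported url.'
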
74 changes: 74 additions & 0 deletions src/crawler/networking.py
@@ -0,0 +1,74 @@
try:
    from .exceptions import CouldntFindNetworkInfoException
    from .urls import get_protocol_and_domain_from_url
except ImportError as e:
    from exceptions import CouldntFindNetworkInfoException
    from urls import get_protocol_and_domain_from_url

from ipaddress import IPv4Network, IPv4Address
import socket
import psutil


def _get_network_info() -> tuple[str, str]:
    """
    Gets the information on your network adapters
    :return:
    """
    net_if_addrs = psutil.net_if_addrs().items()

    if len(net_if_addrs) == 0:
        raise CouldntFindNetworkInfoException()

    for iface, addrs in psutil.net_if_addrs().items():
        for addr in addrs:
            if addr.family == socket.AF_INET:
                network_ip = IPv4Network(
                    f"{addr.address}/{addr.netmask}", strict=False
                ).network_address
                yield network_ip, addr.netmask


def _is_ip_in_range(ip: IPv4Address, network_ip: str, subnet_mask: str) -> bool:
    """
    Checks if ip address is in an ip range.
    :param ip: Ip to check
    :param network_ip: Ip of the network
    :param subnet_mask: subnet mask of the network
    :return: True if the ip is in the range.
    """
    network = IPv4Network(f"{network_ip}/{subnet_mask}", strict=False)
    ip_addr = IPv4Address(ip)
    return ip_addr in network


def _is_ip_private(ip: IPv4Address) -> bool:
    """
    Checks if an ip address is private
    :param ip: Ip address to check
    :return: True if ip is in the private range
    """
    for network_ip, subnet_mask in _get_network_info():
        if _is_ip_in_range(ip, network_ip, subnet_mask) or ip.is_private:
            return True
    return False


def _resolve_domain_to_ip(domain: str) -> IPv4Address:
    """
    Performs a DNS lookup to get the IP address of a domain.
    :param domain: The domain to lookup
    :return: IPv4Address object of the domain
    """
    ip_address = IPv4Address(socket.gethostbyname(domain))
    return ip_address


def is_host_private(host: str) -> bool:
    """
    Checks if a host resolves to a private ip address.
    :param host: Host to check
    :return: True if the host resolves to a private ip address.
    """
    ip = _resolve_domain_to_ip(host)
    return _is_ip_private(ip)
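
is_host_private is the module's public entry point; the underscore-prefixed functions are helpers. The check is used by url_checker.py (next file) to keep the crawler away from hosts that resolve into the local network. A hedged usage sketch, assuming DNS is available and the package import path from this PR; the hostnames are illustrative:

# Illustrative only; assumes execution from src/ and working DNS.
import socket

from crawler.networking import is_host_private

for host in ("localhost", "example.com"):
    try:
        print(host, "private" if is_host_private(host) else "public")
    except socket.gaierror:
        # Unresolvable hosts raise socket.gaierror; the URL checker below
        # treats that as non-compliant.
        print(host, "could not resolve")
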
20 changes: 17 additions & 3 deletions src/crawler/url_checker.py
@@ -1,7 +1,13 @@
import socket

try:
    from .crawleroptions import BaseCrawlerOptions
    from .urls import get_protocol_and_domain_from_url
    from .networking import is_host_private
except ImportError as e:
    from crawleroptions import BaseCrawlerOptions
    from crawleroptions import BaseCrawlerOptions, DefaultCrawlerOptions
    from urls import get_protocol_and_domain_from_url
    from networking import is_host_private


def check_url_compliance(crawler_options: BaseCrawlerOptions, url: str) -> bool:
@@ -11,17 +11,25 @@ def check_url_compliance(crawler_options: BaseCrawlerOptions, url: str) -> bool:
    :param url: URL to check.
    :return: True if the URL complies with rules, otherwise False.
    """

    # Check file ending
    segments = url.replace("//", "/").split("/")

    # Check to see if these checks even apply:
    if len(segments) == 2 or "." not in segments[-1]:
        return True
    if not (len(segments) == 2 or "." not in segments[-1]):
        return False

    ending_segment = segments[-1]
    path_ending = ending_segment.split(".")[-1]

    if path_ending in crawler_options.ignored_url_endings:
        return False

    try:
        _, domain = get_protocol_and_domain_from_url(url)
        if is_host_private(host=domain):
            return False
    except socket.gaierror:
        return False

    return True
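
With these additions, check_url_compliance rejects URLs whose final path segment looks like a file, URLs whose host resolves to a private address, and hosts that fail DNS resolution. A hedged usage sketch, assuming execution from src/ (so DefaultCrawlerOptions finds its config) and working DNS; the URLs are illustrative:

# Illustrative only; results depend on the ignored-extensions config and on DNS.
from crawler.crawleroptions import DefaultCrawlerOptions
from crawler.url_checker import check_url_compliance

options = DefaultCrawlerOptions()

urls = [
    "https://example.com/articles",  # plain path, passes the file-ending guard
    "https://example.com/logo.png",  # file-like ending, rejected by the guard
    "http://192.168.0.10/admin",     # resolves to a private address, rejected
]

for url in urls:
    print(url, check_url_compliance(options, url))
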
41 changes: 33 additions & 8 deletions src/crawler/urls.py
@@ -1,11 +1,33 @@
try:
    from .exceptions import NoUrlException, WaitBeforeRetryException
except ImportError as e:
    from exceptions import NoUrlException, WaitBeforeRetryException
    from .exceptions import (
        NoUrlException,
        WaitBeforeRetryException,
        InvalidURLException,
    )
except ImportError as _:
    from exceptions import NoUrlException, WaitBeforeRetryException, InvalidURLException

import random


def get_protocol_and_domain_from_url(url: str):
    logger_url_str = f"\"{url[:60]}{'...' if len(url) > 60 else ''}\""
    if "//" not in url:
        raise InvalidURLException(f"{logger_url_str} is not a supported url.")

    protocol, _url = url.split("//", 1)  # https:, example.com/test

    if "?" in _url and "/" in _url and _url.index("?") < _url.index("/"):
        domain = _url.split("?", 1)[0]
    elif "?" in url and "/" not in url:
        domain = _url.split("?", 1)[0]
    elif "/" in _url:
        domain = _url.split("/", 1)[0]
    else:
        domain = _url
    return protocol, domain


class URLManager:
    def __init__(
        self,
@@ -24,8 +46,7 @@ def __init__(
        if to_crawl:
            self.to_crawl = to_crawl
        else:
            protocol, _url = seed_url.split("//", 1)
            domain = _url.split("/", 1)[0]
            _, domain = get_protocol_and_domain_from_url(seed_url)
            self.to_crawl[domain] = [seed_url]

    def get_next_url(self) -> str:
@@ -36,7 +57,6 @@ def get_next_url(self) -> str:
        # Check that we haven't crawled everything.
        if len(self.to_crawl) == 0:
            raise NoUrlException()

        domain_choice = random.choice(list(self.to_crawl.keys()))
        current_url = random.choice(self.to_crawl[domain_choice])

@@ -49,12 +69,17 @@ def get_next_url(self) -> str:
            raise NoUrlException()

        self.enqueued.add(current_url)

        return current_url

    def add_to_to_crawl_queue(self, url: str, domain: str | None = None):
        if not url.lower().startswith("http://") and not url.lower().startswith(
            "https://"
        ):
            return

        if domain is None:
            _url = url.split("//", 1)[1]
            domain = _url.split("/", 1)[0]
            _, domain = get_protocol_and_domain_from_url(url)

        if domain in self.to_crawl.keys():
            self.to_crawl[domain].append(url)
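
get_protocol_and_domain_from_url is now the single place where protocol/domain parsing happens, and add_to_to_crawl_queue silently drops anything that is not http(s). A small sketch of the helper's behaviour on a few made-up URL shapes, following the branches shown above; the import paths are assumed:

# Illustrative only; expected outputs follow the branches of the helper above.
from crawler.exceptions import InvalidURLException
from crawler.urls import get_protocol_and_domain_from_url

print(get_protocol_and_domain_from_url("https://example.com/a/b"))
# ("https:", "example.com")   path split on the first "/"

print(get_protocol_and_domain_from_url("https://example.com?q=a/b"))
# ("https:", "example.com")   "?" comes before "/", so the query is stripped

print(get_protocol_and_domain_from_url("https://example.com"))
# ("https:", "example.com")   no path, the remainder is the domain

try:
    get_protocol_and_domain_from_url("example.com/a")  # no "//"
except InvalidURLException as exc:
    print(exc)  # '"example.com/a" is not a supported url.'
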
7 changes: 5 additions & 2 deletions src/main.py
@@ -1,6 +1,7 @@
import json
import os.path
import random
import shutil

from crawler import Crawler

@@ -10,7 +11,7 @@
if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    with open("seeds.txt", "r") as f:
    with open("./seeds.txt", "r") as f:
        seeds = [i.strip() for i in f.readlines()]

    seed_url = random.choice(seeds)
@@ -29,5 +30,7 @@

    to_crawl = crawler.url_manager.to_crawl

    with open("to_crawl.json", "w") as f:
    if os.path.isfile("./to_crawl.json"):
        shutil.move("./to_crawl.json", "./to_crawl.json.old")
    with open("./to_crawl.json", "w") as f:
        json.dump(to_crawl, f)
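
The shutil.move call above keeps one previous snapshot of the crawl queue as to_crawl.json.old before a fresh to_crawl.json is written. A hedged sketch of reading that snapshot back, assuming the structure URLManager maintains (a dict mapping each domain to its queued URLs):

# Illustrative only; assumes a to_crawl.json written by a previous run.
import json

with open("./to_crawl.json", "r") as f:
    to_crawl = json.load(f)

# Based on URLManager.to_crawl, this should look roughly like:
# {"example.com": ["https://example.com/", "https://example.com/about"], ...}
for domain, urls in to_crawl.items():
    print(f"{domain}: {len(urls)} queued URL(s)")
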
7 changes: 2 additions & 5 deletions src/seeds.txt
@@ -69,7 +69,6 @@ https://aka.ms
https://developer.microsoft.com
https://learn.microsoft.com
https://support.microsoft.com
https://www.onenote.com
https://azure.microsoft.com
https://techcommunity.microsoft.com
https://azuremarketplace.microsoft.com
@@ -114,7 +113,6 @@ https://jobs.opera.com
https://support.scribd.com
https://breezy-gallery.imgix.net
https://auth.opera.com
https://www.google.com
https://refit.sourceforge.net
https://github.blog
https://psychcentral.com
@@ -126,7 +124,6 @@ https://phandroid.com
https://www.ip-adress.com
https://www.textbroker.de
https://scholar.google.com
https://olduli.nli.org.il
https://codeql.com
https://itc.ua
https://www.ynet.co.il
@@ -157,12 +154,12 @@ https://partners.elastic.co
https://portal.acm.org
https://www.aljazeera.com
https://www.ebizmba.com
https://login.elastic.co
https://cloud.elastic.co
https://techcrunch.com
https://bambots.brucemyers.com
https://www.brainasoft.com
https://bible.oremus.org
https://loc.gov
https://www.txnd.uscourts.gov
https://www.yelp.com
https://www.yelp.com
https://fbi.gov
