Merge pull request #6 from quintindunn/crawler
Merge Crawler branch to main
quintindunn authored Jun 28, 2024
2 parents cace368 + 35b1319 commit 9b5869e
Showing 10 changed files with 164 additions and 27 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -160,5 +160,9 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/dbs/pages.db-journal
src/dbs/pages.db
to_crawl.json

to_crawl.json
to_crawl.json.old
launch.bat
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,3 +1,4 @@
requests
lxml
SQLAlchemy
SQLAlchemy
psutil
8 changes: 3 additions & 5 deletions src/crawler/crawler.py
@@ -19,7 +19,7 @@
from .page import Page # noqa
from .robots import does_page_follow_robots_rules # noqa
from .url_checker import check_url_compliance # noqa
from .urls import URLManager # noqa
from .urls import URLManager, get_protocol_and_domain_from_url # noqa

from database import db, page_checker # noqa (Ignore import error)

@@ -115,8 +115,7 @@ def get_page(self, url: str) -> Page | None:
"""
# Perform any checks.

protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]
protocol, domain = get_protocol_and_domain_from_url(url)

domain_model = self.get_domain(domain)

@@ -181,8 +180,7 @@ def step(self) -> Page | None:
        url = self.url_manager.get_next_url()

        # Check if domain is in domain table.
        protocol, _url = url.split("//", 1)
        domain = _url.split("/", 1)[0]
        protocol, domain = get_protocol_and_domain_from_url(url)

        domain_model = (
            self.db_session.query(db.DomainModel)
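
For context, both hunks above swap the inline url.split("//", 1) parsing for the get_protocol_and_domain_from_url helper introduced later in this diff (src/crawler/urls.py). A minimal sketch of why the shared helper matters, using a hypothetical URL whose query string contains a slash; the import path is assumed from the package layout in this PR:

# Sketch only; "https://example.com?q=a/b" is a made-up test URL.
url = "https://example.com?q=a/b"

# Old inline logic from crawler.py: the query string leaks into the "domain".
_protocol, _rest = url.split("//", 1)
old_domain = _rest.split("/", 1)[0]  # "example.com?q=a"

# New shared helper (defined in src/crawler/urls.py below):
# from crawler.urls import get_protocol_and_domain_from_url
# protocol, domain = get_protocol_and_domain_from_url(url)  # ("https:", "example.com")
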
7 changes: 5 additions & 2 deletions src/crawler/crawleroptions.py
@@ -27,11 +27,14 @@ def __init__(self):


class DefaultCrawlerOptions(BaseCrawlerOptions):
    def __init__(self):
    def __init__(
        self,
        ignored_file_extensions_path: str = "./configs/ignored_file_extensions.txt",
    ):
        super().__init__()

        self.ua: str = "OWS-CRAWLER/0.1-DEV (https://github.com/quintindunn/OWS)"

        with open("./configs/ignored_file_extensions.txt", "r") as f:
        with open(ignored_file_extensions_path, "r") as f:
            extensions = f.readlines()[1:]
            self.ignored_url_endings = set(extensions)
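
The constructor above now takes the ignored-extensions file as a parameter instead of hard-coding ./configs/ignored_file_extensions.txt. A hedged usage sketch, assuming the script runs from src/ so the package and default config path resolve; the fixture path is hypothetical:

# Assumes execution from src/; the fixture path below is made up for illustration.
from crawler.crawleroptions import DefaultCrawlerOptions

# Default behaviour is unchanged: reads ./configs/ignored_file_extensions.txt.
default_options = DefaultCrawlerOptions()

# Callers (e.g. tests) can now supply their own extension list.
test_options = DefaultCrawlerOptions(
    ignored_file_extensions_path="./tests/fixtures/ignored_file_extensions.txt"
)
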
18 changes: 18 additions & 0 deletions src/crawler/exceptions.py
@@ -16,3 +16,21 @@ def __init__(self, msg: str = ""):

    def __str__(self):
        return self.msg or "Cannot crawl page, try again later."


class InvalidURLException(Exception):
    def __init__(self, msg: str = ""):
        super().__init__()
        self.msg = msg

    def __str__(self):
        return self.msg or "Invalid url"


class CouldntFindNetworkInfoException(Exception):
    def __init__(self, msg: str = ""):
        super().__init__()
        self.msg = msg

    def __str__(self):
        return self.msg or "Couldn't retrieve private network information."
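
Both new exceptions follow the pattern of the existing one: an optional message with a default fallback in __str__. A small sketch of how InvalidURLException surfaces from the URL helper added below in src/crawler/urls.py; the import paths are assumed:

# Illustrative only; assumes the package is importable from src/.
from crawler.exceptions import InvalidURLException
from crawler.urls import get_protocol_and_domain_from_url

try:
    get_protocol_and_domain_from_url("not-a-url")  # no "//", so it is rejected
except InvalidURLException as exc:
    print(exc)  # '"not-a-url" is not a supported url.'
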
74 changes: 74 additions & 0 deletions src/crawler/networking.py
@@ -0,0 +1,74 @@
try:
    from .exceptions import CouldntFindNetworkInfoException
    from .urls import get_protocol_and_domain_from_url
except ImportError as e:
    from exceptions import CouldntFindNetworkInfoException
    from urls import get_protocol_and_domain_from_url

from ipaddress import IPv4Network, IPv4Address
import socket
import psutil


def _get_network_info() -> tuple[str, str]:
    """
    Gets the information on your network adapters
    :return:
    """
    net_if_addrs = psutil.net_if_addrs().items()

    if len(net_if_addrs) == 0:
        raise CouldntFindNetworkInfoException()

    for iface, addrs in psutil.net_if_addrs().items():
        for addr in addrs:
            if addr.family == socket.AF_INET:
                network_ip = IPv4Network(
                    f"{addr.address}/{addr.netmask}", strict=False
                ).network_address
                yield network_ip, addr.netmask


def _is_ip_in_range(ip: IPv4Address, network_ip: str, subnet_mask: str) -> bool:
    """
    Checks if ip address is in an ip range.
    :param ip: Ip to check
    :param network_ip: Ip of the network
    :param subnet_mask: subnet mask of the network
    :return: True if the ip is in the range.
    """
    network = IPv4Network(f"{network_ip}/{subnet_mask}", strict=False)
    ip_addr = IPv4Address(ip)
    return ip_addr in network


def _is_ip_private(ip: IPv4Address) -> bool:
    """
    Checks if an ip address is private
    :param ip: Ip address to check
    :return: True if ip is in the private range
    """
    for network_ip, subnet_mask in _get_network_info():
        if _is_ip_in_range(ip, network_ip, subnet_mask) or ip.is_private:
            return True
    return False


def _resolve_domain_to_ip(domain: str) -> IPv4Address:
    """
    Performs a DNS lookup to get the IP address of a domain.
    :param domain: The domain to lookup
    :return: IPv4Address object of the domain
    """
    ip_address = IPv4Address(socket.gethostbyname(domain))
    return ip_address


def is_host_private(host: str) -> bool:
    """
    Checks if a host resolves to a private ip address.
    :param host: Host to check
    :return: True if the host resolves to a private ip address.
    """
    ip = _resolve_domain_to_ip(host)
    return _is_ip_private(ip)
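
is_host_private is the module's public entry point; the underscore-prefixed functions are helpers. The check is used by url_checker.py (next file) to keep the crawler away from hosts that resolve into the local network. A hedged usage sketch, assuming DNS is available and the package import path from this PR; the hostnames are illustrative:

# Illustrative only; assumes execution from src/ and working DNS.
import socket

from crawler.networking import is_host_private

for host in ("localhost", "example.com"):
    try:
        print(host, "private" if is_host_private(host) else "public")
    except socket.gaierror:
        # Unresolvable hosts raise socket.gaierror; the URL checker below
        # treats that as non-compliant.
        print(host, "could not resolve")
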
20 changes: 17 additions & 3 deletions src/crawler/url_checker.py
@@ -1,7 +1,13 @@
import socket

try:
    from .crawleroptions import BaseCrawlerOptions
    from .urls import get_protocol_and_domain_from_url
    from .networking import is_host_private
except ImportError as e:
    from crawleroptions import BaseCrawlerOptions
    from crawleroptions import BaseCrawlerOptions, DefaultCrawlerOptions
    from urls import get_protocol_and_domain_from_url
    from networking import is_host_private


def check_url_compliance(crawler_options: BaseCrawlerOptions, url: str) -> bool:
@@ -11,17 +11,25 @@ def check_url_compliance(crawler_options: BaseCrawlerOptions, url: str) -> bool:
    :param url: URL to check.
    :return: True if the URL complies with rules, otherwise False.
    """

    # Check file ending
    segments = url.replace("//", "/").split("/")

    # Check to see if these checks even apply:
    if len(segments) == 2 or "." not in segments[-1]:
        return True
    if not (len(segments) == 2 or "." not in segments[-1]):
        return False

    ending_segment = segments[-1]
    path_ending = ending_segment.split(".")[-1]

    if path_ending in crawler_options.ignored_url_endings:
        return False

    try:
        _, domain = get_protocol_and_domain_from_url(url)
        if is_host_private(host=domain):
            return False
    except socket.gaierror:
        return False

    return True
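
With these additions, check_url_compliance rejects URLs whose final path segment looks like a file, URLs whose host resolves to a private address, and hosts that fail DNS resolution. A hedged usage sketch, assuming execution from src/ (so DefaultCrawlerOptions finds its config) and working DNS; the URLs are illustrative:

# Illustrative only; results depend on the ignored-extensions config and on DNS.
from crawler.crawleroptions import DefaultCrawlerOptions
from crawler.url_checker import check_url_compliance

options = DefaultCrawlerOptions()

urls = [
    "https://example.com/articles",  # plain path, passes the file-ending guard
    "https://example.com/logo.png",  # file-like ending, rejected by the guard
    "http://192.168.0.10/admin",     # resolves to a private address, rejected
]

for url in urls:
    print(url, check_url_compliance(options, url))
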
41 changes: 33 additions & 8 deletions src/crawler/urls.py
@@ -1,11 +1,33 @@
try:
    from .exceptions import NoUrlException, WaitBeforeRetryException
except ImportError as e:
    from exceptions import NoUrlException, WaitBeforeRetryException
    from .exceptions import (
        NoUrlException,
        WaitBeforeRetryException,
        InvalidURLException,
    )
except ImportError as _:
    from exceptions import NoUrlException, WaitBeforeRetryException, InvalidURLException

import random


def get_protocol_and_domain_from_url(url: str):
    logger_url_str = f"\"{url[:60]}{'...' if len(url) > 60 else ''}\""
    if "//" not in url:
        raise InvalidURLException(f"{logger_url_str} is not a supported url.")

    protocol, _url = url.split("//", 1)  # https:, example.com/test

    if "?" in _url and "/" in _url and _url.index("?") < _url.index("/"):
        domain = _url.split("?", 1)[0]
    elif "?" in url and "/" not in url:
        domain = _url.split("?", 1)[0]
    elif "/" in _url:
        domain = _url.split("/", 1)[0]
    else:
        domain = _url
    return protocol, domain


class URLManager:
    def __init__(
        self,
@@ -24,8 +46,7 @@ def __init__(
        if to_crawl:
            self.to_crawl = to_crawl
        else:
            protocol, _url = seed_url.split("//", 1)
            domain = _url.split("/", 1)[0]
            _, domain = get_protocol_and_domain_from_url(seed_url)
            self.to_crawl[domain] = [seed_url]

    def get_next_url(self) -> str:
@@ -36,7 +57,6 @@ def get_next_url(self) -> str:
        # Check that we haven't crawled everything.
        if len(self.to_crawl) == 0:
            raise NoUrlException()

        domain_choice = random.choice(list(self.to_crawl.keys()))
        current_url = random.choice(self.to_crawl[domain_choice])

@@ -49,12 +69,17 @@ def get_next_url(self) -> str:
            raise NoUrlException()

        self.enqueued.add(current_url)

        return current_url

    def add_to_to_crawl_queue(self, url: str, domain: str | None = None):
        if not url.lower().startswith("http://") and not url.lower().startswith(
            "https://"
        ):
            return

        if domain is None:
            _url = url.split("//", 1)[1]
            domain = _url.split("/", 1)[0]
            _, domain = get_protocol_and_domain_from_url(url)

        if domain in self.to_crawl.keys():
            self.to_crawl[domain].append(url)
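
get_protocol_and_domain_from_url is now the single place where protocol/domain parsing happens, and add_to_to_crawl_queue silently drops anything that is not http(s). A small sketch of the helper's behaviour on a few made-up URL shapes, following the branches shown above; the import paths are assumed:

# Illustrative only; expected outputs follow the branches of the helper above.
from crawler.exceptions import InvalidURLException
from crawler.urls import get_protocol_and_domain_from_url

print(get_protocol_and_domain_from_url("https://example.com/a/b"))
# ("https:", "example.com")   path split on the first "/"

print(get_protocol_and_domain_from_url("https://example.com?q=a/b"))
# ("https:", "example.com")   "?" comes before "/", so the query is stripped

print(get_protocol_and_domain_from_url("https://example.com"))
# ("https:", "example.com")   no path, the remainder is the domain

try:
    get_protocol_and_domain_from_url("example.com/a")  # no "//"
except InvalidURLException as exc:
    print(exc)  # '"example.com/a" is not a supported url.'
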
7 changes: 5 additions & 2 deletions src/main.py
@@ -1,6 +1,7 @@
import json
import os.path
import random
import shutil

from crawler import Crawler

@@ -10,7 +11,7 @@
if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    with open("seeds.txt", "r") as f:
    with open("./seeds.txt", "r") as f:
        seeds = [i.strip() for i in f.readlines()]

    seed_url = random.choice(seeds)
@@ -29,5 +30,7 @@

    to_crawl = crawler.url_manager.to_crawl

    with open("to_crawl.json", "w") as f:
    if os.path.isfile("./to_crawl.json"):
        shutil.move("./to_crawl.json", "./to_crawl.json.old")
    with open("./to_crawl.json", "w") as f:
        json.dump(to_crawl, f)
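
The shutil.move call above keeps one previous snapshot of the crawl queue as to_crawl.json.old before a fresh to_crawl.json is written. A hedged sketch of reading that snapshot back, assuming the structure URLManager maintains (a dict mapping each domain to its queued URLs):

# Illustrative only; assumes a to_crawl.json written by a previous run.
import json

with open("./to_crawl.json", "r") as f:
    to_crawl = json.load(f)

# Based on URLManager.to_crawl, this should look roughly like:
# {"example.com": ["https://example.com/", "https://example.com/about"], ...}
for domain, urls in to_crawl.items():
    print(f"{domain}: {len(urls)} queued URL(s)")
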
7 changes: 2 additions & 5 deletions src/seeds.txt
@@ -69,7 +69,6 @@ https://aka.ms
https://developer.microsoft.com
https://learn.microsoft.com
https://support.microsoft.com
https://www.onenote.com
https://azure.microsoft.com
https://techcommunity.microsoft.com
https://azuremarketplace.microsoft.com
@@ -114,7 +113,6 @@ https://jobs.opera.com
https://support.scribd.com
https://breezy-gallery.imgix.net
https://auth.opera.com
https://www.google.com
https://refit.sourceforge.net
https://github.blog
https://psychcentral.com
@@ -126,7 +124,6 @@ https://phandroid.com
https://www.ip-adress.com
https://www.textbroker.de
https://scholar.google.com
https://olduli.nli.org.il
https://codeql.com
https://itc.ua
https://www.ynet.co.il
@@ -157,12 +154,12 @@ https://partners.elastic.co
https://portal.acm.org
https://www.aljazeera.com
https://www.ebizmba.com
https://login.elastic.co
https://cloud.elastic.co
https://techcrunch.com
https://bambots.brucemyers.com
https://www.brainasoft.com
https://bible.oremus.org
https://loc.gov
https://www.txnd.uscourts.gov
https://www.yelp.com
https://www.yelp.com
https://fbi.gov
