Merge pull request #4 from quintindunn/crawler
Crawler
quintindunn authored Jun 27, 2024
2 parents 05931e4 + 3b1cc18 commit cace368
Showing 11 changed files with 191 additions and 106 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -160,5 +160,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/database/dbs/*
src/dbs/pages.db
to_crawl.json
2 changes: 1 addition & 1 deletion src/crawler/__init__.py
@@ -1 +1 @@
from .crawler import Crawler
from .crawler import Crawler
157 changes: 78 additions & 79 deletions src/crawler/crawler.py
@@ -1,5 +1,4 @@
import datetime
import random

from sqlalchemy import func

@@ -13,40 +12,34 @@

sys.path.insert(0, "..")

from .crawlerstats import CrawlerStats # noqa
from .exceptions import NoUrlException, WaitBeforeRetryException # noqa
from .requester import Requester # noqa
from .crawleroptions import BaseCrawlerOptions, DefaultCrawlerOptions # noqa
from .page import Page # noqa
from .robots import does_page_follow_robots_rules # noqa
from .url_checker import check_url_compliance # noqa
from .urls import URLManager # noqa

from database import db, page_checker # noqa (Ignore import error)

if typing.TYPE_CHECKING:
# Allow IDE to find correct import.
from ..database import db, page_checker

try:
from .crawlerstats import CrawlerStats
from .exceptions import NoUrlException, WaitBeforeRetryException
from .requester import Requester
from .crawleroptions import BaseCrawlerOptions, DefaultCrawlerOptions
from .page import Page
from .robots import does_page_follow_robots_rules
from .url_checker import check_url_compliance
except ImportError as e:
from crawlerstats import CrawlerStats
from exceptions import NoUrlException, WaitBeforeRetryException
from requester import Requester
from crawleroptions import BaseCrawlerOptions, DefaultCrawlerOptions
from page import Page
from robots import does_page_follow_robots_rules
from url_checker import check_url_compliance
finally:
from database import db, page_checker

DB_MAX_CONTENT_CHARS = 15000000

logger = logging.getLogger("Crawler")


class Crawler:
def __init__(self, seed_url: str,
crawled: set[str] | None = None,
to_crawl: set[str] | None = None,
crawler_options: BaseCrawlerOptions | None = None
):
def __init__(
self,
seed_url: str,
crawled: set[str] | None = None,
to_crawl: dict[str, list[str]] | None = None,
crawler_options: BaseCrawlerOptions | None = None,
):
"""
:param seed_url: The URL to seed from.
:param crawled: A set of pages to ignore as they've been crawled already.
@@ -59,40 +52,25 @@ def __init__(self, seed_url: str,
self.stats = CrawlerStats()
self.requester = Requester(crawler_options=self.options)

self.seed_url: str | None = seed_url or None
self.enqueued: set[str] = crawled or set() # For all URLs that have been crawled or are already queued to crawl
self.to_crawl: list[str] = list(to_crawl or [seed_url])
self.url_manager = URLManager(
seed_url=seed_url, crawled=crawled, to_crawl=to_crawl
)

self.current_url: str | None = None

self.db_session = db.Session()

self.url_compliance_checker = functools.partial(check_url_compliance, self.options)
self.page_follows_db_rules = functools.partial(page_checker.page_follows_db_rules, self.options)

def _get_next_url(self) -> str:
"""
Gets the next URL to crawl and updates Crawler.enqueued.
:return: Next URL to crawl.
"""
# Check that we haven't crawled everything.
logger.debug(f"[URLs] {len(self.to_crawl)} URLs left to crawl.")
if len(self.to_crawl) == 0:
raise NoUrlException()

random_idx = random.randint(0, len(self.to_crawl)-1)
current_url = self.to_crawl.pop(random_idx)

if current_url is None:
raise NoUrlException()

self.enqueued.add(current_url)
return current_url
self.url_compliance_checker = functools.partial(
check_url_compliance, self.options
)
self.page_follows_db_rules = functools.partial(
page_checker.page_follows_db_rules, self.options
)

def get_domain_robots(self, domain: str, protocol: str) -> str:
protocol = protocol + (":" if not protocol[-1] == ":" else "")
robots_txt_url = f"{protocol}//{domain}/robots.txt"
logger.info(f"[Robots] Getting \"{domain}\"'s robots.txt.")
logger.info(f'[Robots] Getting "{domain}"\'s robots.txt.')
robots_txt_request = self.requester.get(robots_txt_url, is_robots=True)

if robots_txt_request.status_code == 404:
@@ -102,22 +80,25 @@ def get_domain_robots(self, domain: str, protocol: str) -> str:

@functools.lru_cache(maxsize=1024)
def get_domain(self, domain: str) -> db.DomainModel:
domain_model = self.db_session.query(db.DomainModel).filter(
func.lower(db.DomainModel.domain) == domain
).first()
domain_model = (
self.db_session.query(db.DomainModel)
.filter(func.lower(db.DomainModel.domain) == domain)
.first()
)
return domain_model

@functools.lru_cache(maxsize=1024)
def get_robots_txt(self, domain):
domain_model = self.db_session.query(db.DomainModel).filter(
func.lower(db.DomainModel.domain) == domain
).first()
domain_model = (
self.db_session.query(db.DomainModel)
.filter(func.lower(db.DomainModel.domain) == domain)
.first()
)

if not domain_model:
try:
domain_model = db.DomainModel(
domain=domain,
robots=self.get_domain_robots(domain, "http")
domain=domain, robots=self.get_domain_robots(domain, "http")
)
self.db_session.add(domain_model)
self.db_session.commit()
@@ -143,18 +124,24 @@ def get_page(self, url: str) -> Page | None:

try:
if self.options.follow_robots_txt and not does_page_follow_robots_rules(
self.options, url, self.get_robots_txt(domain), domain=domain_model):
logger.info(f"[Robots.txt] Page @ {logger_url_str} conflicts with robots.txt")
self.options, url, self.get_robots_txt(domain), domain=domain_model
):
logger.info(
f"[Robots.txt] Page @ {logger_url_str} conflicts with robots.txt"
)
return None
except WaitBeforeRetryException:
self.to_crawl.append(url)
self.enqueued.remove(url)
logger.info(f"[Robots.txt] Cannot crawl {logger_url_str} as it was crawled too recently.")
self.url_manager.add_to_to_crawl_queue(url, domain)
logger.info(
f"[Robots.txt] Cannot crawl {logger_url_str} as it was crawled too recently."
)

# Get the page.
request = self.requester.get(url=url, stream=True, timeout=self.options.page_timeout)
request = self.requester.get(
url=url, stream=True, timeout=self.options.page_timeout
)

content = b''
content = b""

max_bytes = self.options.max_page_size
chunk_size = self.options.content_buffer_size
Expand All @@ -166,7 +153,7 @@ def get_page(self, url: str) -> Page | None:
if total_bytes >= max_bytes:
break

if content == b'':
if content == b"":
return None

# Do some basic parsing.
Expand All @@ -175,7 +162,7 @@ def get_page(self, url: str) -> Page | None:
elapsed=request.elapsed,
content=content,
response_headers=request.headers,
url=url
url=url,
)

if domain_model:
@@ -191,34 +178,39 @@ def step(self) -> Page | None:
try:
start_time = time.time_ns()

url = self._get_next_url()
url = self.url_manager.get_next_url()

# Check if domain is in domain table.
protocol, _url = url.split("//", 1)
domain = _url.split("/", 1)[0]

domain_model = self.db_session.query(db.DomainModel).filter(
func.lower(db.DomainModel.domain) == domain
).all()
domain_model = (
self.db_session.query(db.DomainModel)
.filter(func.lower(db.DomainModel.domain) == domain)
.all()
)

if not domain_model:
try:
domain_model = db.DomainModel(
domain=domain,
robots=self.get_domain_robots(domain, protocol)
domain=domain, robots=self.get_domain_robots(domain, protocol)
)
self.db_session.add(domain_model)
self.db_session.commit()
except requests.exceptions.ConnectionError:
pass

# Get the page, and update the crawling queue to hold the new links.
logger.info(f"[Crawling] Crawling page \"{url[:60]}{'...' if len(url) > 60 else ''}\"")
logger.info(
f"[Crawling] Crawling page \"{url[:60]}{'...' if len(url) > 60 else ''}\""
)
try:
page = self.get_page(url)

except requests.exceptions.ConnectionError as e:
logger.info(f"[Request Error] on page \"{url[:60]}{'...' if len(url) > 60 else ''}\" {e}")
logger.info(
f"[Request Error] on page \"{url[:60]}{'...' if len(url) > 60 else ''}\" {e}"
)
self.stats.pages_crawled += 1
self.stats.pages_failed += 1
return None
@@ -237,9 +229,12 @@ if self.url_compliance_checker(url):
if self.url_compliance_checker(url):
passed_urls.add(url)

self.to_crawl.extend(passed_urls - self.enqueued)
self.url_manager.add_many_to_to_crawl_queue(passed_urls)

else:
logger.info(f"[Response] HTTP {page.status_code} @ \"{url[:60]}{'...' if len(url) > 60 else ''}\"")
logger.info(
f"[Response] HTTP {page.status_code} @ \"{url[:60]}{'...' if len(url) > 60 else ''}\""
)

# Update statistics.
total_time = time.time_ns() - start_time
@@ -255,14 +250,18 @@ def step(self) -> Page | None:
url=page.url,
domain=page.domain,
title=page.html_title,
content=page.content.decode().encode("UTF-8")[:DB_MAX_CONTENT_CHARS]
content=page.content.decode().encode("UTF-8")[
:DB_MAX_CONTENT_CHARS
],
)
self.db_session.add(page_model)

# Save to db.
self.db_session.commit()
else:
logger.info(f"[DB] \"{url[:60]}{'...' if len(url) > 60 else ''}\" doesn't follow database rules.")
logger.info(
f"[DB] \"{url[:60]}{'...' if len(url) > 60 else ''}\" doesn't follow database rules."
)
return page
except NoUrlException as e:
raise e
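
The new URLManager comes from src/crawler/urls.py, one of the changed files whose diff is not shown above. It replaces the crawler's old enqueued set, to_crawl list, and _get_next_url() method. Based only on the calls visible in this diff (the constructor arguments, get_next_url(), add_to_to_crawl_queue(url, domain), and add_many_to_to_crawl_queue(urls)), a minimal sketch of the interface could look like the following; the per-domain queue layout and the random selection are assumptions, not the committed implementation.

import random

from .exceptions import NoUrlException


class URLManager:
    """Hypothetical sketch of the URL queue used by Crawler; the real urls.py may differ."""

    def __init__(
        self,
        seed_url: str,
        crawled: set[str] | None = None,
        to_crawl: dict[str, list[str]] | None = None,
    ):
        self.enqueued: set[str] = crawled or set()
        # Assumed layout matching the new type hint in Crawler.__init__: URLs queued per domain.
        self.to_crawl: dict[str, list[str]] = to_crawl or {}
        if not self.to_crawl and seed_url:
            self.add_to_to_crawl_queue(seed_url, self._domain_of(seed_url))

    @staticmethod
    def _domain_of(url: str) -> str:
        return url.split("//", 1)[-1].split("/", 1)[0]

    def add_to_to_crawl_queue(self, url: str, domain: str) -> None:
        # Unconditional re-queue, so the retry path in Crawler.get_page() still works.
        self.enqueued.add(url)
        self.to_crawl.setdefault(domain, []).append(url)

    def add_many_to_to_crawl_queue(self, urls: set[str]) -> None:
        for url in urls - self.enqueued:
            self.add_to_to_crawl_queue(url, self._domain_of(url))

    def get_next_url(self) -> str:
        domains = [d for d, queued in self.to_crawl.items() if queued]
        if not domains:
            raise NoUrlException()
        # The old _get_next_url() popped a random queued URL; choosing a random domain first is an assumption.
        queue = self.to_crawl[random.choice(domains)]
        return queue.pop(random.randint(0, len(queue) - 1))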
2 changes: 1 addition & 1 deletion src/crawler/crawleroptions.py
@@ -32,6 +32,6 @@ def __init__(self):

self.ua: str = "OWS-CRAWLER/0.1-DEV (https://github.com/quintindunn/OWS)"

with open("./configs/ignored_file_extensions.txt", 'r') as f:
with open("./configs/ignored_file_extensions.txt", "r") as f:
extensions = f.readlines()[1:]
self.ignored_url_endings = set(extensions)
10 changes: 8 additions & 2 deletions src/crawler/page.py
@@ -7,8 +7,14 @@


class Page:
def __init__(self, status_code: int, elapsed: timedelta, content: bytes, url: str,
response_headers: CaseInsensitiveDict[str]):
def __init__(
self,
status_code: int,
elapsed: timedelta,
content: bytes,
url: str,
response_headers: CaseInsensitiveDict[str],
):
self.status_code: int = status_code
self.elapsed = elapsed
self.url = url
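
Only the reformatted constructor of Page is shown above, but crawler.py relies on derived attributes such as page.domain and page.html_title, plus the set of discovered links it later filters with url_compliance_checker. As illustration only (the committed parser in src/crawler/page.py may work differently), those values could be computed from the fetched bytes with the standard library alone:

from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse


class _LinkAndTitleParser(HTMLParser):
    """Collects the <title> text and all <a href> values from an HTML document."""

    def __init__(self):
        super().__init__()
        self.title = ""
        self.hrefs: list[str] = []
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True
        elif tag == "a":
            href = dict(attrs).get("href")
            if href:
                self.hrefs.append(href)

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False

    def handle_data(self, data):
        if self._in_title:
            self.title += data


def parse_page(url: str, content: bytes) -> tuple[str, str, set[str]]:
    """Return (domain, html title, absolute link URLs) for a fetched page."""
    parser = _LinkAndTitleParser()
    parser.feed(content.decode(errors="ignore"))
    domain = urlparse(url).netloc
    urls = {urljoin(url, href) for href in parser.hrefs}
    return domain, parser.title.strip(), urls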
22 changes: 16 additions & 6 deletions src/crawler/requester.py
@@ -9,12 +9,16 @@
class Requester:
def __init__(self, crawler_options: BaseCrawlerOptions):
self.options: BaseCrawlerOptions = crawler_options
self.base_headers = {
"User-Agent": crawler_options.ua
}
self.base_headers = {"User-Agent": crawler_options.ua}

def get(self, url: str, *args, headers: dict | None = None, is_robots: bool = False,
**kwargs) -> requests.Response:
def get(
self,
url: str,
*args,
headers: dict | None = None,
is_robots: bool = False,
**kwargs
) -> requests.Response:
"""
Makes a GET request to the given URL but passes the base headers into the request.
:param url: URL to make the request to.
@@ -29,7 +33,13 @@ def get(self, url: str, *args, headers: dict | None = None, is_robots: bool = Fa
headers = headers or dict()
local_headers.update(headers)
if is_robots:
request = requests.get(url, headers=headers, timeout=self.options.robots_timeout, *args, **kwargs)
request = requests.get(
url,
headers=headers,
timeout=self.options.robots_timeout,
*args,
**kwargs
)
else:
request = requests.get(url, headers=headers, *args, **kwargs)
return request
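
A hypothetical usage sketch, not part of the diff, showing how the reformatted Requester.get() is exercised elsewhere in this changeset: robots.txt fetches pass is_robots=True so options.robots_timeout applies, while page fetches stream the body with options.page_timeout, as Crawler.get_page() does. It assumes the flat fallback imports resolve (i.e. it runs from the crawler module's directory) and that the working directory contains the configs/ folder that DefaultCrawlerOptions reads on construction.

from crawleroptions import DefaultCrawlerOptions
from requester import Requester

options = DefaultCrawlerOptions()
requester = Requester(crawler_options=options)

# robots.txt fetch: is_robots=True applies the shorter robots timeout.
robots_response = requester.get("https://example.com/robots.txt", is_robots=True)

# Regular page fetch: streamed, with the page timeout, mirroring Crawler.get_page().
page_response = requester.get(
    "https://example.com/", stream=True, timeout=options.page_timeout
)
print(robots_response.status_code, page_response.status_code)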
13 changes: 9 additions & 4 deletions src/crawler/robots.py
@@ -18,8 +18,9 @@
from exceptions import WaitBeforeRetryException


def does_page_follow_robots_rules(crawler_options: BaseCrawlerOptions, url: str, robots: str,
domain: "db.DomainModel") -> bool:
def does_page_follow_robots_rules(
crawler_options: BaseCrawlerOptions, url: str, robots: str, domain: "db.DomainModel"
) -> bool:
parser = robotparser.RobotFileParser()
parser.parse(robots.splitlines())

@@ -31,10 +32,14 @@ def does_page_follow_robots_rules(crawler_options: BaseCrawlerOptions, url: str,
request_delay = parser.request_rate(crawler_options.ua)

now = datetime.datetime.now()
if crawl_delay and (now - domain.last_crawled).total_seconds() < int(crawl_delay):
if crawl_delay and (now - domain.last_crawled).total_seconds() < int(
crawl_delay
):
raise WaitBeforeRetryException()

if request_delay and (now - domain.last_crawled).total_seconds() < int(request_delay.seconds):
if request_delay and (now - domain.last_crawled).total_seconds() < int(
request_delay.seconds
):
raise WaitBeforeRetryException()
except ValueError:
pass
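
For context on the checks reformatted above: urllib.robotparser exposes crawl_delay() and request_rate() per user agent, and request_rate() returns a named tuple whose .seconds field is what the code compares against the time since the domain was last crawled. A small self-contained example (the robots.txt content here is made up):

from urllib import robotparser

robots_txt = """\
User-agent: *
Crawl-delay: 10
Request-rate: 1/5
Disallow: /private/
"""

parser = robotparser.RobotFileParser()
parser.parse(robots_txt.splitlines())

ua = "OWS-CRAWLER/0.1-DEV (https://github.com/quintindunn/OWS)"
print(parser.can_fetch(ua, "https://example.com/private/page"))  # False
print(parser.crawl_delay(ua))   # 10
rate = parser.request_rate(ua)  # RequestRate(requests=1, seconds=5)
print(rate.requests, rate.seconds)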
(Diffs for the remaining changed files are not shown here.)
