Skip to content

Commit

Permalink
Improved protocol/domain splitting, added tests
Browse files Browse the repository at this point in the history
  • Loading branch information
quintindunn committed Jun 29, 2024
1 parent 7b7635b commit 15d49d8
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 9 deletions.
1 change: 0 additions & 1 deletion src/crawler/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from .crawler import Crawler
4 changes: 4 additions & 0 deletions src/crawler/networking.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,9 @@ def is_host_private(host: str) -> bool:
:param host: Host to check
:return: True if the host resolves to a private ip address.
"""

if "[" in host or "]" in host:
return True

ip = _resolve_domain_to_ip(host)
return _is_ip_private(ip)
10 changes: 2 additions & 8 deletions src/crawler/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from exceptions import NoUrlException, WaitBeforeRetryException, InvalidURLException

import random
import re


def get_protocol_and_domain_from_url(url: str):
Expand All @@ -17,14 +18,7 @@ def get_protocol_and_domain_from_url(url: str):

protocol, _url = url.split("//", 1) # https:, example.com/test

if "?" in _url and "/" in _url and _url.index("?") < _url.index("/"):
domain = _url.split("?", 1)[0]
elif "?" in url and "/" not in url:
domain = _url.split("?", 1)[0]
elif "/" in _url:
domain = _url.split("/", 1)[0]
else:
domain = _url
domain = re.split(r'[?/#]', _url, maxsplit=1)[0]
return protocol, domain


Expand Down
Empty file added src/tests/__init__.py
Empty file.
35 changes: 35 additions & 0 deletions src/tests/url_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from unittest import TestCase

import sys

sys.path.append("..")

from crawler.urls import get_protocol_and_domain_from_url


class TestURLs(TestCase):
    """Unit tests for ``get_protocol_and_domain_from_url``."""

    def test_protocol_domain(self):
        """Check the ``[protocol, domain]`` pair extracted from assorted URLs.

        The table maps each input URL to the expected ``[protocol, domain]``
        list. It covers paths, query strings, fragments, ports, repeated
        slashes, an IPv4 host and a bracketed IPv6 host.
        """
        urls = {
            "http://example.com/path?query=123#section": ["http:", "example.com"],
            "https://example.com?query": ["https:", "example.com"],
            "http://example.com/path": ["http:", "example.com"],
            "http://example.com": ["http:", "example.com"],
            "https://example.com/path?query#section": ["https:", "example.com"],
            "http://example.com/path/another#section": ["http:", "example.com"],
            "http://example.com/path/another?query": ["http:", "example.com"],
            "http://example.com?query#section": ["http:", "example.com"],
            "http://subdomain.example.com/path": ["http:", "subdomain.example.com"],
            "https://example.com:8080/path": ["https:", "example.com:8080"],
            "http://example.com//////path": ["http:", "example.com"],
            "http://example.com/path?query=1/2/3": ["http:", "example.com"],
            "http://example.com/path#": ["http:", "example.com"],
            "http://example.com?query?more": ["http:", "example.com"],
            "http://example.com/path;param?query#fragment": ["http:", "example.com"],
            "https://example.com#fragment": ["https:", "example.com"],
            "http://192.168.0.1/path": ["http:", "192.168.0.1"],
            "http://[::1]/path": ["http:", "[::1]"],
        }

        for url, expected in urls.items():
            # subTest reports each failing URL separately (labelled with the
            # URL) instead of aborting the whole table on the first mismatch.
            with self.subTest(url=url):
                protocol, domain = get_protocol_and_domain_from_url(url)
                self.assertEqual([protocol, domain], expected)

0 comments on commit 15d49d8

Please sign in to comment.