Skip to content

Commit

Permalink
Added a check for whether an IP address is a private IP address
Browse files Browse the repository at this point in the history
  • Loading branch information
quintindunn committed Jun 28, 2024
1 parent 9d955d0 commit 27c7379
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 6 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,9 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

src/dbs/pages.db-journal
src/dbs/pages.db

to_crawl.json
to_crawl.json.old
launch.bat
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
requests
lxml
SQLAlchemy
SQLAlchemy
psutil
4 changes: 2 additions & 2 deletions src/crawler/crawleroptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ def __init__(self):


class DefaultCrawlerOptions(BaseCrawlerOptions):
def __init__(self):
def __init__(self, ignored_file_extensions_path: str = "./configs/ignored_file_extensions.txt"):
super().__init__()

self.ua: str = "OWS-CRAWLER/0.1-DEV (https://github.com/quintindunn/OWS)"

with open("./configs/ignored_file_extensions.txt", "r") as f:
with open(ignored_file_extensions_path, "r") as f:
extensions = f.readlines()[1:]
self.ignored_url_endings = set(extensions)
9 changes: 9 additions & 0 deletions src/crawler/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,12 @@ def __init__(self, msg: str = ""):

def __str__(self):
return self.msg or "Invalid url"


class CouldntFindNetworkInfoException(Exception):
    """Raised when the machine's private network information cannot be determined."""

    def __init__(self, msg: str = ""):
        super().__init__()
        # Optional custom message; an empty string falls back to the default in __str__.
        self.msg = msg

    def __str__(self):
        if self.msg:
            return self.msg
        return "Couldn't retrieve private network information."
72 changes: 72 additions & 0 deletions src/crawler/networking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from collections.abc import Iterator
from ipaddress import IPv4Network, IPv4Address
import socket

import psutil

try:
    from .exceptions import CouldntFindNetworkInfoException
    from .urls import get_protocol_and_domain_from_url
except ImportError as e:
    from exceptions import CouldntFindNetworkInfoException
    from urls import get_protocol_and_domain_from_url


def _get_network_info() -> Iterator[tuple[IPv4Address, str]]:
    """
    Yields network information for every IPv4 address on this machine's adapters.

    Note: this is a generator, so the exception below is raised on first
    iteration, not at call time.

    :raises CouldntFindNetworkInfoException: if psutil reports no network interfaces.
    :return: yields (network address, netmask) pairs, one per IPv4 interface address.
    """
    # Snapshot the interface table once instead of querying psutil twice.
    net_if_addrs = psutil.net_if_addrs()

    if len(net_if_addrs) == 0:
        raise CouldntFindNetworkInfoException()

    for iface, addrs in net_if_addrs.items():
        for addr in addrs:
            # Only IPv4 entries are relevant; AF_INET6 / link-layer entries are skipped.
            if addr.family == socket.AF_INET:
                network_ip = IPv4Network(f"{addr.address}/{addr.netmask}", strict=False).network_address
                yield network_ip, addr.netmask


def _is_ip_in_range(ip: IPv4Address, network_ip: str, subnet_mask: str) -> bool:
"""
Checks if ip address is in an ip range.
:param ip: Ip to check
:param network_ip: Ip of the network
:param subnet_mask: subnet mask of the network
:return: True if the ip is in the range.
"""
network = IPv4Network(f"{network_ip}/{subnet_mask}", strict=False)
ip_addr = IPv4Address(ip)
return ip_addr in network


def _is_ip_private(ip: IPv4Address) -> bool:
"""
Checks if an ip address is private
:param ip: Ip address to check
:return: True if ip is in the private range
"""
for network_ip, subnet_mask in _get_network_info():
if _is_ip_in_range(ip, network_ip, subnet_mask) or ip.is_private:
return True
return False


def _resolve_domain_to_ip(domain: str) -> IPv4Address:
"""
Performs a DNS lookup to get the IP address of a domain.
:param domain: The domain to lookup
:return: IPv4Address object of the domain
"""
ip_address = IPv4Address(socket.gethostbyname(domain))
return ip_address


def is_host_private(host: str) -> bool:
    """
    Checks if a host resolves to a private IP address.

    :param host: Host to check.
    :return: True if the host's resolved address is private.
    """
    return _is_ip_private(_resolve_domain_to_ip(host))
20 changes: 17 additions & 3 deletions src/crawler/url_checker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
import socket

try:
from .crawleroptions import BaseCrawlerOptions
from .urls import get_protocol_and_domain_from_url
from .networking import is_host_private
except ImportError as e:
from crawleroptions import BaseCrawlerOptions
from crawleroptions import BaseCrawlerOptions, DefaultCrawlerOptions
from urls import get_protocol_and_domain_from_url
from networking import is_host_private


def check_url_compliance(crawler_options: BaseCrawlerOptions, url: str) -> bool:
Expand All @@ -11,17 +17,25 @@ def check_url_compliance(crawler_options: BaseCrawlerOptions, url: str) -> bool:
:param url: URL to check.
:return: True if the URL complies with rules, otherwise False.
"""

# Check file ending
segments = url.replace("//", "/").split("/")

# Check to see if these checks even apply:
if len(segments) == 2 or "." not in segments[-1]:
return True
if not (len(segments) == 2 or "." not in segments[-1]):
return False

ending_segment = segments[-1]
path_ending = ending_segment.split(".")[-1]

if path_ending in crawler_options.ignored_url_endings:
return False

try:
_, domain = get_protocol_and_domain_from_url(url)
if is_host_private(host=domain):
return False
except socket.gaierror:
return False

return True

0 comments on commit 27c7379

Please sign in to comment.