Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to configure input and output directories #71

Draft
wants to merge 4 commits into
base: develop
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -91,6 +91,9 @@ ENV CISA_GROUP=${CISA_USER}
ENV CISA_HOME="/home/${CISA_USER}"
ENV VIRTUAL_ENV="${CISA_HOME}/.venv"

# Host mount directory
ARG HOST_MOUNT="${CISA_HOME}/host_mount"

RUN apk --no-cache add \
ca-certificates=20241121-r1 \
chromium=132.0.6834.83-r0 \
@@ -100,6 +103,9 @@ RUN apk --no-cache add \
RUN addgroup --system --gid ${CISA_GID} ${CISA_GROUP} \
&& adduser --system --uid ${CISA_UID} --ingroup ${CISA_GROUP} ${CISA_USER}

# Create the HOST_MOUNT directory (and any intermediate directories)
RUN mkdir --parents ${HOST_MOUNT}

###
# Copy in the Python virtual environment created in compile-stage, symlink the
# Python binary in the venv to the system-wide Python, and add the venv to the PATH.
@@ -113,15 +119,13 @@ COPY --from=compile-stage --chown=${CISA_USER}:${CISA_GROUP} ${VIRTUAL_ENV} ${VI
RUN ln -fs "$(command -v python3)" "${VIRTUAL_ENV}"/bin/python3
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"

WORKDIR ${CISA_HOME}
RUN mkdir host_mount

# Copy in the necessary files
COPY --chown=${CISA_USER}:${CISA_GROUP} src/version.txt src/vdp_scanner.py ./
COPY --chown=${CISA_USER}:${CISA_GROUP} src/version.txt src/vdp_scanner.py ${CISA_HOME}/

###
# Prepare to run
###
WORKDIR ${CISA_HOME}
USER ${CISA_USER}:${CISA_GROUP}
ENTRYPOINT ["python3", "vdp_scanner.py"]
CMD ["github"]
19 changes: 15 additions & 4 deletions src/vdp_scanner.py
Original file line number Diff line number Diff line change
@@ -16,6 +16,8 @@
-h, --help Show this help message.
-v, --version Show script version.
-d, --debug Enable debugging output.
-i, --input-dir=INPUT_DIR Input directory path. [default: host_mount]
-o, --output-dir=OUTPUT_DIR Output directory path. [default: host_mount]
-a, --agency-csv=AGENCY_CSV Filename to use for agency results.
-t, --domain-csv=DOMAIN_CSV Filename to use for domain (TLD) results.
-p, --path-to-chromium=PATH Specify the Chromium binary to use.
@@ -27,6 +29,7 @@
import csv
from datetime import datetime
import logging
from os.path import exists as path_exists
from os.path import join as path_join
from typing import Any, Dict, List, NamedTuple, Optional, Tuple
from urllib.parse import urlparse, urlunparse
@@ -89,13 +92,13 @@ class VdpScanner:
"VDP Hash",
]

def __init__(self, hasher: UrlHasher):
def __init__(self, hasher: UrlHasher, output_directory: str):
"""Initialize variables and perform setup."""
self._hasher = hasher
file_date = datetime.utcnow().strftime("%Y-%m-%d")
self.agency_csv = f"agency_results_{file_date}.csv"
self.domain_csv = f"domain_results_{file_date}.csv"
self.output_directory = "host_mount"
self.output_directory = output_directory

self.agency_results: defaultdict = defaultdict(
lambda: {k: 0 for k in self.agency_csv_header[1:]}
@@ -272,6 +275,14 @@ def main():
format="%(asctime)-15s %(levelname)s %(message)s", level=log_level
)

# Before continuing, make sure that our input and output directories exist.
if not path_exists(args["--input-dir"]):
logging.error("Input directory '%s' does not exist.", args["--input-dir"])
return 1
if not path_exists(args["--output-dir"]):
logging.error("Output directory '%s' does not exist.", args["--output-dir"])
return 1

# If we make a call to UrlHasher.hash_url() with verify=False, it will output
# a warning. Since this is a fallback mechanism, we can squelch these warnings.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
@@ -287,7 +298,7 @@ def main():
}
http_hasher = UrlHasher("sha256", browser_options=browser_opts)

scanner: VdpScanner = VdpScanner(http_hasher)
scanner: VdpScanner = VdpScanner(http_hasher, args["--output-dir"])
if args["--agency-csv"]:
scanner.agency_csv = args["--agency-csv"]
if args["--domain-csv"]:
@@ -296,7 +307,7 @@ def main():
domains_to_scan: List[Dict[str, str]]

if args["local"]:
domains_to_scan = get_local_csv(path_join("host_mount", args["FILE"]))
domains_to_scan = get_local_csv(path_join(args["--input-dir"], args["FILE"]))

if args["github"]:
domains_to_scan = get_remote_csv()