# darwinpyspark.py

import json
import urllib.parse
import urllib.request
import zipfile
from io import BytesIO

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json


class DarwinPyspark:
    """
    Client for moving data between pyspark dataframes and a V7 Darwin dataset.

    Public entry points are `upload_items` (register, sign, upload and confirm
    every row of a dataframe) and `download_export` (fetch a named export and
    load its JSON files into a dataframe). Every failed HTTP call made by this
    class is reported as a RuntimeError.
    """

    # Root of the Darwin v2 REST API; every endpoint below is built from it
    BASE_URL = "https://darwin.v7labs.com/api/v2"

    # Characters left untouched when percent-encoding an item url: the RFC 3986
    # reserved set plus '%', so query strings (e.g. presigned url signatures)
    # and already-escaped sequences survive unchanged
    _URL_SAFE_CHARS = "!#$%&'()*+,/:;=?@[]~"

    def __init__(self, API_KEY, team_slug, dataset_slug):
        """
        Method to initialise Darwin Pyspark

        Parameters
        ----------
            API_KEY (str): Your Darwin API Key
            team_slug (str): The slug name of the team in Darwin you want to interact with
            dataset_slug (str): The slug name of the dataset in Darwin you want to interact with

        Returns
        -------
            None. Both slugs are stored lower-cased, stripped of surrounding
            whitespace and with inner spaces replaced by '-'
        """
        self.headers = {
            "accept": "application/json",
            "Authorization": f"ApiKey {API_KEY}",
        }
        self.team_slug = team_slug.lower().strip().replace(" ", "-")
        self.dataset_slug = dataset_slug.lower().strip().replace(" ", "-")

    def upload_items(self, df):
        """
        Method to upload a pyspark dataframes data to V7

        Parameters
        ----------
            df (pyspark dataframe): A dataframe, with columns 'object_url' (accessible open or presigned url for the image) and 'file_name' (the name you want the file to be listed as in V7)

        Returns
        -------
            None

        Raises
        ------
            RuntimeError if any step of an item upload fails
        """
        # Rows are addressed by column name so the call does not depend on
        # the order of the columns in the select
        df.select("file_name", "object_url").foreach(
            lambda row: self._upload_item(row["file_name"], row["object_url"])
        )

    def download_export(self, export_name):
        """
        Calls all download methods to get and write an export to a pyspark dataframe

        Parameters
        ----------
            export_name (str): Name of the export in V7 that is to be downloaded

        Returns
        -------
            export_df (pyspark dataframe): pyspark dataframe of the exported darwin json data

        Raises
        ------
            RuntimeError if the export cannot be listed, is not found, or
            contains no JSON files
        """
        export_url = self._get_export_url(export_name)
        # create a SparkSession object
        spark = SparkSession.builder.appName("darwinpyspark").getOrCreate()
        return self._extract_export(self._download_export_zip(export_url), spark)

    @staticmethod
    def _check_response(response, action):
        """
        Method to turn an unsuccessful HTTP response into a RuntimeError

        Parameters
        ----------
            response (requests response): The response to validate
            action (str): Short description of the attempted call, used in the error message

        Returns
        -------
            None

        Raises
        ------
            RuntimeError if `response.ok` is falsy
        """
        if not response.ok:
            raise RuntimeError(
                f"Failed to {action}: {response.status_code} - {response.content}"
            )

    @classmethod
    def _encode_url(cls, url):
        """
        Method to percent-encode the unsafe characters (spaces, non-ASCII) of a url

        Reserved url characters and '%' are preserved, so the query string of a
        presigned url is not corrupted and existing escapes are not encoded twice.

        Parameters
        ----------
            url (str): The url to encode

        Returns
        -------
            encoded_url (str): The url with unsafe characters percent-encoded
        """
        return urllib.parse.quote(url, safe=cls._URL_SAFE_CHARS)

    @staticmethod
    def _read_json_members(archive):
        """
        Method to read every '.json' member of a zip archive as text

        Parameters
        ----------
            archive (Zipfile): The opened zip archive to read from

        Returns
        -------
            json_files (list of str): UTF-8 decoded content of each '.json' member, in archive order
        """
        return [
            archive.read(member).decode("utf-8")
            for member in archive.namelist()
            if member.endswith(".json")
        ]

    @staticmethod
    def _build_schema(json_document):
        """
        Method to build a spark struct schema with one string column per top-level key

        Parameters
        ----------
            json_document (str): A JSON object serialised as text

        Returns
        -------
            schema (str): DDL formatted struct schema, e.g. "struct<`a` string,`b` string>"
        """
        # .keys() is deliberate: a non-object document fails here instead of
        # silently producing a meaningless schema
        columns = ",".join(
            f"`{key}` string" for key in json.loads(json_document).keys()
        )
        return f"struct<{columns}>"

    def _data_registration(self, item_name):
        """
        Method to register items and slots

        Parameters
        ----------
            item_name (str): Name of the file to be uploaded

        Returns
        -------
            upload_id (str): the upload id for the file to be uploaded

        Raises
        ------
            RuntimeError if the request fails or the response lists blocked items
        """
        url = f"{self.BASE_URL}/teams/{self.team_slug}/items/register_upload"

        payload = {
            "items": [
                {
                    "slots": [{"tags": [], "file_name": item_name, "slot_name": "0"}],
                    "name": item_name,
                    "layout": None,
                    "path": "",
                    "tags": [],
                }
            ],
            "dataset_slug": self.dataset_slug,
        }

        response = requests.post(url, headers=self.headers, json=payload)
        self._check_response(response, f"register upload for '{item_name}'")
        json_response = response.json()

        if json_response.get("blocked_items"):
            raise RuntimeError(f"{json_response}")
        return json_response["items"][0]["slots"][0]["upload_id"]

    def _sign_upload(self, upload_id):
        """
        Method to sign upload for an item

        Parameters
        ----------
            upload_id (str): Upload id generated by data_registration() for file to be uploaded

        Returns
        -------
            upload_url (str): the upload url for the file

        Raises
        ------
            RuntimeError if the request fails
        """
        url = f"{self.BASE_URL}/teams/{self.team_slug}/items/uploads/{upload_id}/sign"

        response = requests.get(url, headers=self.headers)
        self._check_response(response, f"sign upload '{upload_id}'")
        return response.json()["upload_url"]

    def _upload_binary(self, item_path, upload_url):
        """
        Method to upload item data to the V7 platform

        Parameters
        ----------
            item_path (str): Accessible open or presigned url for the image
            upload_url (str): The upload url for the file

        Returns
        -------
            None

        Raises
        ------
            RuntimeError if upload failed
        """
        # The whole file is read into memory before being sent on
        with urllib.request.urlopen(self._encode_url(item_path)) as source:
            data = source.read()

        response = requests.put(
            url=upload_url,
            data=data,
            headers={"Content-Type": "application/octet-stream"},
        )

        if not response.ok:
            raise RuntimeError(f"Issue uploading {item_path} data to V7")

    def _confirm(self, upload_id):
        """
        Method to confirm an upload for a particular upload_id

        Parameters
        ----------
            upload_id (str): The generated id for the file to be loaded

        Returns
        -------
            response (requests response): the response from the confirm request

        Raises
        ------
            RuntimeError if the request fails
        """
        url = f"{self.BASE_URL}/teams/{self.team_slug}/items/uploads/{upload_id}/confirm"
        response = requests.post(url, headers=self.headers)
        self._check_response(response, f"confirm upload '{upload_id}'")
        return response

    def _upload_item(self, item_name, item_path):
        """
        Method to call all upload methods and upload a specific item to V7

        Parameters
        ----------
            item_name (str): Name of the file to be uploaded
            item_path (str): Accessible open or presigned url for the image

        Returns
        -------
            None

        Raises
        ------
            RuntimeError if registering, signing, uploading or confirming fails
        """
        upload_id = self._data_registration(item_name)
        # Nothing to sign or upload when registration yields no upload id
        if upload_id is None:
            return

        upload_url = self._sign_upload(upload_id)
        self._upload_binary(item_path, upload_url)
        self._confirm(upload_id)

    def _get_export_url(self, export_name):
        """
        Method to get the url for the export to be downloaded

        Parameters
        ----------
            export_name (str): Name of the export in V7 that is to be downloaded

        Returns
        -------
            download_url (str): The url for the generated export that is to be downloaded

        Raises
        ------
            RuntimeError if the request fails or no export has the given name
        """
        url = f"{self.BASE_URL}/teams/{self.team_slug}/datasets/{self.dataset_slug}/exports"
        response = requests.get(url, headers=self.headers)
        self._check_response(response, f"fetch export '{export_name}'")

        # get the export zip url of the first export with a matching name
        for export_json in response.json():
            if export_json["name"] == export_name:
                return export_json["download_url"]

        raise RuntimeError(f"No export with name '{export_name}' found")

    def _download_export_zip(self, download_url):
        """
        From the export url, method to download the relevant darwin json export

        Parameters
        ----------
            download_url (str): The url for the generated export that is to be downloaded

        Returns
        -------
            (zipfile): Zipped set of darwin export JSON's, held in memory
        """
        # download the zip file from the URL; the connection is closed as soon
        # as the payload has been read
        with urllib.request.urlopen(download_url) as response:
            payload = response.read()

        return zipfile.ZipFile(BytesIO(payload))

    def _extract_export(self, zipfile, spark):
        """
        Method to write the darwin json results to a pyspark dataframe

        Every top-level key of the first JSON file becomes a string column;
        all files are parsed against that same schema.

        Parameters
        ----------
            zipfile (Zipfile): Zipped set of darwin export JSON's (the name shadows the zipfile module inside this method)
            spark (DarwinPyspark Spark Session): The spark session created to write the results to a pyspark table

        Returns
        -------
            df (pyspark dataframe): pyspark dataframe of the exported darwin json data

        Raises
        ------
            RuntimeError if the archive contains no JSON files
        """
        # Set Databricks user agent tag
        spark.conf.set("spark.databricks.agent.id", "darwinpyspark")

        # extract the JSON files
        json_files = self._read_json_members(zipfile)
        if not json_files:
            raise RuntimeError("Export archive contains no JSON files")

        # Define the schema for the JSON data
        schema = self._build_schema(json_files[0])

        # Create a DataFrame from the JSON data and parse the JSON strings
        df = spark.createDataFrame(json_files, "string")
        return df.select(from_json(df.value, schema).alias("data")).select("data.*")