diff --git a/.github/workflows/docs_build.yml b/.github/workflows/docs_build.yml new file mode 100644 index 0000000000..4fd71ce3b0 --- /dev/null +++ b/.github/workflows/docs_build.yml @@ -0,0 +1,26 @@ +name: Docs Build + +on: + push: + branches: + - master + pull_request: + branches: + - master +jobs: + docs_warnings: + name: Docs Warnings + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: "0" + - uses: actions/setup-python@v4 + with: + python-version: '3.9' + - name: Report Sphinx Warnings + id: sphinx-warnings + run: | + sudo apt-get install python3-sphinx + pip install -r doc-requirements.txt + cd docs && SPHINXOPTS="-W" make html diff --git a/.github/workflows/pythonbuild.yml b/.github/workflows/pythonbuild.yml index 2344b15391..2bc8f68d0c 100644 --- a/.github/workflows/pythonbuild.yml +++ b/.github/workflows/pythonbuild.yml @@ -65,6 +65,7 @@ jobs: - flytekit-dbt - flytekit-deck-standard - flytekit-dolt + - flytekit-duckdb - flytekit-greatexpectations - flytekit-hive - flytekit-k8s-pod @@ -169,11 +170,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Fetch the code - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/checkout@v3 + - name: Set up Python 3.9 + uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3.9 - name: Install dependencies run: | python -m pip install --upgrade pip==21.2.4 setuptools wheel diff --git a/doc-requirements.in b/doc-requirements.in index a5b921481c..872201189a 100644 --- a/doc-requirements.in +++ b/doc-requirements.in @@ -47,4 +47,5 @@ ray # ray scikit-learn # scikit-learn dask[distributed] # dask vaex # vaex -mlflow # mlflow +mlflow # mlflow +duckdb # duckdb diff --git a/doc-requirements.txt b/doc-requirements.txt index 2eb0532253..98a84f41c9 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -14,9 +14,9 @@ aiosignal==1.3.1 # via ray alabaster==0.7.13 # via sphinx -alembic==1.9.2 +alembic==1.9.3 # via mlflow 
-altair==4.2.2 +altair==4.2.0 # via great-expectations ansiwrap==0.8.4 # via papermill @@ -27,6 +27,10 @@ anyio==3.6.2 # watchfiles aplus==0.11.0 # via vaex-core +appnope==0.1.3 + # via + # ipykernel + # ipython argon2-cffi==21.3.0 # via # jupyter-server @@ -38,7 +42,7 @@ arrow==1.2.3 # via # isoduration # jinja2-time -astroid==2.14.1 +astroid==2.14.2 # via sphinx-autoapi astropy==5.2.1 # via vaex-astro @@ -67,7 +71,7 @@ blake3==0.3.3 # via vaex-core bleach==6.0.0 # via nbconvert -botocore==1.29.61 +botocore==1.29.72 # via -r doc-requirements.in bqplot==0.12.36 # via @@ -125,17 +129,16 @@ cookiecutter==2.1.1 # via flytekit croniter==1.3.8 # via flytekit -cryptography==39.0.0 +cryptography==39.0.1 # via # -r doc-requirements.in # great-expectations # pyopenssl - # secretstorage css-html-js-minify==2.5.5 # via sphinx-material cycler==0.11.0 # via matplotlib -dask[distributed]==2023.1.1 +dask[distributed]==2023.2.0 # via # -r doc-requirements.in # distributed @@ -160,7 +163,7 @@ diskcache==5.4.0 # via flytekit distlib==0.3.6 # via virtualenv -distributed==2023.1.1 +distributed==2023.2.0 # via dask docker==6.0.1 # via @@ -177,8 +180,10 @@ docutils==0.17.1 # sphinx-panels dolt-integrations==0.1.5 # via -r doc-requirements.in -doltcli==0.1.17 +doltcli==0.1.18 # via dolt-integrations +duckdb==0.7.0 + # via -r doc-requirements.in entrypoints==0.4 # via # altair @@ -186,7 +191,7 @@ entrypoints==0.4 # papermill executing==1.2.0 # via stack-data -fastapi==0.89.1 +fastapi==0.92.0 # via vaex-server fastjsonschema==2.16.2 # via nbformat @@ -195,11 +200,11 @@ filelock==3.9.0 # ray # vaex-core # virtualenv -flask==2.2.2 +flask==2.2.3 # via mlflow flatbuffers==23.1.21 # via tensorflow -flyteidl==1.3.5 +flyteidl==1.3.7 # via flytekit fonttools==4.38.0 # via matplotlib @@ -260,7 +265,7 @@ googleapis-common-protos==1.58.0 # flytekit # google-api-core # grpcio-status -great-expectations==0.15.46 +great-expectations==0.15.48 # via -r doc-requirements.in greenlet==2.0.2 # via sqlalchemy 
@@ -291,7 +296,7 @@ htmlmin==0.1.12 # via ydata-profiling httptools==0.5.0 # via uvicorn -identify==2.5.17 +identify==2.5.18 # via pre-commit idna==3.4 # via @@ -315,7 +320,7 @@ importlib-metadata==5.2.0 # sphinx ipydatawidgets==4.3.2 # via pythreejs -ipykernel==6.20.2 +ipykernel==6.21.2 # via # ipywidgets # jupyter @@ -325,9 +330,9 @@ ipykernel==6.20.2 # qtconsole ipyleaflet==0.17.2 # via vaex-jupyter -ipympl==0.9.2 +ipympl==0.9.3 # via vaex-jupyter -ipython==8.9.0 +ipython==8.10.0 # via # great-expectations # ipykernel @@ -371,10 +376,6 @@ jaraco-classes==3.2.3 # via keyring jedi==0.18.2 # via ipython -jeepney==0.8.0 - # via - # keyring - # secretstorage jinja2==3.1.2 # via # altair @@ -426,11 +427,13 @@ jupyter-client==8.0.2 # nbclient # notebook # qtconsole -jupyter-console==6.4.4 +jupyter-console==6.5.1 # via jupyter jupyter-core==5.2.0 # via + # ipykernel # jupyter-client + # jupyter-console # jupyter-server # nbclassic # nbclient @@ -440,7 +443,7 @@ jupyter-core==5.2.0 # qtconsole jupyter-events==0.6.3 # via jupyter-server -jupyter-server==2.2.0 +jupyter-server==2.3.0 # via # nbclassic # notebook-shim @@ -458,7 +461,7 @@ keyring==23.13.1 # via flytekit kiwisolver==1.4.4 # via matplotlib -kubernetes==25.3.0 +kubernetes==26.1.0 # via # -r doc-requirements.in # flytekit @@ -516,7 +519,7 @@ matplotlib-inline==0.1.6 # ipython mdurl==0.1.2 # via markdown-it-py -mistune==2.0.4 +mistune==2.0.5 # via # great-expectations # nbconvert @@ -534,7 +537,7 @@ multimethod==1.9.1 # via # visions # ydata-profiling -mypy-extensions==0.4.3 +mypy-extensions==1.0.0 # via typing-inspect natsort==8.2.0 # via flytekit @@ -618,16 +621,6 @@ numpy==1.23.5 # visions # xarray # ydata-profiling -nvidia-cublas-cu11==11.10.3.66 - # via - # nvidia-cudnn-cu11 - # torch -nvidia-cuda-nvrtc-cu11==11.7.99 - # via torch -nvidia-cuda-runtime-cu11==11.7.99 - # via torch -nvidia-cudnn-cu11==8.5.0.96 - # via torch oauthlib==3.2.2 # via # databricks-cli @@ -701,13 +694,13 @@ pillow==9.4.0 # matplotlib # 
vaex-viz # visions -platformdirs==2.6.2 +platformdirs==3.0.0 # via # jupyter-core # virtualenv plotly==5.13.0 # via -r doc-requirements.in -pre-commit==3.0.2 +pre-commit==3.0.4 # via sphinx-tags progressbar2==4.2.0 # via vaex-core @@ -766,7 +759,7 @@ pyasn1-modules==0.2.8 # via google-auth pycparser==2.21 # via cffi -pydantic==1.10.4 +pydantic==1.10.5 # via # fastapi # great-expectations @@ -795,7 +788,7 @@ pyparsing==3.0.9 # matplotlib pyrsistent==0.19.3 # via jsonschema -pyspark==3.3.1 +pyspark==3.3.2 # via -r doc-requirements.in python-dateutil==2.8.2 # via @@ -812,7 +805,7 @@ python-dateutil==2.8.2 # whylabs-client python-dotenv==0.21.1 # via uvicorn -python-json-logger==2.0.4 +python-json-logger==2.0.6 # via # flytekit # jupyter-events @@ -820,7 +813,7 @@ python-slugify[unidecode]==8.0.0 # via # cookiecutter # sphinx-material -python-utils==3.4.5 +python-utils==3.5.2 # via progressbar2 pythreejs==2.4.1 # via ipyvolume @@ -858,6 +851,7 @@ pyzmq==25.0.0 # via # ipykernel # jupyter-client + # jupyter-console # jupyter-server # nbclassic # notebook @@ -933,8 +927,6 @@ scipy==1.9.3 # ydata-profiling seaborn==0.12.2 # via ydata-profiling -secretstorage==3.3.3 - # via keyring send2trash==1.8.0 # via # jupyter-server @@ -971,7 +963,7 @@ sortedcontainers==2.4.0 # via # distributed # flytekit -soupsieve==2.3.2.post1 +soupsieve==2.4 # via beautifulsoup4 sphinx==4.5.0 # via @@ -1034,7 +1026,7 @@ sqlparse==0.4.3 # via mlflow stack-data==0.6.2 # via ipython -starlette==0.22.0 +starlette==0.25.0 # via fastapi statsd==3.3.0 # via flytekit @@ -1048,7 +1040,7 @@ tangled-up-in-unicode==0.2.0 # via visions tblib==1.7.0 # via distributed -tenacity==8.1.0 +tenacity==8.2.1 # via # papermill # plotly @@ -1116,6 +1108,7 @@ traitlets==5.9.0 # ipyvolume # ipywidgets # jupyter-client + # jupyter-console # jupyter-core # jupyter-events # jupyter-server @@ -1135,11 +1128,13 @@ traittypes==0.2.1 # ipydatawidgets # ipyleaflet # ipyvolume +typed-ast==1.5.4 + # via doltcli typeguard==2.13.3 # 
via ydata-profiling -types-toml==0.10.8.1 +types-toml==0.10.8.4 # via responses -typing-extensions==4.4.0 +typing-extensions==4.5.0 # via # astroid # flytekit @@ -1204,7 +1199,7 @@ vaex-viz==0.5.4 # via # vaex # vaex-jupyter -virtualenv==20.17.1 +virtualenv==20.19.0 # via # pre-commit # ray @@ -1220,14 +1215,14 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.5.0 +websocket-client==1.5.1 # via # docker # jupyter-server # kubernetes websockets==10.4 # via uvicorn -werkzeug==2.2.2 +werkzeug==2.2.3 # via # flask # tensorboard @@ -1235,12 +1230,10 @@ wheel==0.38.4 # via # astunparse # flytekit - # nvidia-cublas-cu11 - # nvidia-cuda-runtime-cu11 # tensorboard -whylabs-client==0.4.3 +whylabs-client==0.4.2 # via -r doc-requirements.in -whylogs==1.1.24 +whylogs==1.1.26 # via -r doc-requirements.in whylogs-sketching==3.4.1.dev3 # via whylogs @@ -1253,7 +1246,7 @@ wrapt==1.14.1 # flytekit # pandera # tensorflow -xarray==2023.1.0 +xarray==2023.2.0 # via vaex-jupyter xyzservices==2022.9.0 # via ipyleaflet @@ -1261,7 +1254,7 @@ ydata-profiling==4.0.0 # via pandas-profiling zict==2.2.0 # via distributed -zipp==3.12.0 +zipp==3.13.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/docs/source/plugins/duckdb.rst b/docs/source/plugins/duckdb.rst new file mode 100644 index 0000000000..bde0783e3a --- /dev/null +++ b/docs/source/plugins/duckdb.rst @@ -0,0 +1,12 @@ +.. _duckdb: + +################################################### +DuckDB API reference +################################################### + +.. tags:: Integration, Data, Analytics + +.. 
automodule:: flytekitplugins.duckdb + :no-members: + :no-inherited-members: + :no-special-members: diff --git a/docs/source/plugins/index.rst b/docs/source/plugins/index.rst index 693587192e..e56e9230db 100644 --- a/docs/source/plugins/index.rst +++ b/docs/source/plugins/index.rst @@ -31,6 +31,7 @@ Plugin API reference * :ref:`DBT ` - DBT API reference * :ref:`Vaex ` - Vaex API reference * :ref:`MLflow ` - MLflow API reference +* :ref:`DuckDB ` - DuckDB API reference .. toctree:: :maxdepth: 2 @@ -63,3 +64,4 @@ Plugin API reference DBT Vaex MLflow + DuckDB diff --git a/plugins/README.md b/plugins/README.md index 495ce91019..4abf809346 100644 --- a/plugins/README.md +++ b/plugins/README.md @@ -23,6 +23,7 @@ All the Flytekit plugins maintained by the core team are added here. It is not n | Snowflake | ```bash pip install flytekitplugins-snowflake``` | Use Snowflake as a 'data warehouse-as-a-service' within Flyte | [![PyPI version fury.io](https://badge.fury.io/py/flytekitplugins-snowflake.svg)](https://pypi.python.org/pypi/flytekitplugins-snowflake/) | Backend | | dbt | ```bash pip install flytekitplugins-dbt``` | Run dbt within Flyte | [![PyPI version fury.io](https://badge.fury.io/py/flytekitplugins-dbt.svg)](https://pypi.python.org/pypi/flytekitplugins-dbt/) | Flytekit-only | | Huggingface | ```bash pip install flytekitplugins-huggingface``` | Read & write Hugginface Datasets as Flyte StructuredDatasets | [![PyPI version fury.io](https://badge.fury.io/py/flytekitplugins-huggingface.svg)](https://pypi.python.org/pypi/flytekitplugins-huggingface/) | Flytekit-only | +| DuckDB | ```bash pip install flytekitplugins-duckdb``` | Run analytical workloads with ease using DuckDB. | [![PyPI version fury.io](https://badge.fury.io/py/flytekitplugins-duckdb.svg)](https://pypi.python.org/pypi/flytekitplugins-duckdb/) | Flytekit-only | ## Have a Plugin Idea? 
💡 Please [file an issue](https://github.com/flyteorg/flyte/issues/new?assignees=&labels=untriaged%2Cplugins&template=backend-plugin-request.md&title=%5BPlugin%5D). diff --git a/plugins/flytekit-duckdb/README.md b/plugins/flytekit-duckdb/README.md new file mode 100644 index 0000000000..b914e14505 --- /dev/null +++ b/plugins/flytekit-duckdb/README.md @@ -0,0 +1,9 @@ +# Flytekit DuckDB Plugin + +Run analytical workloads with ease using DuckDB. + +To install the plugin, run the following command: + +```bash +pip install flytekitplugins-duckdb +``` diff --git a/plugins/flytekit-duckdb/flytekitplugins/duckdb/__init__.py b/plugins/flytekit-duckdb/flytekitplugins/duckdb/__init__.py new file mode 100644 index 0000000000..7f46dbf52e --- /dev/null +++ b/plugins/flytekit-duckdb/flytekitplugins/duckdb/__init__.py @@ -0,0 +1,11 @@ +""" +.. currentmodule:: flytekitplugins.duckdb + +.. autosummary:: + :template: custom.rst + :toctree: generated/ + + DuckDBQuery +""" + +from .task import DuckDBQuery diff --git a/plugins/flytekit-duckdb/flytekitplugins/duckdb/task.py b/plugins/flytekit-duckdb/flytekitplugins/duckdb/task.py new file mode 100644 index 0000000000..ff69a0663f --- /dev/null +++ b/plugins/flytekit-duckdb/flytekitplugins/duckdb/task.py @@ -0,0 +1,117 @@ +from typing import Dict, List, NamedTuple, Optional, Union + +import duckdb +import pandas as pd +import pyarrow as pa + +from flytekit import PythonInstanceTask +from flytekit.extend import Interface +from flytekit.types.structured.structured_dataset import StructuredDataset + + +class QueryOutput(NamedTuple): + counter: int = -1 + output: Optional[str] = None + + +class DuckDBQuery(PythonInstanceTask): + _TASK_TYPE = "duckdb" + + def __init__( + self, + name: str, + query: Union[str, List[str]], + inputs: Optional[Dict[str, Union[StructuredDataset, list]]] = None, + **kwargs, + ): + """ + This method initializes the DuckDBQuery. 
+ + Args: + name: Name of the task + query: DuckDB query to execute + inputs: The query parameters to be used while executing the query + """ + self._query = query + # create an in-memory database that's non-persistent + self._con = duckdb.connect(":memory:") + + outputs = {"result": StructuredDataset} + + super(DuckDBQuery, self).__init__( + name=name, + task_type=self._TASK_TYPE, + task_config=None, + interface=Interface(inputs=inputs, outputs=outputs), + **kwargs, + ) + + def _execute_query(self, params: list, query: str, counter: int, multiple_params: bool): + """ + This method runs the DuckDBQuery. + + Args: + params: Query parameters to use while executing the query + query: DuckDB query to execute + counter: Use counter to map user-given arguments to the query parameters + multiple_params: Set flag to indicate the presence of params for multiple queries + """ + if any(x in query for x in ("$", "?")): + if multiple_params: + counter += 1 + if not counter < len(params): + raise ValueError("Parameter doesn't exist.") + if "insert" in query.lower(): + # run executemany disregarding the number of entries to store for an insert query + yield QueryOutput(output=self._con.executemany(query, params[counter]), counter=counter) + else: + yield QueryOutput(output=self._con.execute(query, params[counter]), counter=counter) + else: + if params: + yield QueryOutput(output=self._con.execute(query, params), counter=counter) + else: + raise ValueError("Parameter not specified.") + else: + yield QueryOutput(output=self._con.execute(query), counter=counter) + + def execute(self, **kwargs) -> StructuredDataset: + # TODO: Enable iterative download after adding the functionality to structured dataset code. 
+ params = None + for key in self.python_interface.inputs.keys(): + val = kwargs.get(key) + if isinstance(val, StructuredDataset): + # register structured dataset + self._con.register(key, val.open(pa.Table).all()) + elif isinstance(val, (pd.DataFrame, pa.Table)): + # register pandas dataframe/arrow table + self._con.register(key, val) + elif isinstance(val, list): + # copy val into params + params = val + else: + raise ValueError(f"Expected inputs of type StructuredDataset, str or list, received {type(val)}") + + final_query = self._query + query_output = QueryOutput() + # set flag to indicate the presence of params for multiple queries + multiple_params = isinstance(params[0], list) if params else False + + if isinstance(self._query, list) and len(self._query) > 1: + # loop until the penultimate query + for query in self._query[:-1]: + query_output = next( + self._execute_query( + params=params, query=query, counter=query_output.counter, multiple_params=multiple_params + ) + ) + final_query = self._query[-1] + + # fetch query output from the last query + # expecting a SELECT query + dataframe = next( + self._execute_query( + params=params, query=final_query, counter=query_output.counter, multiple_params=multiple_params + ) + ).output.arrow() + + return StructuredDataset(dataframe=dataframe) diff --git a/plugins/flytekit-duckdb/requirements.in b/plugins/flytekit-duckdb/requirements.in new file mode 100644 index 0000000000..1589691aa5 --- /dev/null +++ b/plugins/flytekit-duckdb/requirements.in @@ -0,0 +1,3 @@ +. 
+-e file:.#egg=flytekitplugins-duckdb +duckdb diff --git a/plugins/flytekit-duckdb/requirements.txt b/plugins/flytekit-duckdb/requirements.txt new file mode 100644 index 0000000000..c69007f914 --- /dev/null +++ b/plugins/flytekit-duckdb/requirements.txt @@ -0,0 +1,194 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile requirements.in +# +-e file:.#egg=flytekitplugins-duckdb + # via -r requirements.in +arrow==1.2.3 + # via jinja2-time +binaryornot==0.4.4 + # via cookiecutter +certifi==2022.12.7 + # via requests +cffi==1.15.1 + # via cryptography +chardet==5.1.0 + # via binaryornot +charset-normalizer==2.1.1 + # via requests +click==8.1.3 + # via + # cookiecutter + # flytekit +cloudpickle==2.2.0 + # via flytekit +cookiecutter==2.1.1 + # via flytekit +croniter==1.3.8 + # via flytekit +cryptography==39.0.0 + # via pyopenssl +dataclasses-json==0.5.7 + # via flytekit +decorator==5.1.1 + # via retry +deprecated==1.2.13 + # via flytekit +diskcache==5.4.0 + # via flytekit +docker==6.0.1 + # via flytekit +docker-image-py==0.1.12 + # via flytekit +docstring-parser==0.15 + # via flytekit +duckdb==0.6.1 + # via + # -r requirements.in + # flytekitplugins-duckdb +flyteidl==1.3.2 + # via flytekit +flytekit==1.3.0b6 + # via flytekitplugins-duckdb +googleapis-common-protos==1.58.0 + # via + # flyteidl + # flytekit + # grpcio-status +grpcio==1.51.1 + # via + # flytekit + # grpcio-status +grpcio-status==1.51.1 + # via flytekit +idna==3.4 + # via requests +importlib-metadata==6.0.0 + # via + # flytekit + # keyring +jaraco-classes==3.2.3 + # via keyring +jinja2==3.1.2 + # via + # cookiecutter + # jinja2-time +jinja2-time==0.2.0 + # via cookiecutter +joblib==1.2.0 + # via flytekit +keyring==23.13.1 + # via flytekit +markupsafe==2.1.1 + # via jinja2 +marshmallow==3.19.0 + # via + # dataclasses-json + # marshmallow-enum + # marshmallow-jsonschema +marshmallow-enum==1.5.1 + # via dataclasses-json +marshmallow-jsonschema==0.13.0 
+ # via flytekit +more-itertools==9.0.0 + # via jaraco-classes +mypy-extensions==0.4.3 + # via typing-inspect +natsort==8.2.0 + # via flytekit +numpy==1.23.5 + # via + # duckdb + # flytekit + # pandas + # pyarrow +packaging==23.0 + # via + # docker + # marshmallow +pandas==1.5.2 + # via flytekit +protobuf==4.21.12 + # via + # flyteidl + # googleapis-common-protos + # grpcio-status + # protoc-gen-swagger +protoc-gen-swagger==0.1.0 + # via flyteidl +py==1.11.0 + # via retry +pyarrow==10.0.1 + # via flytekit +pycparser==2.21 + # via cffi +pyopenssl==23.0.0 + # via flytekit +python-dateutil==2.8.2 + # via + # arrow + # croniter + # flytekit + # pandas +python-json-logger==2.0.4 + # via flytekit +python-slugify==7.0.0 + # via cookiecutter +pytimeparse==1.1.8 + # via flytekit +pytz==2022.7 + # via + # flytekit + # pandas +pyyaml==6.0 + # via + # cookiecutter + # flytekit +regex==2022.10.31 + # via docker-image-py +requests==2.28.1 + # via + # cookiecutter + # docker + # flytekit + # responses +responses==0.22.0 + # via flytekit +retry==0.9.2 + # via flytekit +six==1.16.0 + # via python-dateutil +sortedcontainers==2.4.0 + # via flytekit +statsd==3.3.0 + # via flytekit +text-unidecode==1.3 + # via python-slugify +toml==0.10.2 + # via responses +types-toml==0.10.8.1 + # via responses +typing-extensions==4.4.0 + # via + # flytekit + # typing-inspect +typing-inspect==0.8.0 + # via dataclasses-json +urllib3==1.26.13 + # via + # docker + # flytekit + # requests + # responses +websocket-client==1.4.2 + # via docker +wheel==0.38.4 + # via flytekit +wrapt==1.14.1 + # via + # deprecated + # flytekit +zipp==3.11.0 + # via importlib-metadata diff --git a/plugins/flytekit-duckdb/setup.py b/plugins/flytekit-duckdb/setup.py new file mode 100644 index 0000000000..f2642bbdb0 --- /dev/null +++ b/plugins/flytekit-duckdb/setup.py @@ -0,0 +1,36 @@ +from setuptools import setup + +PLUGIN_NAME = "duckdb" + +microlib_name = f"flytekitplugins-{PLUGIN_NAME}" + +plugin_requires = 
["flytekit>=1.3.0b2,<2.0.0", "duckdb"] + +__version__ = "0.0.0+develop" + +setup( + name=microlib_name, + version=__version__, + author="flyteorg", + author_email="admin@flyte.org", + description="DuckDB Plugin for Flytekit", + namespace_packages=["flytekitplugins"], + packages=[f"flytekitplugins.{PLUGIN_NAME}"], + install_requires=plugin_requires, + license="apache2", + python_requires=">=3.7,<3.11", + classifiers=[ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", + ], +) diff --git a/plugins/flytekit-duckdb/tests/test_task.py b/plugins/flytekit-duckdb/tests/test_task.py new file mode 100644 index 0000000000..5af73a45eb --- /dev/null +++ b/plugins/flytekit-duckdb/tests/test_task.py @@ -0,0 +1,145 @@ +from typing import List, Union + +import pandas as pd +import pyarrow as pa +from flytekitplugins.duckdb import DuckDBQuery +from typing_extensions import Annotated + +from flytekit import kwtypes, task, workflow +from flytekit.types.structured.structured_dataset import StructuredDataset + + +def test_simple(): + duckdb_task = DuckDBQuery(name="duckdb_task", query="SELECT SUM(a) FROM mydf", inputs=kwtypes(mydf=pd.DataFrame)) + + @workflow + def pandas_wf(mydf: pd.DataFrame) -> pd.DataFrame: + return duckdb_task(mydf=mydf) + + @workflow + def arrow_wf(mydf: pd.DataFrame) -> pa.Table: + return duckdb_task(mydf=mydf) + + df = pd.DataFrame({"a": [1, 2, 3]}) + assert isinstance(pandas_wf(mydf=df), pd.DataFrame) + assert isinstance(arrow_wf(mydf=df), pa.Table) + + +def 
test_parquet(): + duckdb_query = DuckDBQuery( + name="read_parquet", + query=[ + "INSTALL httpfs", + "LOAD httpfs", + """SELECT hour(lpep_pickup_datetime) AS hour, count(*) AS count FROM READ_PARQUET(?) GROUP BY hour""", + ], + inputs=kwtypes(params=List[str]), + ) + + @workflow + def parquet_wf(parquet_file: str) -> pd.DataFrame: + return duckdb_query(params=[parquet_file]) + + assert isinstance( + parquet_wf(parquet_file="https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet"), + pd.DataFrame, + ) + + +def test_arrow(): + duckdb_task = DuckDBQuery( + name="duckdb_arrow_task", query="SELECT * FROM arrow_table WHERE i = 2", inputs=kwtypes(arrow_table=pa.Table) + ) + + @task + def get_arrow_table() -> pa.Table: + return pa.Table.from_pydict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]}) + + @workflow + def arrow_wf(arrow_table: pa.Table) -> pa.Table: + return duckdb_task(arrow_table=arrow_table) + + assert isinstance(arrow_wf(arrow_table=get_arrow_table()), pa.Table) + + +def test_structured_dataset_arrow_table(): + duckdb_task = DuckDBQuery( + name="duckdb_sd_table", + query="SELECT * FROM arrow_table WHERE i = 2", + inputs=kwtypes(arrow_table=StructuredDataset), + ) + + @task + def get_arrow_table() -> StructuredDataset: + return StructuredDataset( + dataframe=pa.Table.from_pydict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]}) + ) + + @workflow + def arrow_wf(arrow_table: StructuredDataset) -> pa.Table: + return duckdb_task(arrow_table=arrow_table) + + assert isinstance(arrow_wf(arrow_table=get_arrow_table()), pa.Table) + + +def test_structured_dataset_pandas_df(): + duckdb_task = DuckDBQuery( + name="duckdb_sd_df", + query="SELECT * FROM pandas_df WHERE i = 2", + inputs=kwtypes(pandas_df=StructuredDataset), + ) + + @task + def get_pandas_df() -> StructuredDataset: + return StructuredDataset( + dataframe=pd.DataFrame.from_dict({"i": [1, 2, 3, 4], "j": ["one", "two", "three", "four"]}) + ) + + @workflow + def 
pandas_wf(pandas_df: StructuredDataset) -> pd.DataFrame: + return duckdb_task(pandas_df=pandas_df) + + assert isinstance(pandas_wf(pandas_df=get_pandas_df()), pd.DataFrame) + + +def test_distinct_params(): + duckdb_params_query = DuckDBQuery( + name="params_query", + query=[ + "CREATE TABLE items(item VARCHAR, value DECIMAL(10,2), count INTEGER)", + "INSERT INTO items VALUES (?, ?, ?)", + "SELECT $1 AS one, $1 AS two, $2 AS three", + ], + inputs=kwtypes(params=List[List[Union[str, List[Union[str, int]]]]]), + ) + + @task + def read_df(df: Annotated[StructuredDataset, kwtypes(one=str)]) -> pd.DataFrame: + return df.open(pd.DataFrame).all() + + @workflow + def params_wf(params: List[List[Union[str, List[Union[str, int]]]]]) -> pd.DataFrame: + return read_df(df=duckdb_params_query(params=params)) + + params = [[["chainsaw", 500, 10], ["iphone", 300, 2]], ["duck", "goose"]] + wf_output = params_wf(params=params) + assert isinstance(wf_output, pd.DataFrame) + assert wf_output.columns.values == ["one"] + + +def test_insert_query_with_single_params(): + duckdb_params_query = DuckDBQuery( + name="params_query", + query=[ + "CREATE TABLE items(value DECIMAL(10,2))", + "INSERT INTO items VALUES (?)", + "SELECT * FROM items", + ], + inputs=kwtypes(params=List[List[List[int]]]), + ) + + @workflow + def params_wf(params: List[List[List[int]]]) -> pa.Table: + return duckdb_params_query(params=params) + + assert isinstance(params_wf(params=[[[500], [300], [2]]]), pa.Table) diff --git a/plugins/setup.py b/plugins/setup.py index de44797296..cef41e0a0b 100644 --- a/plugins/setup.py +++ b/plugins/setup.py @@ -15,6 +15,7 @@ "flytekitplugins-fsspec": "flytekit-data-fsspec", "flytekitplugins-dbt": "flytekit-dbt", "flytekitplugins-dolt": "flytekit-dolt", + "flytekitplugins-duckdb": "flytekit-duckdb", "flytekitplugins-great_expectations": "flytekit-greatexpectations", "flytekitplugins-hive": "flytekit-hive", "flytekitplugins-pod": "flytekit-k8s-pod",