Fixed SQL-Datacleaning task. (flyteorg#377)

Signed-off-by: Ketan Umare <[email protected]>
eapolinario · Sep 1, 2021 · 35e9e48 · 35e9e48
1 parent 54819a3
commit 35e9e48
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 5 deletions.
diff --git a/cookbook/case_studies/feature_engineering/sqlite_datacleaning/datacleaning_workflow.py b/cookbook/case_studies/feature_engineering/sqlite_datacleaning/datacleaning_workflow.py
@@ -17,14 +17,34 @@
 from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task
 from flytekit.types.schema import FlyteSchema
 
+
 # %%
 # Next, we define the reference tasks. A :py:func:`flytekit.reference_task` references the Flyte tasks that have already been defined, serialized, and registered.
 # The primary advantage of using a reference task is to reduce the redundancy; we needn't define the task(s) again if we have multiple datasets that need to be feature-engineered.
+#
+#  .. note::
+#
+#     The macro ``{{ registration.version }}`` is filled in at registration time by ``flytectl register``. It is usually not required when using reference tasks; you should
+#     ideally bind to a specific version of the entity (task or launch plan). In this example, however, we are registering both the actual task ``sqlite_datacleaning.tasks.mean_median_imputer``
+#     and the workflow that references it, so we want the reference to be updated to the version of the specific FlyteSnacks release. This is why we use the ``{{ registration.version }}`` macro.
+#     A typical reference task would look more like this:
+#
+#     .. code-block:: python
+#
+#          @reference_task(
+#              project="flytesnacks",
+#              domain="development",
+#              name="sqlite_datacleaning.tasks.mean_median_imputer",
+#              version="d06cebcfbeabc02b545eefa13a01c6ca992940c8",  # a Git SHA if versioning with Git, or e.g. "0.16.0" if using semver
+#          )
+#          def mean_median_imputer():
+#              ...
+#
 @reference_task(
     project="flytesnacks",
     domain="development",
     name="sqlite_datacleaning.tasks.mean_median_imputer",
-    version="fast4f51f7895819256f2540a08c97a51194",
+    version="{{ registration.version }}",
 )
 def mean_median_imputer(
     dataframe: pd.DataFrame,
@@ -37,7 +57,7 @@ def mean_median_imputer(
     project="flytesnacks",
     domain="development",
     name="sqlite_datacleaning.tasks.univariate_selection",
-    version="fast4f51f7895819256f2540a08c97a51194",
+    version="{{ registration.version }}",
 )
 def univariate_selection(
     dataframe: pd.DataFrame,

diff --git a/cookbook/deployment/deploying_workflows.py b/cookbook/deployment/deploying_workflows.py
@@ -1,11 +1,19 @@
 """
-Deploying Workflows
---------------------
+Deploying Workflows - Registration
+-----------------------------------
 
 Locally, Flytekit relies on the Python interpreter to execute both tasks and workflows.
 To leverage the full power of Flyte, we recommend using a deployed backend of Flyte. Flyte can be run
 on any Kubernetes cluster (e.g. a local cluster like `kind <https://kind.sigs.k8s.io/>`__), in a cloud environment,
-or on-prem.
+or on-prem. The process of deploying your workflows to a Flyte cluster is called Registration. It involves the
+following steps:
+
+1. Writing code, SQL, etc.
+2. Providing packaging in the form of Docker images, for code, when needed. In some cases you don't need packaging,
+   because the code itself is portable (for example, SQL), or the task references a remote service (for example,
+   SageMaker built-in algorithms), or the code can be safely transferred over.
+3. Alternatively, packaging with :ref:`deployment-fast-registration`.
+4. Registering the serialized workflows and tasks.
 
 Using remote Flyte gives you the ability to:
 

diff --git a/cookbook/deployment/fast_registration.py b/cookbook/deployment/fast_registration.py
@@ -1,4 +1,6 @@
 """
+.. _deployment-fast-registration:
+
 #################
 Fast Registration
 #################