forked from flyteorg/flytekit
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Gate new Structured Dataset feature & remove old config objects (flyteorg#831)
Signed-off-by: Yee Hing Tong <[email protected]>
- Loading branch information
1 parent
9338b0d
commit b431c0d
Showing
21 changed files
with
324 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,23 @@ | ||
from flytekit.configuration.sdk import USE_STRUCTURED_DATASET | ||
from flytekit.loggers import logger | ||
|
||
from .basic_dfs import ( | ||
ArrowToParquetEncodingHandler, | ||
PandasToParquetEncodingHandler, | ||
ParquetToArrowDecodingHandler, | ||
ParquetToPandasDecodingHandler, | ||
) | ||
|
||
try: | ||
from .bigquery import ( | ||
ArrowToBQEncodingHandlers, | ||
BQToArrowDecodingHandler, | ||
BQToPandasDecodingHandler, | ||
PandasToBQEncodingHandlers, | ||
) | ||
except ImportError: | ||
logger.info( | ||
"We won't register bigquery handler for structured dataset because " | ||
"we can't find the packages google-cloud-bigquery-storage and google-cloud-bigquery" | ||
if USE_STRUCTURED_DATASET.get(): | ||
from .basic_dfs import ( | ||
ArrowToParquetEncodingHandler, | ||
PandasToParquetEncodingHandler, | ||
ParquetToArrowDecodingHandler, | ||
ParquetToPandasDecodingHandler, | ||
) | ||
|
||
try: | ||
from .bigquery import ( | ||
ArrowToBQEncodingHandlers, | ||
BQToArrowDecodingHandler, | ||
BQToPandasDecodingHandler, | ||
PandasToBQEncodingHandlers, | ||
) | ||
except ImportError: | ||
logger.info( | ||
"We won't register bigquery handler for structured dataset because " | ||
"we can't find the packages google-cloud-bigquery-storage and google-cloud-bigquery" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,7 @@ | ||
from .schema import ParquetToSparkDecodingHandler, SparkToParquetEncodingHandler | ||
from flytekit.configuration.sdk import USE_STRUCTURED_DATASET | ||
|
||
from .schema import SparkDataFrameSchemaReader, SparkDataFrameSchemaWriter, SparkDataFrameTransformer # noqa | ||
from .task import Spark, new_spark_session | ||
|
||
if USE_STRUCTURED_DATASET.get(): | ||
from .sd_transformers import ParquetToSparkDecodingHandler, SparkToParquetEncodingHandler |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 49 additions & 0 deletions
49
plugins/flytekit-spark/flytekitplugins/spark/sd_transformers.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import typing | ||
|
||
from pyspark.sql.dataframe import DataFrame | ||
|
||
from flytekit import FlyteContext | ||
from flytekit.models import literals | ||
from flytekit.models.literals import StructuredDatasetMetadata | ||
from flytekit.models.types import StructuredDatasetType | ||
from flytekit.types.structured.structured_dataset import ( | ||
FLYTE_DATASET_TRANSFORMER, | ||
PARQUET, | ||
StructuredDataset, | ||
StructuredDatasetDecoder, | ||
StructuredDatasetEncoder, | ||
) | ||
|
||
|
||
class SparkToParquetEncodingHandler(StructuredDatasetEncoder):
    """Structured-dataset encoder that writes a Spark ``DataFrame`` as Parquet."""

    def __init__(self, protocol: str):
        """Initialise the base encoder with ``DataFrame``, *protocol* and ``PARQUET``.

        :param protocol: storage protocol string forwarded to the base encoder.
        """
        super().__init__(DataFrame, protocol, PARQUET)

    def encode(
        self,
        ctx: FlyteContext,
        structured_dataset: StructuredDataset,
        structured_dataset_type: StructuredDatasetType,
    ) -> literals.StructuredDataset:
        """Write the wrapped Spark dataframe to Parquet and return its literal.

        :param ctx: context whose ``file_access`` supplies a random remote
            directory when the dataset carries no usable ``uri``.
        :param structured_dataset: wrapper providing ``uri`` and ``dataframe``.
        :param structured_dataset_type: type placed into the returned
            literal's metadata.
        :return: a ``literals.StructuredDataset`` pointing at the written path.
        """
        # Honour a caller-supplied uri; any falsy value (None, "") falls back
        # to a freshly generated remote directory.
        target_uri = typing.cast(str, structured_dataset.uri)
        if not target_uri:
            target_uri = ctx.file_access.get_random_remote_directory()

        spark_df = typing.cast(DataFrame, structured_dataset.dataframe)
        # "overwrite" replaces whatever already exists at the target location.
        parquet_writer = spark_df.write.mode("overwrite")
        parquet_writer.parquet(target_uri)

        metadata = StructuredDatasetMetadata(structured_dataset_type)
        return literals.StructuredDataset(uri=target_uri, metadata=metadata)
|
||
|
||
class ParquetToSparkDecodingHandler(StructuredDatasetDecoder):
    """Structured-dataset decoder that reads Parquet into a Spark ``DataFrame``."""

    def __init__(self, protocol: str):
        """Initialise the base decoder with ``DataFrame``, *protocol* and ``PARQUET``.

        :param protocol: storage protocol string forwarded to the base decoder.
        """
        super().__init__(DataFrame, protocol, PARQUET)

    def decode(
        self,
        ctx: FlyteContext,
        flyte_value: literals.StructuredDataset,
    ) -> DataFrame:
        """Read the Parquet data at ``flyte_value.uri`` with the Spark session.

        :param ctx: accepted for the decoder interface; not used here -- the
            Spark session is taken from ``FlyteContext.current_context()``.
        :param flyte_value: literal whose ``uri`` locates the Parquet data.
        :return: the Spark ``DataFrame`` produced by the session's reader.
        """
        user_params = FlyteContext.current_context().user_space_params
        parquet_reader = user_params.spark_session.read
        return parquet_reader.parquet(flyte_value.uri)
|
||
|
||
# Register the Spark <-> Parquet handlers for each listed protocol, marking
# each one as the default for its type. For every protocol the encoder is
# registered first, then the decoder.
for protocol in ("/", "s3"):
    for _handler_cls in (SparkToParquetEncodingHandler, ParquetToSparkDecodingHandler):
        FLYTE_DATASET_TRANSFORMER.register_handler(_handler_cls(protocol), default_for_type=True)
Oops, something went wrong.