feat(ingestion/glue): delta schemas #10299
Changes from 6 commits
```diff
@@ -1,3 +1,4 @@
+import json
 import logging
 from collections import defaultdict
 from dataclasses import dataclass, field as dataclass_field
```
```diff
@@ -98,6 +99,7 @@
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column

 logger = logging.getLogger(__name__)
```
```diff
@@ -204,6 +206,8 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
     num_job_script_failed_parsing: int = 0
     num_job_without_nodes: int = 0
     num_dataset_to_dataset_edges_in_job: int = 0
+    num_dataset_schema_invalid: int = 0
+    num_dataset_buggy_delta_schema: int = 0

     def report_table_scanned(self) -> None:
         self.tables_scanned += 1
```
```diff
@@ -1148,9 +1152,35 @@ def get_s3_tags() -> Optional[GlobalTagsClass]:
             return new_tags

         def get_schema_metadata() -> Optional[SchemaMetadata]:
-            if not table.get("StorageDescriptor"):
-                return None
+            def is_delta_schema(columns: Optional[List[Mapping[str, Any]]]) -> bool:
```

Reviewer: Defining functions within other functions is discouraged as per coding style. Can you please move this function outside?

Reviewer: Can this be refactored to accept both tableParameters and tableStorageDescriptor and then return a boolean? This function can subsume the `StorageDescriptor` check below as well.

Author: While I do agree, I just followed the existing pattern in the code. So, are you suggesting to move this function outside?

Author: Just moved the function outside.

```diff
+                return (
+                    columns is not None
+                    and (len(columns) == 1)
+                    and (columns[0].get("Name", "") == "col")
+                    and (columns[0].get("Type", "") == "array<string>")
+                )
+
+            # https://github.com/delta-io/delta/pull/2310
+            provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
+            num_parts = int(
+                table.get("Parameters", {}).get(
+                    "spark.sql.sources.schema.numParts", "0"
+                )
+            )
+            columns = table.get("StorageDescriptor", {}).get("Columns", [{}])
+
+            if (provider == "delta") and (num_parts > 0) and is_delta_schema(columns):
+                return _get_delta_schema_metadata()
+
+            elif table.get("StorageDescriptor"):
+                return _get_glue_schema_metadata()
+
+            else:
+                return None
+
+        def _get_glue_schema_metadata() -> Optional[SchemaMetadata]:
+            assert table.get("StorageDescriptor")
```

Reviewer: We can remove this assert, as this is already checked earlier.

```diff
+
             schema = table["StorageDescriptor"]["Columns"]
             fields: List[SchemaField] = []
             for field in schema:
```
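For context on the branch above: Delta tables created by Spark can be registered in Glue with a placeholder single-column schema (`col: array<string>`) while the real schema lives as JSON in the table parameters (see delta-io/delta#2310). A minimal sketch, using a hypothetical table dict mirroring a boto3 `glue.get_table()` response, of the case that gets routed to `_get_delta_schema_metadata()`:

```python
# Hypothetical Glue table, shaped like boto3 glue.get_table()["Table"];
# the values are illustrative, not taken from the PR.
table = {
    "Parameters": {
        "spark.sql.sources.provider": "delta",
        "spark.sql.sources.schema.numParts": "1",
        "spark.sql.sources.schema.part.0": '{"type":"struct","fields":[]}',
    },
    "StorageDescriptor": {
        # placeholder schema Glue records for affected Delta tables
        "Columns": [{"Name": "col", "Type": "array<string>"}],
    },
}

provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
num_parts = int(
    table.get("Parameters", {}).get("spark.sql.sources.schema.numParts", "0")
)
columns = table.get("StorageDescriptor", {}).get("Columns", [{}])

# All three conditions from the patch hold for this table, so the real
# schema would be parsed out of the parameters rather than the columns.
assert provider == "delta" and num_parts > 0 and len(columns) == 1
assert columns[0]["Name"] == "col" and columns[0]["Type"] == "array<string>"
```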
```diff
@@ -1183,6 +1213,51 @@ def get_schema_metadata() -> Optional[SchemaMetadata]:
                 platformSchema=MySqlDDL(tableSchema=""),
             )

+        def _get_delta_schema_metadata() -> Optional[SchemaMetadata]:
+            assert (
+                table["Parameters"]["spark.sql.sources.provider"] == "delta"
+                and int(table["Parameters"]["spark.sql.sources.schema.numParts"]) > 0
+            )
+
+            try:
+                numParts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
+                schema_str = "".join(
+                    [
+                        table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
+                        for i in range(numParts)
+                    ]
+                )
+                schema_json = json.loads(schema_str)
+                fields: List[SchemaField] = []
+                for field in schema_json["fields"]:
+                    field_type = delta_type_to_hive_type(field.get("type", "unknown"))
+                    schema_fields = get_schema_fields_for_hive_column(
+                        hive_column_name=field["name"],
+                        hive_column_type=field_type,
+                        description=field.get("description"),
+                        default_nullable=bool(field.get("nullable", True)),
+                    )
+                    assert schema_fields
+                    fields.extend(schema_fields)
+
+                self.report.num_dataset_buggy_delta_schema += 1
+                return SchemaMetadata(
+                    schemaName=table_name,
+                    version=0,
+                    fields=fields,
+                    platform=f"urn:li:dataPlatform:{self.platform}",
+                    hash="",
+                    platformSchema=MySqlDDL(tableSchema=""),
+                )
+
+            except Exception as e:
+                self.report_warning(
+                    dataset_urn,
+                    f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
+                )
+                self.report.num_dataset_schema_invalid += 1
+                return None
+
         def get_data_platform_instance() -> DataPlatformInstanceClass:
             return DataPlatformInstanceClass(
                 platform=make_data_platform_urn(self.platform),
```
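The joined `spark.sql.sources.schema.part.{i}` values form a single Spark `StructType` JSON document (Spark splits long schema strings across numbered parts to stay under metastore parameter size limits). A small sketch, with hypothetical parameter values, of the join-and-parse step the code above performs:

```python
import json

# Hypothetical table parameters: one schema split across two parts.
params = {
    "spark.sql.sources.schema.numParts": "2",
    "spark.sql.sources.schema.part.0": '{"type":"struct","fields":[{"name":"id",',
    "spark.sql.sources.schema.part.1": '"type":"long","nullable":true,"metadata":{}}]}',
}

num_parts = int(params["spark.sql.sources.schema.numParts"])
schema_str = "".join(
    params[f"spark.sql.sources.schema.part.{i}"] for i in range(num_parts)
)
schema_json = json.loads(schema_str)
# -> {'type': 'struct',
#     'fields': [{'name': 'id', 'type': 'long', 'nullable': True, 'metadata': {}}]}
```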
```diff
@@ -0,0 +1,34 @@
+from typing import Any
+
+
+def delta_type_to_hive_type(field_type: Any) -> str:
+    if isinstance(field_type, str):
+        # primitive type: return the field type as-is
+        return field_type
+    else:
+        if field_type.get("type") == "array":
+            # if the array is of a complex type, recursively parse the
+            # element type and build the native datatype
+            return (
+                "array<" + delta_type_to_hive_type(field_type.get("elementType")) + ">"
+            )
+        elif field_type.get("type") == "struct":
+            parsed_struct = ""
+            for field in field_type.get("fields"):
+                # if a field is of a complex type, recursively parse it
+                # and build the native datatype
+                parsed_struct += (
+                    "{0}:{1}".format(
+                        field.get("name"),
+                        delta_type_to_hive_type(field.get("type")),
+                    )
+                    + ","
+                )
+            return "struct<" + parsed_struct.rstrip(",") + ">"
+        return ""
```
Author: Note for reviewers: I added these two counters mainly for testing. I'm OK to rename or even remove them.

Reviewer: This is nice. Let's please rename them:
num_dataset_schema_invalid -> num_dataset_invalid_delta_schema
num_dataset_buggy_delta_schema -> num_dataset_valid_delta_schema

Author: Addressed in d634b10.