datahub-project · hsheth2 · Dec 2, 2024 · Nov 29, 2024 · Dec 2, 2024 · Dec 2, 2024
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -57,7 +57,11 @@
     convert_to_cardinality,
 )
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.metadata.com.linkedin.pegasus2avro.schema import EditableSchemaMetadata
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    EditableSchemaMetadata,
+    NumberType,
+)
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -361,6 +365,8 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     platform: str
     env: str
 
+    column_types: Dict[str, str] = dataclasses.field(default_factory=dict)
+
     def _get_columns_to_profile(self) -> List[str]:
         if not self.config.any_field_level_metrics_enabled():
             return []
@@ -374,6 +380,7 @@ def _get_columns_to_profile(self) -> List[str]:
 
         for col_dict in self.dataset.columns:
             col = col_dict["name"]
+            self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
             if not self.config._allow_deny_patterns.allowed(
                 f"{self.dataset_name}.{col}"
@@ -430,6 +437,21 @@ def _get_column_type(self, column_spec: _SingleColumnSpec, column: str) -> None:
             self.dataset, column
         )
 
+        if column_spec.type_ == ProfilerDataType.UNKNOWN:
+            try:
+                datahub_field_type = resolve_sql_type(
+                    self.column_types[column], self.dataset.engine.dialect.name.lower()
+                )
+            except Exception as e:
+                logger.debug(
+                    f"Error resolving sql type {self.column_types[column]}: {e}"
+                )
+                datahub_field_type = None
+            if datahub_field_type is None:
+                return
+            if isinstance(datahub_field_type, NumberType):
+                column_spec.type_ = ProfilerDataType.NUMERIC
+
     @_run_with_query_combiner
     def _get_column_cardinality(
         self, column_spec: _SingleColumnSpec, column: str

diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_types.py
@@ -276,7 +276,6 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
     return VERTICA_SQL_TYPES_MAP[type_string]
 
 
-# see https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html
 SNOWFLAKE_TYPES_MAP: Dict[str, Any] = {
     "NUMBER": NumberType,
     "DECIMAL": NumberType,
@@ -312,6 +311,18 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
     "GEOGRAPHY": None,
 }
 
+
+def resolve_snowflake_modified_type(type_string: str) -> Any:
+    """Resolve a Snowflake type string to its mapped type class, or None."""
+    # Drop "(p)" / "(p,s)" / "(p, s)" arguments, e.g. 'DECIMAL(38,0)' or 'NUMBER(38, 0)'.
+    match = re.match(r"([a-zA-Z_]+)\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\)", type_string)
+    if match:
+        return SNOWFLAKE_TYPES_MAP.get(match.group(1))
+
+    # Fallback for types without precision/scale
+    return SNOWFLAKE_TYPES_MAP.get(type_string)
+
+
 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
 BIGQUERY_TYPES_MAP: Dict[str, Any] = {
     "STRING": StringType,
@@ -380,6 +391,7 @@ def resolve_vertica_modified_type(type_string: str) -> Any:
     "row": RecordType,
     "map": MapType,
     "array": ArrayType,
+    "json": RecordType,
 }
 
 # https://docs.aws.amazon.com/athena/latest/ug/data-types.html
@@ -490,7 +502,7 @@ def resolve_sql_type(
             TypeClass = resolve_vertica_modified_type(column_type)
         elif platform == "snowflake":
             # Snowflake types are uppercase, so we check that.
-            TypeClass = _merged_mapping.get(column_type.upper())
+            TypeClass = resolve_snowflake_modified_type(column_type.upper())
 
     if TypeClass:
         return TypeClass()