From 43bac365bc927b493246e15fc3894f5ca7c0bdb3 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Mon, 8 Jul 2024 21:56:39 +0530 Subject: [PATCH] fix(ingestion/lookml): liquid template resolution and view-to-view cll (#10542) --- metadata-ingestion/setup.py | 3 +- .../ingestion/source/looker/looker_common.py | 342 +++- .../ingestion/source/looker/looker_config.py | 81 +- .../source/looker/looker_connection.py | 55 + .../source/looker/looker_dataclasses.py | 290 +++ .../source/looker/looker_file_loader.py | 113 ++ .../source/looker/looker_liquid_tag.py | 104 + .../ingestion/source/looker/looker_source.py | 3 +- .../source/looker/looker_template_language.py | 115 ++ .../source/looker/looker_view_id_cache.py | 120 ++ .../source/looker/lookml_concept_context.py | 414 ++++ .../ingestion/source/looker/lookml_config.py | 235 +++ .../source/looker/lookml_refinement.py | 251 +++ .../ingestion/source/looker/lookml_source.py | 1692 ++--------------- .../ingestion/source/looker/str_functions.py | 23 + .../ingestion/source/looker/urn_functions.py | 18 + .../ingestion/source/looker/view_upstream.py | 636 +++++++ .../tests/integration/looker/test_looker.py | 15 +- .../duplicate_field_ingestion_golden.json | 14 +- .../integration/lookml/expected_output.json | 258 ++- .../lookml/field_tag_ingestion_golden.json | 14 +- .../lookml/lkml_samples/liquid.view.lkml | 22 +- .../nested/fragment_derived.view.lkml | 6 +- .../included_view_file.view.lkml | 2 +- .../lookml/lkml_samples_hive/liquid.view.lkml | 2 +- .../nested/fragment_derived.view.lkml | 6 +- .../lookml/lookml_mces_api_bigquery.json | 224 ++- .../lookml/lookml_mces_api_hive2.json | 224 ++- .../lookml/lookml_mces_badsql_parser.json | 264 ++- .../lookml/lookml_mces_offline.json | 162 +- ...lookml_mces_offline_platform_instance.json | 162 +- .../lookml_mces_with_external_urls.json | 162 +- .../lookml/lookml_reachable_views.json | 242 ++- ...l_same_name_views_different_file_path.json | 215 ++- .../lookml/refinements_ingestion_golden.json | 262 ++- .../tests/integration/lookml/test_lookml.py | 127 +- .../activity_logs.view.lkml | 18 + .../data.model.lkml | 22 + .../employee_income_source.view.lkml | 40 + .../employee_tax_report.view.lkml | 18 + .../employee_total_income.view.lkml | 18 + .../top_10_employee_income_source.view.lkml | 26 + .../vv_lineage_liquid_template_golden.json | 1335 +++++++++++++ 43 files changed, 6451 insertions(+), 1904 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/looker_connection.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/str_functions.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/looker/urn_functions.py create mode 100644 
metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/activity_logs.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_tax_report.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_total_income.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/top_10_employee_income_source.view.lkml create mode 100644 metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index e25762925be29..ea88f1904d8a0 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -166,6 +166,7 @@ "sql-metadata==2.2.2", *sqllineage_lib, "GitPython>2", + "python-liquid", } bigquery_common = { @@ -371,7 +372,7 @@ "kafka-connect": sql_common | {"requests", "JPype1"}, "ldap": {"python-ldap>=2.4"}, "looker": looker_common, - "lookml": looker_common, + "lookml": looker_common | sqlglot_lib, "metabase": {"requests"} | sqlglot_lib, "mlflow": { "mlflow-skinny>=2.3.0", diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 1d7956a806558..ce135f90e828b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -8,7 +8,6 @@ from enum import Enum from functools import lru_cache from typing import ( - TYPE_CHECKING, Dict, Iterable, Iterator, @@ -38,12 +37,20 @@ from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.looker.looker_config import ( LookerCommonConfig, + LookerConnectionDefinition, LookerDashboardSourceConfig, NamingPatternMapping, ViewNamingPatternMapping, ) from datahub.ingestion.source.looker.looker_constant import IMPORTED_PROJECTS +from datahub.ingestion.source.looker.looker_dataclasses import ProjectInclude +from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI +from datahub.ingestion.source.looker.lookml_config import ( + _BASE_PROJECT_NAME, + LookMLSourceReport, +) +from datahub.ingestion.source.looker.str_functions import remove_suffix from datahub.ingestion.source.sql.sql_types import ( POSTGRES_TYPES_MAP, SNOWFLAKE_TYPES_MAP, @@ -93,17 +100,12 @@ TagSnapshotClass, ) from datahub.metadata.urns import TagUrn +from datahub.sql_parsing.sqlglot_lineage import ColumnRef from datahub.utilities.lossy_collections import LossyList, LossySet from datahub.utilities.url_util import remove_port_from_url CORPUSER_DATAHUB = "urn:li:corpuser:datahub" -if TYPE_CHECKING: - from datahub.ingestion.source.looker.lookml_source import ( - LookerViewFileLoader, - LookMLSourceReport, - ) - logger = logging.getLogger(__name__) @@ -126,11 +128,59 @@ class LookerFolderKey(ContainerKey): folder_id: str -def remove_suffix(original: str, suffix: str) -> str: - # This can be removed in favour of original.removesuffix for python>3.8 - if 
original.endswith(suffix):
-        return original[: -len(suffix)]
-    return original
+def deduplicate_fields(fields: List["ViewField"]) -> List["ViewField"]:
+    # Remove duplicate fields from the list.
+    # Logic: if more than one field has the same ViewField.name, keep only the field whose
+    # ViewField.field_type is DIMENSION_GROUP.
+    # Looker constraints:
+    # - Any field declared as a dimension or measure can be redefined as a dimension_group.
+    # - Any field declared as a dimension can't be redefined as a measure, and vice-versa.
+
+    dimension_group_field_names: List[str] = [
+        field.name
+        for field in fields
+        if field.field_type == ViewFieldType.DIMENSION_GROUP
+    ]
+
+    new_fields: List[ViewField] = []
+
+    for field in fields:
+        if (
+            field.name in dimension_group_field_names
+            and field.field_type != ViewFieldType.DIMENSION_GROUP
+        ):
+            continue
+
+        new_fields.append(field)
+
+    return new_fields
+
+
+def find_view_from_resolved_includes(
+    connection: Optional[LookerConnectionDefinition],
+    resolved_includes: List["ProjectInclude"],
+    looker_viewfile_loader: LookerViewFileLoader,
+    target_view_name: str,
+    reporter: LookMLSourceReport,
+) -> Optional[Tuple["ProjectInclude", dict]]:
+    # It could live in one of the included files. We do not know which file the base view
+    # lives in, so we try them all!
+    for include in resolved_includes:
+        included_looker_viewfile = looker_viewfile_loader.load_viewfile(
+            include.include,
+            include.project,
+            connection,
+            reporter,
+        )
+        if not included_looker_viewfile:
+            continue
+        for raw_view in included_looker_viewfile.views:
+            raw_view_name = raw_view["name"]
+            # Return the first raw view whose name matches the view we are trying to resolve.
+            if raw_view_name == target_view_name:
+                return include, raw_view
+
+    return None
 
 
 @dataclass
@@ -243,58 +293,170 @@ class ViewField:
     project_name: Optional[str] = None
     view_name: Optional[str] = None
     is_primary_key: bool = False
-    upstream_fields: List[str] = dataclasses_field(default_factory=list)
     tags: List[str] = dataclasses_field(default_factory=list)
+    # List of ColumnRef objects pointing at the upstream columns this field is derived from;
+    # for a derived view defined using SQL these reference the upstream table's columns.
+    upstream_fields: List[ColumnRef] = dataclasses_field(default_factory=list)
+
+    @classmethod
+    def view_fields_from_dict(
+        cls,
+        field_dict: Dict,
+        upstream_column_ref: List[ColumnRef],
+        type_cls: ViewFieldType,
+        populate_sql_logic_in_descriptions: bool,
+    ) -> "ViewField":
+
+        is_primary_key = field_dict.get("primary_key", "no") == "yes"
+
+        name = field_dict["name"]
+
+        native_type = field_dict.get("type", "string")
+
+        default_description = (
+            f"sql:{field_dict['sql']}"
+            if "sql" in field_dict and populate_sql_logic_in_descriptions
+            else ""
+        )
+
+        description = field_dict.get("description", default_description)
+
+        label = field_dict.get("label", "")
+
+        return ViewField(
+            name=name,
+            type=native_type,
+            label=label,
+            description=description,
+            is_primary_key=is_primary_key,
+            field_type=type_cls,
+            tags=field_dict.get("tags") or [],
+            upstream_fields=upstream_column_ref,
+        )
+
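A minimal sketch, not part of the patch, of what the new deduplicate_fields helper does: when a view declares both a dimension and a dimension_group with the same name, only the DIMENSION_GROUP entry survives. The keyword arguments mirror the ViewField dataclass above; the values are toy data.

fields = [
    ViewField(name="created", label="", description="", type="string",
              field_type=ViewFieldType.DIMENSION),
    ViewField(name="created", label="", description="", type="date_time",
              field_type=ViewFieldType.DIMENSION_GROUP),
    ViewField(name="city", label="", description="", type="string",
              field_type=ViewFieldType.DIMENSION),
]
# The duplicate dimension "created" is dropped; the dimension_group wins.
assert [(f.name, f.field_type) for f in deduplicate_fields(fields)] == [
    ("created", ViewFieldType.DIMENSION_GROUP),
    ("city", ViewFieldType.DIMENSION),
]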
 
 @dataclass
 class ExploreUpstreamViewField:
     explore: LookmlModelExplore
     field: LookmlModelExploreField
 
-    def _form_field_name(self):
+    def _form_field_name(
+        self,
+        view_project_map: Dict[str, str],
+        explore_project_name: str,
+        model_name: str,
+        upstream_views_file_path: Dict[str, Optional[str]],
+        config: LookerCommonConfig,
+    ) -> Optional[ColumnRef]:
         assert self.field.name is not None
 
         if len(self.field.name.split(".")) != 2:
-            return self.field.name  # Inconsistent info received
+            return None  # Inconsistent info received
 
-        view_name: Optional[str] = self.explore.name
+        assert self.explore.name
 
-        if (
-            self.field.original_view is not None
-        ):  # if `from` is used in explore then original_view is pointing to
-            # lookml view
-            view_name = self.field.original_view
+        # if `from` is used in the explore then original_view points to the underlying
+        # LookML view, so prefer it over the explore name
+        view_name: Optional[str] = (
+            self.field.original_view
+            if self.field.original_view is not None
+            else self.explore.name
+        )
 
         field_name = self.field.name.split(".")[1]
 
-        return f"{view_name}.{field_name}"
+        if (
+            self.field.field_group_variant is not None
+            and self.field.field_group_variant.lower() in field_name.lower()
+        ):
+            # remove variant at the end. +1 for "_"
+            field_name = field_name[
+                : -(len(self.field.field_group_variant.lower()) + 1)
+            ]
+
+        assert view_name  # silences a lint false positive
+
+        project_include: ProjectInclude = ProjectInclude(
+            project=view_project_map.get(view_name, _BASE_PROJECT_NAME),
+            include=view_name,
+        )
+
+        file_path: Optional[str] = (
+            upstream_views_file_path.get(view_name)
+            if upstream_views_file_path.get(view_name) is not None
+            else ViewFieldValue.NOT_AVAILABLE.value
+        )
+
+        assert file_path
+
+        view_urn = LookerViewId(
+            project_name=(
+                project_include.project
+                if project_include.project != _BASE_PROJECT_NAME
+                else explore_project_name
+            ),
+            model_name=model_name,
+            view_name=project_include.include,
+            file_path=file_path,
+        ).get_urn(config)
+
+        return ColumnRef(
+            table=view_urn,
+            column=field_name,
+        )
 
-    def upstream(self) -> str:
+    def upstream(
+        self,
+        view_project_map: Dict[str, str],
+        explore_project_name: str,
+        model_name: str,
+        upstream_views_file_path: Dict[str, Optional[str]],
+        config: LookerCommonConfig,
+    ) -> Optional[ColumnRef]:
         assert self.field.name is not None
 
         if self.field.dimension_group is None:  # It is not part of Dimensional Group
-            return self._form_field_name()
+            return self._form_field_name(
+                view_project_map,
+                explore_project_name,
+                model_name,
+                upstream_views_file_path,
+                config,
+            )
 
         if self.field.field_group_variant is None:
-            return (
-                self._form_field_name()
+            return self._form_field_name(
+                view_project_map,
+                explore_project_name,
+                model_name,
+                upstream_views_file_path,
+                config,
             )  # Variant i.e. Month, Day, Year ... is not available
 
         if self.field.type is None or not self.field.type.startswith("date_"):
-            return (
-                self._form_field_name()
+            return self._form_field_name(
+                view_project_map,
+                explore_project_name,
+                model_name,
+                upstream_views_file_path,
+                config,
            )  # for Dimensional Group the type is always start with date_[time|date]
 
         if not self.field.name.endswith(f"_{self.field.field_group_variant.lower()}"):
-            return (
-                self._form_field_name()
+            return self._form_field_name(
+                view_project_map,
+                explore_project_name,
+                model_name,
+                upstream_views_file_path,
+                config,
             )
 
         # if the explore field is generated because of Dimensional Group in View
         # then the field_name should ends with field_group_variant
-        return self._form_field_name()[
-            : -(len(self.field.field_group_variant.lower()) + 1)
-        ]  # remove variant at the end.
+1 for "_" + return self._form_field_name( + view_project_map, + explore_project_name, + model_name, + upstream_views_file_path, + config, + ) def create_view_project_map(view_fields: List[ViewField]) -> Dict[str, str]: @@ -481,9 +643,7 @@ def extract_project_name_from_source_file( return None @staticmethod - def _get_field_type( - native_type: str, reporter: SourceReport - ) -> SchemaFieldDataType: + def get_field_type(native_type: str) -> SchemaFieldDataType: type_class = LookerUtil.field_type_mapping.get(native_type) if type_class is None: @@ -603,7 +763,7 @@ def view_field_to_schema_field( ) -> SchemaField: return SchemaField( fieldPath=field.name, - type=LookerUtil._get_field_type(field.type, reporter), + type=LookerUtil.get_field_type(field.type), nativeDataType=field.type, label=field.label, description=( @@ -658,12 +818,6 @@ def create_query_request(q: dict, limit: Optional[str] = None) -> WriteQuery: ) -@dataclass(frozen=True, order=True) -class ProjectInclude: - project: str - include: str - - @dataclass class LookerExplore: name: str @@ -699,8 +853,8 @@ def from_dict( model_name: str, dict: Dict, resolved_includes: List[ProjectInclude], - looker_viewfile_loader: "LookerViewFileLoader", - reporter: "LookMLSourceReport", + looker_viewfile_loader: LookerViewFileLoader, + reporter: LookMLSourceReport, model_explores_map: Dict[str, dict], ) -> "LookerExplore": view_names: Set[str] = set() @@ -724,12 +878,6 @@ def from_dict( fields = cls._get_fields_from_sql_equality(sql_on) joins = fields - # HACK: We shouldn't be doing imports here. We also have - # circular imports that don't belong. - from datahub.ingestion.source.looker.lookml_source import ( - _find_view_from_resolved_includes, - ) - upstream_views: List[ProjectInclude] = [] # create the list of extended explores extends = list( @@ -756,7 +904,7 @@ def from_dict( else: # we only fallback to the view_names list if this is not an extended explore for view_name in view_names: - info = _find_view_from_resolved_includes( + info = find_view_from_resolved_includes( None, resolved_includes, looker_viewfile_loader, @@ -837,7 +985,7 @@ def from_api( # noqa: C901 reporter.report_warning( title="Missing View Name", message="The field was not prefixed by a view name. 
This can happen when the field references another dynamic field.", - context=view_name, + context=field_name, ) continue @@ -850,18 +998,14 @@ def from_api( # noqa: C901 views.add(view_name) view_fields: List[ViewField] = [] + field_name_vs_raw_explore_field: Dict = {} if explore.fields is not None: if explore.fields.dimensions is not None: for dim_field in explore.fields.dimensions: if dim_field.name is None: continue else: - dimension_upstream_field: ExploreUpstreamViewField = ( - ExploreUpstreamViewField( - explore=explore, - field=dim_field, - ) - ) + field_name_vs_raw_explore_field[dim_field.name] = dim_field view_fields.append( ViewField( @@ -893,9 +1037,7 @@ def from_api( # noqa: C901 if dim_field.primary_key else False ), - upstream_fields=[ - dimension_upstream_field.upstream() - ], + upstream_fields=[], ) ) if explore.fields.measures is not None: @@ -903,12 +1045,9 @@ def from_api( # noqa: C901 if measure_field.name is None: continue else: - measure_upstream_field: ExploreUpstreamViewField = ( - ExploreUpstreamViewField( - explore=explore, - field=measure_field, - ) - ) + field_name_vs_raw_explore_field[ + measure_field.name + ] = measure_field view_fields.append( ViewField( @@ -936,7 +1075,7 @@ def from_api( # noqa: C901 if measure_field.primary_key else False ), - upstream_fields=[measure_upstream_field.upstream()], + upstream_fields=[], ) ) @@ -953,6 +1092,28 @@ def from_api( # noqa: C901 if upstream_views_file_path: logger.debug(f"views and their file-paths: {upstream_views_file_path}") + # form upstream of fields as all information is now available + for view_field in view_fields: + measure_upstream_field: ExploreUpstreamViewField = ( + ExploreUpstreamViewField( + explore=explore, + field=field_name_vs_raw_explore_field[view_field.name], + ) + ) + + assert explore.project_name is not None + + column_ref: Optional[ColumnRef] = measure_upstream_field.upstream( + view_project_map=view_project_map, + explore_project_name=explore.project_name, + model_name=model, + upstream_views_file_path=upstream_views_file_path, + config=source_config, + ) + view_field.upstream_fields = ( + [column_ref] if column_ref is not None else [] + ) + return cls( name=explore_name, model_name=model, @@ -1066,7 +1227,8 @@ def _to_metadata_events( # noqa: C901 observed_lineage_ts = datetime.datetime.now(tz=datetime.timezone.utc) for view_ref in sorted(self.upstream_views): # set file_path to ViewFieldType.UNKNOWN if file_path is not available to keep backward compatibility - # if we raise error on file_path equal to None then existing test-cases will fail as mock data doesn't have required attributes. + # if we raise error on file_path equal to None then existing test-cases will fail as mock data + # doesn't have required attributes. 
file_path: str = ( cast(str, self.upstream_views_file_path[view_ref.include]) if self.upstream_views_file_path[view_ref.include] is not None @@ -1098,30 +1260,24 @@ def _to_metadata_events( # noqa: C901 fine_grained_lineages = [] if config.extract_column_level_lineage: for field in self.fields or []: - for upstream_field in field.upstream_fields: - if len(upstream_field.split(".")) >= 2: - (view_name, field_path) = upstream_field.split(".")[ - 0 - ], ".".join(upstream_field.split(".")[1:]) - assert view_name - view_urn = view_name_to_urn_map.get(view_name, "") - if view_urn: - fine_grained_lineages.append( - FineGrainedLineageClass( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - downstreamType=FineGrainedLineageDownstreamType.FIELD, - upstreams=[ - builder.make_schema_field_urn( - view_urn, field_path - ) - ], - downstreams=[ - builder.make_schema_field_urn( - self.get_explore_urn(config), field.name - ) - ], + for upstream_column_ref in field.upstream_fields: + fine_grained_lineages.append( + FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + downstreamType=FineGrainedLineageDownstreamType.FIELD, + upstreams=[ + builder.make_schema_field_urn( + upstream_column_ref.table, + upstream_column_ref.column, ) - ) + ], + downstreams=[ + builder.make_schema_field_urn( + self.get_explore_urn(config), field.name + ) + ], + ) + ) upstream_lineage = UpstreamLineage( upstreams=upstreams, fineGrainedLineages=fine_grained_lineages or None diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 8de213cfabaf0..5b774012e70da 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -1,13 +1,14 @@ import dataclasses import os import re -from typing import Any, ClassVar, Dict, List, Optional, Union, cast +from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union, cast import pydantic +from looker_sdk.sdk.api40.models import DBConnection from pydantic import Field, validator from datahub.configuration import ConfigModel -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.source_common import ( EnvConfigMixin, PlatformInstanceConfigMixin, @@ -143,6 +144,82 @@ class LookerCommonConfig(EnvConfigMixin, PlatformInstanceConfigMixin): ) +def _get_bigquery_definition( + looker_connection: DBConnection, +) -> Tuple[str, Optional[str], Optional[str]]: + platform = "bigquery" + # bigquery project ids are returned in the host field + db = looker_connection.host + schema = looker_connection.database + return platform, db, schema + + +def _get_generic_definition( + looker_connection: DBConnection, platform: Optional[str] = None +) -> Tuple[str, Optional[str], Optional[str]]: + if platform is None: + # We extract the platform from the dialect name + dialect_name = looker_connection.dialect_name + assert dialect_name is not None + # generally the first part of the dialect name before _ is the name of the platform + # versions are encoded as numbers and can be removed + # e.g. 
spark1 or hive2 or druid_18 + platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0]) + + assert ( + platform is not None + ), f"Failed to extract a valid platform from connection {looker_connection}" + db = looker_connection.database + schema = looker_connection.schema # ok for this to be None + return platform, db, schema + + +class LookerConnectionDefinition(ConfigModel): + platform: str + default_db: str + default_schema: Optional[str] # Optional since some sources are two-level only + platform_instance: Optional[str] = None + platform_env: Optional[str] = Field( + default=None, + description="The environment that the platform is located in. Leaving this empty will inherit defaults from " + "the top level Looker configuration", + ) + + @validator("platform_env") + def platform_env_must_be_one_of(cls, v: Optional[str]) -> Optional[str]: + if v is not None: + return EnvConfigMixin.env_must_be_one_of(v) + return v + + @validator("platform", "default_db", "default_schema") + def lower_everything(cls, v): + """We lower case all strings passed in to avoid casing issues later""" + if v is not None: + return v.lower() + + @classmethod + def from_looker_connection( + cls, looker_connection: DBConnection + ) -> "LookerConnectionDefinition": + """Dialect definitions are here: https://docs.looker.com/setup-and-management/database-config""" + extractors: Dict[str, Any] = { + "^bigquery": _get_bigquery_definition, + ".*": _get_generic_definition, + } + + if looker_connection.dialect_name is None: + raise ConfigurationError( + f"Unable to fetch a fully filled out connection for {looker_connection.name}. Please check your API permissions." + ) + for extractor_pattern, extracting_function in extractors.items(): + if re.match(extractor_pattern, looker_connection.dialect_name): + (platform, db, schema) = extracting_function(looker_connection) + return cls(platform=platform, default_db=db, default_schema=schema) + raise ConfigurationError( + f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}" + ) + + class LookerDashboardSourceConfig( LookerAPIConfig, LookerCommonConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_connection.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_connection.py new file mode 100644 index 0000000000000..2b7ce6f6da026 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_connection.py @@ -0,0 +1,55 @@ +import logging +from typing import Optional + +from looker_sdk.error import SDKError +from looker_sdk.sdk.api40.models import DBConnection + +from datahub.configuration.common import ConfigurationError +from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition +from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI +from datahub.ingestion.source.looker.lookml_config import ( + LookMLSourceConfig, + LookMLSourceReport, +) + +logger = logging.getLogger(__name__) + + +def get_connection_def_based_on_connection_string( + connection: str, + source_config: LookMLSourceConfig, + looker_client: Optional[LookerAPI], + reporter: LookMLSourceReport, +) -> Optional[LookerConnectionDefinition]: + if source_config.connection_to_platform_map is None: + source_config.connection_to_platform_map = {} + + assert source_config.connection_to_platform_map is not None + + connection_def: Optional[LookerConnectionDefinition] = None + + if connection in source_config.connection_to_platform_map: + 
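+        # A connection explicitly mapped in the recipe takes precedence; the Looker API
+        # lookup below is only the fallback.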
connection_def = source_config.connection_to_platform_map[connection] + elif looker_client: + try: + looker_connection: DBConnection = looker_client.connection(connection) + except SDKError: + logger.error( + f"Failed to retrieve connection {connection} from Looker. This usually happens when the " + f"credentials provided are not admin credentials." + ) + else: + try: + connection_def = LookerConnectionDefinition.from_looker_connection( + looker_connection + ) + + # Populate the cache (using the config map) to avoid calling looker again for this connection + source_config.connection_to_platform_map[connection] = connection_def + except ConfigurationError: + reporter.report_warning( + f"connection-{connection}", + "Failed to load connection from Looker", + ) + + return connection_def diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py new file mode 100644 index 0000000000000..adaa3c4875450 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_dataclasses.py @@ -0,0 +1,290 @@ +import glob +import logging +import pathlib +from dataclasses import dataclass +from typing import Dict, List, Optional, Set + +from datahub.ingestion.source.looker.lkml_patched import load_lkml +from datahub.ingestion.source.looker.looker_connection import LookerConnectionDefinition +from datahub.ingestion.source.looker.lookml_config import ( + _BASE_PROJECT_NAME, + _EXPLORE_FILE_EXTENSION, + LookMLSourceReport, +) + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, order=True) +class ProjectInclude: + project: str + include: str + + +@dataclass +class LookerField: + name: str + primary_key: str # possible values yes and no + type: str + sql: Optional[str] + + +@dataclass +class LookerModel: + connection: str + includes: List[str] + explores: List[dict] + resolved_includes: List[ProjectInclude] + + @staticmethod + def from_looker_dict( + looker_model_dict: dict, + base_project_name: str, + root_project_name: Optional[str], + base_projects_folders: Dict[str, pathlib.Path], + path: str, + reporter: LookMLSourceReport, + ) -> "LookerModel": + logger.debug(f"Loading model from {path}") + connection = looker_model_dict["connection"] + includes = looker_model_dict.get("includes", []) + resolved_includes = LookerModel.resolve_includes( + includes, + base_project_name, + root_project_name, + base_projects_folders, + path, + reporter, + seen_so_far=set(), + traversal_path=pathlib.Path(path).stem, + ) + logger.debug(f"{path} has resolved_includes: {resolved_includes}") + explores = looker_model_dict.get("explores", []) + + explore_files = [ + x.include + for x in resolved_includes + if x.include.endswith(_EXPLORE_FILE_EXTENSION) + ] + for included_file in explore_files: + try: + parsed = load_lkml(included_file) + included_explores = parsed.get("explores", []) + explores.extend(included_explores) + except Exception as e: + reporter.report_warning( + title="Error Loading Include", + message="Failed to load include file", + context=f"Include Details: {included_file}", + exc=e, + ) + # continue in this case, as it might be better to load and resolve whatever we can + + return LookerModel( + connection=connection, + includes=includes, + resolved_includes=resolved_includes, + explores=explores, + ) + + @staticmethod + def resolve_includes( + includes: List[str], + project_name: str, + root_project_name: Optional[str], + base_projects_folder: Dict[str, pathlib.Path], + path: 
str, + reporter: LookMLSourceReport, + seen_so_far: Set[str], + traversal_path: str = "", # a cosmetic parameter to aid debugging + ) -> List[ProjectInclude]: + """Resolve ``include`` statements in LookML model files to a list of ``.lkml`` files. + + For rules on how LookML ``include`` statements are written, see + https://docs.looker.com/data-modeling/getting-started/ide-folders#wildcard_examples + """ + + resolved = [] + for inc in includes: + # Filter out dashboards - we get those through the looker source. + if ( + inc.endswith(".dashboard") + or inc.endswith(".dashboard.lookml") + or inc.endswith(".dashboard.lkml") + ): + logger.debug(f"include '{inc}' is a dashboard, skipping it") + continue + + resolved_project_name = project_name + resolved_project_folder = str(base_projects_folder[project_name]) + + # Massage the looker include into a valid glob wildcard expression + if inc.startswith("//"): + # remote include, let's see if we have the project checked out locally + (remote_project, project_local_path) = inc[2:].split("/", maxsplit=1) + if remote_project in base_projects_folder: + resolved_project_folder = str(base_projects_folder[remote_project]) + glob_expr = f"{resolved_project_folder}/{project_local_path}" + resolved_project_name = remote_project + else: + logger.warning( + f"Resolving {inc} failed. Could not find a locally checked out reference for {remote_project}" + ) + continue + elif inc.startswith("/"): + glob_expr = f"{resolved_project_folder}{inc}" + + # The include path is sometimes '/{project_name}/{path_within_project}' + # instead of '//{project_name}/{path_within_project}' or '/{path_within_project}'. + # + # TODO: I can't seem to find any documentation on this pattern, but we definitely + # have seen it in the wild. Example from Mozilla's public looker-hub repo: + # https://github.com/mozilla/looker-hub/blob/f491ca51ce1add87c338e6723fd49bc6ae4015ca/fenix/explores/activation.explore.lkml#L7 + # As such, we try to handle it but are as defensive as possible. + + non_base_project_name = project_name + if project_name == _BASE_PROJECT_NAME and root_project_name is not None: + non_base_project_name = root_project_name + if non_base_project_name != _BASE_PROJECT_NAME and inc.startswith( + f"/{non_base_project_name}/" + ): + # This might be a local include. Let's make sure that '/{project_name}' doesn't + # exist as normal include in the project. + if not pathlib.Path( + f"{resolved_project_folder}/{non_base_project_name}" + ).exists(): + path_within_project = pathlib.Path(*pathlib.Path(inc).parts[2:]) + glob_expr = f"{resolved_project_folder}/{path_within_project}" + else: + # Need to handle a relative path. + glob_expr = str(pathlib.Path(path).parent / inc) + # "**" matches an arbitrary number of directories in LookML + # we also resolve these paths to absolute paths so we can de-dup effectively later on + included_files = [ + str(p.resolve()) + for p in [ + pathlib.Path(p) + for p in sorted( + glob.glob(glob_expr, recursive=True) + + glob.glob(f"{glob_expr}.lkml", recursive=True) + ) + ] + # We don't want to match directories. The '**' glob can be used to + # recurse into directories. 
+ if p.is_file() + ] + logger.debug( + f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}" + ) + if "*" not in inc and not included_files: + reporter.report_failure( + title="Error Resolving Include", + message=f"Cannot resolve include {inc}", + context=f"Path: {path}", + ) + elif not included_files: + reporter.report_failure( + title="Error Resolving Include", + message=f"Did not resolve anything for wildcard include {inc}", + context=f"Path: {path}", + ) + # only load files that we haven't seen so far + included_files = [x for x in included_files if x not in seen_so_far] + for included_file in included_files: + # Filter out dashboards - we get those through the looker source. + if ( + included_file.endswith(".dashboard") + or included_file.endswith(".dashboard.lookml") + or included_file.endswith(".dashboard.lkml") + ): + logger.debug( + f"include '{included_file}' is a dashboard, skipping it" + ) + continue + + logger.debug( + f"Will be loading {included_file}, traversed here via {traversal_path}" + ) + try: + parsed = load_lkml(included_file) + seen_so_far.add(included_file) + if "includes" in parsed: # we have more includes to resolve! + resolved.extend( + LookerModel.resolve_includes( + parsed["includes"], + resolved_project_name, + root_project_name, + base_projects_folder, + included_file, + reporter, + seen_so_far, + traversal_path=traversal_path + + "." + + pathlib.Path(included_file).stem, + ) + ) + except Exception as e: + reporter.report_warning( + title="Error Loading Include File", + message="Failed to load included file", + context=f"Include Details: {included_file}", + exc=e, + ) + # continue in this case, as it might be better to load and resolve whatever we can + + resolved.extend( + [ + ProjectInclude(project=resolved_project_name, include=f) + for f in included_files + ] + ) + return resolved + + +@dataclass +class LookerViewFile: + absolute_file_path: str + connection: Optional[LookerConnectionDefinition] + includes: List[str] + resolved_includes: List[ProjectInclude] + views: List[Dict] + raw_file_content: str + + @classmethod + def from_looker_dict( + cls, + absolute_file_path: str, + looker_view_file_dict: dict, + project_name: str, + root_project_name: Optional[str], + base_projects_folder: Dict[str, pathlib.Path], + raw_file_content: str, + reporter: LookMLSourceReport, + ) -> "LookerViewFile": + logger.debug(f"Loading view file at {absolute_file_path}") + includes = looker_view_file_dict.get("includes", []) + resolved_path = str(pathlib.Path(absolute_file_path).resolve()) + seen_so_far = set() + seen_so_far.add(resolved_path) + resolved_includes = LookerModel.resolve_includes( + includes, + project_name, + root_project_name, + base_projects_folder, + absolute_file_path, + reporter, + seen_so_far=seen_so_far, + ) + logger.debug( + f"resolved_includes for {absolute_file_path} is {resolved_includes}" + ) + views = looker_view_file_dict.get("views", []) + + return cls( + absolute_file_path=absolute_file_path, + connection=None, + includes=includes, + resolved_includes=resolved_includes, + views=views, + raw_file_content=raw_file_content, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py new file mode 100644 index 0000000000000..1b6619b4c4d28 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_file_loader.py @@ -0,0 +1,113 @@ +import logging +import pathlib +from 
dataclasses import replace +from typing import Any, Dict, Optional + +from datahub.ingestion.source.looker.lkml_patched import load_lkml +from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition +from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile +from datahub.ingestion.source.looker.looker_template_language import ( + resolve_liquid_variable_in_view_dict, +) +from datahub.ingestion.source.looker.lookml_config import ( + _EXPLORE_FILE_EXTENSION, + _VIEW_FILE_EXTENSION, + LookMLSourceReport, +) + +logger = logging.getLogger(__name__) + + +class LookerViewFileLoader: + """ + Loads the looker viewfile at a :path and caches the LookerViewFile in memory + This is to avoid reloading the same file off of disk many times during the recursive include resolution process + """ + + def __init__( + self, + root_project_name: Optional[str], + base_projects_folder: Dict[str, pathlib.Path], + reporter: LookMLSourceReport, + liquid_variable: Dict[Any, Any], + ) -> None: + self.viewfile_cache: Dict[str, LookerViewFile] = {} + self._root_project_name = root_project_name + self._base_projects_folder = base_projects_folder + self.reporter = reporter + self.liquid_variable = liquid_variable + + def is_view_seen(self, path: str) -> bool: + return path in self.viewfile_cache + + def _load_viewfile( + self, project_name: str, path: str, reporter: LookMLSourceReport + ) -> Optional[LookerViewFile]: + # always fully resolve paths to simplify de-dup + path = str(pathlib.Path(path).resolve()) + allowed_extensions = [_VIEW_FILE_EXTENSION, _EXPLORE_FILE_EXTENSION] + matched_any_extension = [ + match for match in [path.endswith(x) for x in allowed_extensions] if match + ] + if not matched_any_extension: + # not a view file + logger.debug( + f"Skipping file {path} because it doesn't appear to be a view file. 
Matched extensions {allowed_extensions}" + ) + return None + + if self.is_view_seen(str(path)): + return self.viewfile_cache[path] + + try: + with open(path) as file: + raw_file_content = file.read() + except Exception as e: + logger.debug(f"An error occurred while reading path {path}", exc_info=True) + self.reporter.report_failure( + path, f"failed to load view file {path} from disk: {e}" + ) + return None + try: + logger.debug(f"Loading viewfile {path}") + + parsed = load_lkml(path) + + resolve_liquid_variable_in_view_dict( + raw_view=parsed, + liquid_variable=self.liquid_variable, + ) + + looker_viewfile = LookerViewFile.from_looker_dict( + absolute_file_path=path, + looker_view_file_dict=parsed, + project_name=project_name, + root_project_name=self._root_project_name, + base_projects_folder=self._base_projects_folder, + raw_file_content=raw_file_content, + reporter=reporter, + ) + logger.debug(f"adding viewfile for path {path} to the cache") + self.viewfile_cache[path] = looker_viewfile + return looker_viewfile + except Exception as e: + logger.debug(f"An error occurred while parsing path {path}", exc_info=True) + self.reporter.report_failure(path, f"failed to load view file {path}: {e}") + return None + + def load_viewfile( + self, + path: str, + project_name: str, + connection: Optional[LookerConnectionDefinition], + reporter: LookMLSourceReport, + ) -> Optional[LookerViewFile]: + viewfile = self._load_viewfile( + project_name=project_name, + path=path, + reporter=reporter, + ) + if viewfile is None: + return None + + return replace(viewfile, connection=connection) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py new file mode 100644 index 0000000000000..35231d273fbba --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py @@ -0,0 +1,104 @@ +from functools import lru_cache +from typing import ClassVar, Optional, TextIO, cast + +from liquid import Environment +from liquid.ast import Node +from liquid.context import Context +from liquid.parse import expect, get_parser +from liquid.stream import TokenStream +from liquid.tag import Tag +from liquid.template import BoundTemplate +from liquid.token import TOKEN_EXPRESSION, TOKEN_LITERAL, TOKEN_TAG, Token + + +class CustomTagException(Exception): + def __init__(self, message): + super().__init__(message) + + +class ConditionNode(Node): + def __init__(self, tok: Token, sql_or_lookml_reference: str, filter_name: str): + self.tok = tok + + self.sql_or_lookml_reference = sql_or_lookml_reference + + self.filter_name = filter_name + + def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]: + filter_value: Optional[str] = cast( + str, context.globals.get(self.filter_name) + ) # to silent lint + + if filter_value is None: + raise CustomTagException( + f'filter {self.filter_name} value is not provided for "condition" tag' + ) + + filter_value = filter_value.strip() + + buffer.write(f"{self.sql_or_lookml_reference}='{filter_value}'") + + return True + + +# Define the custom tag +class ConditionTag(Tag): + """ + ConditionTag is the equivalent implementation of looker's custom liquid tag "condition". 
+ Refer doc: https://cloud.google.com/looker/docs/templated-filters#basic_usage + + Refer doc to see how to write liquid custom tag: https://jg-rp.github.io/liquid/guides/custom-tags + + This class render the below tag as order.region='ap-south-1' if order_region is provided in config.liquid_variables + as order_region: 'ap-south-1' + {% condition order_region %} order.region {% endcondition %} + + """ + + TAG_START: ClassVar[str] = "condition" + TAG_END: ClassVar[str] = "endcondition" + name: str = "condition" + + def __init__(self, env: Environment): + super().__init__(env) + self.parser = get_parser(self.env) + + def parse(self, stream: TokenStream) -> Node: + expect(stream, TOKEN_TAG, value=ConditionTag.TAG_START) + + start_token = stream.current + + stream.next_token() + expect(stream, TOKEN_EXPRESSION) + filter_name: str = stream.current.value.strip() + + stream.next_token() + expect(stream, TOKEN_LITERAL) + + sql_or_lookml_reference: str = stream.current.value.strip() + + stream.next_token() + expect(stream, TOKEN_TAG, value=ConditionTag.TAG_END) + + return ConditionNode( + tok=start_token, + sql_or_lookml_reference=sql_or_lookml_reference, + filter_name=filter_name, + ) + + +custom_tags = [ConditionTag] + + +@lru_cache(maxsize=1) +def _create_env() -> Environment: + env: Environment = Environment() + # register tag. One time activity + for custom_tag in custom_tags: + env.add_tag(custom_tag) + return env + + +def create_template(text: str) -> BoundTemplate: + env: Environment = _create_env() + return env.from_string(text) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 72d094c2cf942..d951a6dbe7a62 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -371,7 +371,8 @@ def _get_input_fields_from_query( if field is None: continue - # we haven't loaded in metadata about the explore yet, so we need to wait until explores are populated later to fetch this + # we haven't loaded in metadata about the explore yet, so we need to wait until explores are populated + # later to fetch this result.append( InputFieldElement( name=field, view_field=None, model=query.model, explore=query.view diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py new file mode 100644 index 0000000000000..919d9232a18c5 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -0,0 +1,115 @@ +import logging +import re +from typing import Any, ClassVar, Dict, Set + +from liquid import Undefined +from liquid.exceptions import LiquidSyntaxError + +from datahub.ingestion.source.looker.looker_liquid_tag import ( + CustomTagException, + create_template, +) +from datahub.ingestion.source.looker.str_functions import ( + remove_extra_spaces_and_newlines, +) + +logger = logging.getLogger(__name__) + + +class SpecialVariable: + SPECIAL_VARIABLE_PATTERN: ClassVar[ + str + ] = r"\b\w+(\.\w+)*\._(is_selected|in_query|is_filtered)\b" + liquid_variable: dict + + def __init__(self, liquid_variable): + self.liquid_variable = liquid_variable + + def _create_new_liquid_variables_with_default( + self, + variables: Set[str], + ) -> dict: + new_dict = {**self.liquid_variable} + + for variable in variables: + keys = variable.split( + "." 
+            )  # variable is defined as view._is_selected or view.field_name._is_selected
+
+            current_dict: dict = new_dict
+
+            for key in keys[:-1]:
+
+                if key not in current_dict:
+                    current_dict[key] = {}
+
+                current_dict = current_dict[key]
+
+            if keys[-1] not in current_dict:
+                current_dict[keys[-1]] = True
+
+        logger.debug("added special variables in liquid_variable dictionary")
+
+        return new_dict
+
+    def liquid_variable_with_default(self, text: str) -> dict:
+        variables: Set[str] = set(
+            [
+                text[m.start() : m.end()]
+                for m in re.finditer(SpecialVariable.SPECIAL_VARIABLE_PATTERN, text)
+            ]
+        )
+
+        # if the set is empty then no special variables were found
+        if not variables:
+            return self.liquid_variable
+
+        return self._create_new_liquid_variables_with_default(variables=variables)
+
+
+def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str:
+    # Render a variable as NULL if it is not present in the liquid_variable dictionary
+    Undefined.__str__ = lambda instance: "NULL"  # type: ignore
+    try:
+        # Check whether any special boolean variables (_in_query, _is_selected,
+        # _is_filtered) appear in the text, and update liquid_variable with their
+        # default values. Refer to the doc for more information:
+        # https://cloud.google.com/looker/docs/liquid-variable-reference#usage_of_in_query_is_selected_and_is_filtered
+        liquid_variable = SpecialVariable(liquid_variable).liquid_variable_with_default(
+            text
+        )
+        # Resolve the liquid template
+        return create_template(text).render(liquid_variable)
+    except LiquidSyntaxError as e:
+        logger.warning(f"Unsupported liquid template encountered. error [{e.message}]")
+        # TODO: Some tags are specific to Looker, and the python-liquid library does not
+        # understand them; currently we do not parse such liquid templates.
+        #
+        # See doc: https://cloud.google.com/looker/docs/templated-filters and look for
+        # {% condition region %} order.region {% endcondition %}
+    except CustomTagException as e:
+        logger.warning(e)
+        logger.debug(e, exc_info=e)
+
+    return text
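A usage sketch, not part of the patch, of resolve_liquid_variable with the condition tag registered in looker_liquid_tag.py; the table and variable names are invented. Per ConditionNode.render_to_output above, the tag collapses to an equality predicate, and any undefined {{ variable }} renders as NULL because Undefined.__str__ is patched.

sql = (
    "SELECT * FROM orders "
    "WHERE {% condition order_region %} orders.region {% endcondition %}"
)
assert (
    resolve_liquid_variable(sql, {"order_region": "ap-south-1"})
    == "SELECT * FROM orders WHERE orders.region='ap-south-1'"
)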
+
+
+def resolve_liquid_variable_in_view_dict(
+    raw_view: dict, liquid_variable: Dict[Any, Any]
+) -> None:
+    if "views" not in raw_view:
+        return
+
+    for view in raw_view["views"]:
+        if "sql_table_name" in view:
+            view["sql_table_name"] = resolve_liquid_variable(
+                text=remove_extra_spaces_and_newlines(view["sql_table_name"]),
+                liquid_variable=liquid_variable,
+            )
+
+        if "derived_table" in view and "sql" in view["derived_table"]:
+            # In sql we don't need to remove the extra spaces, as the SQL parser takes
+            # care of extra spaces and \n while generating the URN from sql
+            view["derived_table"]["sql"] = resolve_liquid_variable(
+                text=view["derived_table"]["sql"], liquid_variable=liquid_variable
+            )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py
new file mode 100644
index 0000000000000..aa45bb72d1f46
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_view_id_cache.py
@@ -0,0 +1,120 @@
+import logging
+from typing import Dict, List, Optional
+
+from datahub.ingestion.source.looker.looker_common import LookerViewId, ViewFieldValue
+from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
+from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader
+from datahub.ingestion.source.looker.lookml_config import (
+    _BASE_PROJECT_NAME,
+    NAME,
+    LookMLSourceReport,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def determine_view_file_path(base_folder_path: str, absolute_file_path: str) -> str:
+    splits: List[str] = absolute_file_path.split(base_folder_path, 1)
+    if len(splits) != 2:
+        logger.debug(
+            f"base_folder_path({base_folder_path}) and absolute_file_path({absolute_file_path}) do not match"
+        )
+        return ViewFieldValue.NOT_AVAILABLE.value
+
+    file_path: str = splits[1]
+    logger.debug(f"file_path={file_path}")
+
+    return file_path.strip(
+        "/"
+    )  # strip / from the path to make it equivalent to the source_file attribute of the LookerModelExplore API
+
+
+class LookerViewIdCache:
+    """
+    For view-to-view lineage we need to form the URN of an upstream view in advance, which
+    requires its LookerViewId object. A view that references another view through a derived
+    table can be looked up in this cache.
+
+    Example: Consider a view registration_monthly_phasing with the SQL below
+
+        SELECT *
+
+        FROM ${registration_daily_phasing.SQL_TABLE_NAME}
+
+        {% if date_sel._parameter_value == "'Weekly'"%}
+        WHERE DW_EFF_DT < DATEADD(DAY, (-DAYOFWEEK(current_date()) - 1),current_date())
+        {% endif %}
+
+    While generating MCPs for registration_monthly_phasing, the connector can look up the
+    view id of registration_daily_phasing in this cache to generate the lineage between
+    registration_monthly_phasing and registration_daily_phasing.
+
+    This cache can be used for many other use cases.
+ """ + + looker_model: LookerModel + looker_viewfile_loader: LookerViewFileLoader + project_name: str + model_name: str + reporter: LookMLSourceReport + looker_view_id_cache: Dict[ + str, LookerViewId + ] # Map of view-name as key, and LookerViewId instance as value + + def __init__( + self, + project_name: str, + model_name: str, + looker_model: LookerModel, + looker_viewfile_loader: LookerViewFileLoader, + reporter: LookMLSourceReport, + ): + self.project_name = project_name + self.model_name = model_name + self.looker_model = looker_model + self.looker_viewfile_loader = looker_viewfile_loader + self.looker_view_id_cache = {} + self.reporter = reporter + + def get_looker_view_id( + self, + view_name: str, + base_folder_path: str, + connection: Optional[LookerConnectionDefinition] = None, + ) -> Optional[LookerViewId]: + if view_name in self.looker_view_id_cache: + return self.looker_view_id_cache[view_name] + + for include in self.looker_model.resolved_includes: + included_looker_viewfile = self.looker_viewfile_loader.load_viewfile( + path=include.include, + project_name=include.project, + reporter=self.reporter, + connection=connection, + ) + + if included_looker_viewfile is None: + continue + + for view in included_looker_viewfile.views: + if view[NAME] == view_name: + file_path = determine_view_file_path( + base_folder_path, included_looker_viewfile.absolute_file_path + ) + + current_project_name: str = ( + include.project + if include.project != _BASE_PROJECT_NAME + else self.project_name + ) + + looker_view_id: LookerViewId = LookerViewId( + project_name=current_project_name, + model_name=self.model_name, + view_name=view_name, + file_path=file_path, + ) + + self.looker_view_id_cache[view_name] = looker_view_id + return looker_view_id + + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py new file mode 100644 index 0000000000000..e528e578dcf9f --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -0,0 +1,414 @@ +import itertools +import logging +import re +from typing import Any, Dict, List, Optional + +from datahub.ingestion.source.looker.looker_common import ( + ViewFieldValue, + find_view_from_resolved_includes, +) +from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition +from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile +from datahub.ingestion.source.looker.looker_file_loader import LookerViewFileLoader +from datahub.ingestion.source.looker.lookml_config import ( + DERIVED_VIEW_PATTERN, + DERIVED_VIEW_SUFFIX, + NAME, + LookMLSourceReport, +) +from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver + +logger = logging.getLogger(__name__) + + +class LookerFieldContext: + raw_field: Dict[Any, Any] + + def __init__(self, raw_field: Dict[Any, Any]): + self.raw_field = raw_field + + def name(self) -> str: + return self.raw_field[NAME] + + def sql(self) -> Optional[str]: + return self.raw_field.get("sql") + + def column_name_in_sql_attribute(self) -> List[str]: + if self.sql() is None: + # If no "sql" is specified, we assume this is referencing an upstream field + # with the same name. This commonly happens for extends and derived tables. 
+ return [self.name()] + + column_names: List[str] = [] + + sql: Optional[str] = self.sql() + + assert sql # to silent lint false positive + + for upstream_field_match in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql): + matched_field = upstream_field_match.group(1) + # Remove quotes from field names + matched_field = matched_field.replace('"', "").replace("`", "").lower() + column_names.append(matched_field) + + return column_names + + +class LookerViewContext: + """ + There are six patterns to associate the view's fields with dataset + + Pattern1: + view: view_name { + ... measure and dimension definition i.e. fields of a view + } + + In Pattern1 the fields' upstream dataset name is equivalent to view_name and this dataset should be present in + the connection. + + Pattern2: + view: view_name { + sql_table_name: dataset-name + + ... measure and dimension definition i.e. fields of a view + } + + In Pattern2 the fields' upstream dataset name is mentioned in "sql_table_name" attribute and this dataset + should be present in the connection. + + Pattern3: + view: view_name { + sql_table_name: ".SQL_TABLE_NAME" + + ... measure and dimension definition i.e. fields of a view + } + + In Pattern3 the fields' upstream is another view in same looker project. + + Pattern4: + view: view_name { + derived_table: + sql: + ... SQL select query + + ... measure and dimension definition i.e. fields of a view + } + + In Pattern4 the fields' upstream dataset is the output of sql mentioned in derived_table.sql. + + Pattern5: + view: view_name { + derived_table: + explore_source: + ... LookML native query + + ... measure and dimension definition i.e. fields of a view + } + + In Pattern5 the fields' upstream dataset is the output of LookML native query mentioned in + derived_table.explore_source. + + In all patterns the "sql_table_name" or "derived_table" field might present in parent view instead of current + view (see "extends" doc https://cloud.google.com/looker/docs/reference/param-view-extends) + + In all the patterns the common thing is field definition and fields are defined as + # Dimensions + dimension: id { + primary_key: yes + type: number + sql: ${TABLE}.id ;; + } + + # Measures + measure: total_revenue { + type: sum + sql: ${TABLE}.total_revenue ;; + } + + Here "sql" attribute is referring to column present in upstream dataset. + + This sql can be complex sql, see below example + + dimension: profit_in_dollars { type: number sql: ${TABLE}.revenue_in_dollars - ${TABLE}.cost_in_dollars ;; } + Here "profit_in_dollars" has two upstream columns from upstream dataset i.e. revenue_in_dollars and + cost_in_dollars. + + There is one special case of view definition, which is actually not useful but still a valid lookml definition. We + call it pattern 6. 
Refer below lookml + + view: customer_facts { + derived_table: { + sql: + SELECT + customer_id, + SUM(sale_price) AS lifetime_spend + FROM + order + WHERE + {% if order.region == "ap-south-1" %} + region = "AWS_AP_SOUTH_1" + {% else %} + region = "GCP_SOUTH_1" + {% endif %} + GROUP BY 1 + ;; + } + } + + The customer_facts view is not useful for looker as there is no field definition, but still such view appears in + connector test-cases, and it might be present on customer side + + For all possible options of "sql" attribute please refer looker doc: + https://cloud.google.com/looker/docs/reference/param-field-sql + + """ + + raw_view: Dict + view_file: LookerViewFile + view_connection: LookerConnectionDefinition + view_file_loader: LookerViewFileLoader + looker_refinement_resolver: LookerRefinementResolver + base_folder_path: str + reporter: LookMLSourceReport + + def __init__( + self, + raw_view: Dict, + view_file: LookerViewFile, + view_connection: LookerConnectionDefinition, + view_file_loader: LookerViewFileLoader, + looker_refinement_resolver: LookerRefinementResolver, + base_folder_path: str, + reporter: LookMLSourceReport, + ): + self.raw_view = raw_view + self.view_file = view_file + self.view_connection = view_connection + self.view_file_loader = view_file_loader + self.looker_refinement_resolver = looker_refinement_resolver + self.base_folder_path = base_folder_path + self.reporter = reporter + + def resolve_extends_view_name( + self, + target_view_name: str, + ) -> Optional[dict]: + # The view could live in the same file. + for raw_view in self.view_file.views: + raw_view_name = raw_view["name"] + if raw_view_name == target_view_name: + return self.looker_refinement_resolver.apply_view_refinement(raw_view) + + # Or, it could live in one of the imports. + view = find_view_from_resolved_includes( + connection=self.view_connection, + resolved_includes=self.view_file.resolved_includes, + looker_viewfile_loader=self.view_file_loader, + target_view_name=target_view_name, + reporter=self.reporter, + ) + + if view: + return self.looker_refinement_resolver.apply_view_refinement(view[1]) + else: + logger.warning( + f"failed to resolve view {target_view_name} included from {self.view_file.absolute_file_path}" + ) + return None + + def get_including_extends( + self, + field: str, + ) -> Optional[Any]: + extends = list( + itertools.chain.from_iterable( + self.raw_view.get("extends", self.raw_view.get("extends__all", [])) + ) + ) + + # First, check the current view. + if field in self.raw_view: + return self.raw_view[field] + + # The field might be defined in another view and this view is extending that view, + # so we resolve this field while taking that into account. + # following Looker's precedence rules. 
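+        # Later entries in `extends` take precedence, hence the reversed() iteration.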
+ for extend in reversed(extends):
+ assert extend != self.raw_view[NAME], "a view cannot extend itself"
+ extend_view = self.resolve_extends_view_name(
+ extend,
+ )
+ if not extend_view:
+ raise NameError(
+ f"failed to resolve extends view {extend} in view {self.raw_view[NAME]} of"
+ f" file {self.view_file.absolute_file_path}"
+ )
+ if field in extend_view:
+ return extend_view[field]
+
+ return None
+
+ def _get_sql_table_name_field(self) -> Optional[str]:
+ return self.get_including_extends(field="sql_table_name")
+
+ def _is_dot_sql_table_name_present(self) -> bool:
+ sql_table_name: Optional[str] = self._get_sql_table_name_field()
+
+ if sql_table_name is None:
+ return False
+
+ if DERIVED_VIEW_SUFFIX in sql_table_name.lower():
+ return True
+
+ return False
+
+ def sql_table_name(self) -> str:
+ sql_table_name: Optional[str] = self._get_sql_table_name_field()
+ # if the sql_table_name field is not set, then the table name is equal to the view name
+ if sql_table_name is None:
+ return self.raw_view[NAME].lower()
+
+ # sql_table_name is in the format "${view-name}.SQL_TABLE_NAME"
+ # remove the extra characters
+ if self._is_dot_sql_table_name_present():
+ sql_table_name = re.sub(DERIVED_VIEW_PATTERN, r"\1", sql_table_name)
+
+ # Some sql_table_name fields contain quotes like: optimizely."group", just remove the quotes
+ return sql_table_name.replace('"', "").replace("`", "").lower()
+
+ def derived_table(self) -> Dict[Any, Any]:
+ """
+ This function should only be called if is_sql_based_derived_case or is_native_derived_case returns true
+ """
+ derived_table = self.get_including_extends(field="derived_table")
+
+ assert derived_table, "derived_table should not be None"
+
+ return derived_table
+
+ def explore_source(self) -> Dict[Any, Any]:
+ """
+ This function should only be called if is_native_derived_case returns true
+ """
+ derived_table = self.derived_table()
+
+ assert derived_table.get("explore_source"), "explore_source should not be None"
+
+ return derived_table["explore_source"]
+
+ def sql(self, transformed: bool = True) -> str:
+ """
+ This function should only be called if is_sql_based_derived_case returns true
+ """
+ derived_table = self.derived_table()
+
+ # Looker supports sql fragments that omit the SELECT and FROM parts of the query.
+ # Add those in if we detect that they are missing.
+ sql_query: str = derived_table["sql"]
+
+ if transformed: # update the original sql attribute only if transformed is true
+ if not re.search(r"SELECT\s", sql_query, flags=re.I):
+ # add a SELECT clause at the beginning
+ sql_query = f"SELECT {sql_query}"
+
+ if not re.search(r"FROM\s", sql_query, flags=re.I):
+ # add a FROM clause at the end
+ sql_query = f"{sql_query} FROM {self.name()}"
+
+ # Drop ${ and }
+ sql_query = re.sub(DERIVED_VIEW_PATTERN, r"\1", sql_query)
+
+ return sql_query
+
+ def name(self) -> str:
+ return self.raw_view[NAME]
+
+ def view_file_name(self) -> str:
+ splits: List[str] = self.view_file.absolute_file_path.split(
+ self.base_folder_path, 1
+ )
+ if len(splits) != 2:
+ logger.debug(
+ f"base_folder_path({self.base_folder_path}) and absolute_file_path({self.view_file.absolute_file_path})"
+ f" are not matching"
+ )
+ return ViewFieldValue.NOT_AVAILABLE.value
+
+ file_name: str = splits[1]
+ logger.debug(f"file_path={file_name}")
+
+ return file_name.strip(
+ "/"
+ ) # strip / from the path to make it equivalent to the source_file attribute of the LookerModelExplore API
+
+ def _get_list_dict(self, attribute_name: str) -> List[Dict]:
+ ans: Optional[List[Dict]] = self.raw_view.get(attribute_name)
+ if ans is not None:
+ return ans
+ return []
+
+ def dimensions(self) -> List[Dict]:
+ return self._get_list_dict("dimensions")
+
+ def measures(self) -> List[Dict]:
+ return self._get_list_dict("measures")
+
+ def dimension_groups(self) -> List[Dict]:
+ return self._get_list_dict("dimension_groups")
+
+ def is_materialized_derived_view(self) -> bool:
+ for k in self.derived_table():
+ if k in ["datagroup_trigger", "sql_trigger_value", "persist_for"]:
+ return True
+
+ if "materialized_view" in self.derived_table():
+ return self.derived_table()["materialized_view"] == "yes"
+
+ return False
+
+ def is_regular_case(self) -> bool:
+ # The regular case covers patterns 1 and 2, where the upstream table is either the
+ # view name or the table name mentioned in the sql_table_name attribute
+ if (
+ self.is_sql_table_name_referring_to_view()
+ or self.is_sql_based_derived_case()
+ or self.is_native_derived_case()
+ ):
+ return False
+
+ return True
+
+ def is_sql_table_name_referring_to_view(self) -> bool:
+ # It is pattern 3
+ return self._is_dot_sql_table_name_present()
+
+ def is_sql_based_derived_case(self) -> bool:
+ # It is pattern 4
+ if "derived_table" in self.raw_view and "sql" in self.raw_view["derived_table"]:
+ return True
+
+ return False
+
+ def is_native_derived_case(self) -> bool:
+ # It is pattern 5
+ if (
+ "derived_table" in self.raw_view
+ and "explore_source" in self.raw_view["derived_table"]
+ ):
+ return True
+
+ return False
+
+ def is_sql_based_derived_view_without_fields_case(self) -> bool:
+ # It is pattern 6
+ fields: List[Dict] = []
+
+ fields.extend(self.dimensions())
+ fields.extend(self.measures())
+ fields.extend(self.dimension_groups())
+
+ if self.is_sql_based_derived_case() and len(fields) == 0:
+ return True
+
+ return False
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py
new file mode 100644
index 0000000000000..937c75b7eaf35
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py
@@ -0,0 +1,235 @@
+import logging
+from dataclasses import dataclass, field as dataclass_field
+from datetime import timedelta
+from typing import Any, Dict, List, Optional, Union
+
+import pydantic
+from pydantic import root_validator, validator
+from pydantic.fields import Field
+
+from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.git import GitInfo
+from datahub.configuration.source_common import EnvConfigMixin
+from datahub.configuration.validate_field_rename import pydantic_renamed_field
+from datahub.ingestion.source.looker.looker_config import LookerCommonConfig
+from datahub.ingestion.source.looker.looker_connection import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_lib_wrapper import (
+ LookerAPI,
+ LookerAPIConfig,
+ TransportOptionsConfig,
+)
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+ StaleEntityRemovalSourceReport,
+ StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+ StatefulIngestionConfigBase,
+)
+from datahub.utilities.lossy_collections import LossyList
+
+logger = logging.getLogger(__name__)
+
+NAME: str = "name"
+
+_BASE_PROJECT_NAME = "__BASE"
+
+_EXPLORE_FILE_EXTENSION = ".explore.lkml"
+
+_VIEW_FILE_EXTENSION = ".view.lkml"
+
+_MODEL_FILE_EXTENSION = ".model.lkml"
+
+VIEW_LANGUAGE_LOOKML: str = "lookml"
+
+VIEW_LANGUAGE_SQL: str = "sql"
+
+DERIVED_VIEW_SUFFIX = r".sql_table_name"
= r".sql_table_name" + +DERIVED_VIEW_PATTERN: str = r"\$\{([^}]*)\}" + + +@dataclass +class LookMLSourceReport(StaleEntityRemovalSourceReport): + git_clone_latency: Optional[timedelta] = None + models_discovered: int = 0 + models_dropped: List[str] = dataclass_field(default_factory=LossyList) + views_discovered: int = 0 + views_dropped: List[str] = dataclass_field(default_factory=LossyList) + views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList) + query_parse_attempts: int = 0 + query_parse_failures: int = 0 + query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList) + _looker_api: Optional[LookerAPI] = None + + def report_models_scanned(self) -> None: + self.models_discovered += 1 + + def report_views_scanned(self) -> None: + self.views_discovered += 1 + + def report_models_dropped(self, model: str) -> None: + self.models_dropped.append(model) + + def report_views_dropped(self, view: str) -> None: + self.views_dropped.append(view) + + def report_unreachable_view_dropped(self, view: str) -> None: + self.views_dropped_unreachable.append(view) + + def compute_stats(self) -> None: + if self._looker_api: + self.api_stats = self._looker_api.compute_stats() + return super().compute_stats() + + +class LookMLSourceConfig( + LookerCommonConfig, StatefulIngestionConfigBase, EnvConfigMixin +): + git_info: Optional[GitInfo] = Field( + None, + description="Reference to your git location. If present, supplies handy links to your lookml on the dataset " + "entity page.", + ) + _github_info_deprecated = pydantic_renamed_field("github_info", "git_info") + base_folder: Optional[pydantic.DirectoryPath] = Field( + None, + description="Required if not providing github configuration and deploy keys. A pointer to a local directory (" + "accessible to the ingestion system) where the root of the LookML repo has been checked out (" + "typically via a git clone). This is typically the root folder where the `*.model.lkml` and " + "`*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under " + "`/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to " + "`/Users/jdoe/workspace/my-lookml-repo`.", + ) + project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitInfo]] = Field( + {}, + description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. " + "Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should " + "have a corresponding entry here." + "If a deploy key is not provided, the ingestion system will use the same deploy key as the main project. ", + ) + connection_to_platform_map: Optional[Dict[str, LookerConnectionDefinition]] = Field( + None, + description="A mapping of [Looker connection names](" + "https://docs.looker.com/reference/model-params/connection-for-model) to DataHub platform, " + "database, and schema values.", + ) + model_pattern: AllowDenyPattern = Field( + AllowDenyPattern.allow_all(), + description="List of regex patterns for LookML models to include in the extraction.", + ) + view_pattern: AllowDenyPattern = Field( + AllowDenyPattern.allow_all(), + description="List of regex patterns for LookML views to include in the extraction.", + ) + parse_table_names_from_sql: bool = Field(False, description="See note below.") + sql_parser: str = Field( + "datahub.utilities.sql_parser.DefaultSQLParser", description="See note below." 
+ )
+ api: Optional[LookerAPIConfig]
+ project_name: Optional[str] = Field(
+ None,
+ description="Required if you don't specify the `api` section. The project name within which all the model "
+ "files live. See (https://docs.looker.com/data-modeling/getting-started/how-project-works) to "
+ "understand what the Looker project name should be. The simplest way to see your projects is to "
+ "click on `Develop` followed by `Manage LookML Projects` in the Looker application.",
+ )
+ transport_options: Optional[TransportOptionsConfig] = Field(
+ None,
+ description="Populates the [TransportOptions](https://github.com/looker-open-source/sdk-codegen/blob"
+ "/94d6047a0d52912ac082eb91616c1e7c379ab262/python/looker_sdk/rtl/transport.py#L70) struct for "
+ "the looker client",
+ )
+ max_file_snippet_length: int = Field(
+ 512000, # 512KB should be plenty
+ description="When extracting the view definition from a lookml file, the maximum number of characters to "
+ "extract.",
+ )
+ emit_reachable_views_only: bool = Field(
+ True,
+ description="When enabled, only views that are reachable from explores defined in the model files are emitted",
+ )
+ populate_sql_logic_for_missing_descriptions: bool = Field(
+ False,
+ description="When enabled, field descriptions will include the sql logic for computed fields if descriptions "
+ "are missing",
+ )
+ process_isolation_for_sql_parsing: bool = Field(
+ False,
+ description="When enabled, sql parsing will be executed in a separate process to prevent memory leaks.",
+ )
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
+ default=None, description=""
+ )
+ process_refinements: bool = Field(
+ False,
+ description="When enabled, Looker refinements will be processed to adapt existing views.",
+ )
+
+ liquid_variable: Dict[Any, Any] = Field(
+ {},
+ description="A dictionary containing Liquid variables and their corresponding values, utilized in SQL-defined "
+ "derived views. The Liquid template will be resolved in view.derived_table.sql and "
+ "view.sql_table_name. Defaults to an empty dictionary.",
+ )
+
+ @validator("connection_to_platform_map", pre=True)
+ def convert_string_to_connection_def(cls, conn_map):
+ # Previous versions of the config supported strings in the connection map. This upconverts strings to ConnectionMap
+ for key in conn_map:
+ if isinstance(conn_map[key], str):
+ platform = conn_map[key]
+ if "." in platform:
+ platform_db_split = conn_map[key].split(".")
+ connection = LookerConnectionDefinition(
+ platform=platform_db_split[0],
+ default_db=platform_db_split[1],
+ default_schema="",
+ )
+ conn_map[key] = connection
+ else:
+ logger.warning(
+ f"Connection map for {key} provides platform {platform} but does not provide a default "
+ f"database name. This might result in failed resolution"
+ )
+ conn_map[key] = LookerConnectionDefinition(
+ platform=platform, default_db="", default_schema=""
+ )
+ return conn_map
+
+ @root_validator(skip_on_failure=True)
+ def check_either_connection_map_or_connection_provided(cls, values):
+ """Validate that we must either have a connection map or an api credential"""
+ if not values.get("connection_to_platform_map", {}) and not values.get(
+ "api", {}
+ ):
+ raise ValueError(
+ "Neither api nor connection_to_platform_map config was found. LookML source requires either api "
+ "credentials for Looker or a map of connection names to platform identifiers to work correctly"
+ )
+ return values
+
+ @root_validator(skip_on_failure=True)
+ def check_either_project_name_or_api_provided(cls, values):
+ """Validate that we must either have a project name or an api credential to fetch project names"""
+ if not values.get("project_name") and not values.get("api"):
+ raise ValueError(
+ "Neither project_name nor an API credential was found. LookML source requires either api credentials "
+ "for Looker or a project_name to accurately name views and models."
+ )
+ return values
+
+ @validator("base_folder", always=True)
+ def check_base_folder_if_not_provided(
+ cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any]
+ ) -> Optional[pydantic.DirectoryPath]:
+ if v is None:
+ git_info: Optional[GitInfo] = values.get("git_info")
+ if git_info:
+ if not git_info.deploy_key:
+ logger.warning(
+ "git_info is provided, but no SSH key is present. If the repo is not public, we'll fail to "
+ "clone it."
+ )
+ else:
+ raise ValueError("Neither base_folder nor git_info has been provided.")
+ return v
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py
new file mode 100644
index 0000000000000..892ed79754a1c
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_refinement.py
@@ -0,0 +1,251 @@
+import copy
+import itertools
+import logging
+from typing import ClassVar, Dict, List, Set
+
+from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
+from datahub.ingestion.source.looker.looker_dataclasses import LookerModel
+from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewFileLoader
+from datahub.ingestion.source.looker.lookml_config import (
+ NAME,
+ LookMLSourceConfig,
+ LookMLSourceReport,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class LookerRefinementResolver:
+ """
+ Refinements are a way to "edit" an existing view or explore.
+ Refer: https://cloud.google.com/looker/docs/lookml-refinements
+
+ A refinement to an existing view/explore is only applied if the refinement is reachable from the include files
+ in a model.
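+ For illustration (the view and field names below are invented), a refinement re-opens an
+ existing view by prefixing its name with "+":
+
+ view: +flights {
+ dimension: air_carrier {
+ hidden: yes
+ }
+ }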
+ For the order in which refinements are applied, refer:
+ https://cloud.google.com/looker/docs/lookml-refinements#refinements_are_applied_in_order
+ """
+
+ REFINEMENT_PREFIX: ClassVar[str] = "+"
+ DIMENSIONS: ClassVar[str] = "dimensions"
+ MEASURES: ClassVar[str] = "measures"
+ DIMENSION_GROUPS: ClassVar[str] = "dimension_groups"
+ EXTENDS: ClassVar[str] = "extends"
+ EXTENDS_ALL: ClassVar[str] = "extends__all"
+
+ looker_model: LookerModel
+ looker_viewfile_loader: LookerViewFileLoader
+ connection_definition: LookerConnectionDefinition
+ source_config: LookMLSourceConfig
+ reporter: LookMLSourceReport
+ view_refinement_cache: Dict[
+ str, dict
+ ] # Map of view-name to the raw view dictionary after the refinement process has been applied
+ explore_refinement_cache: Dict[
+ str, dict
+ ] # Map of explore-name to the raw explore dictionary after the refinement process has been applied
+
+ def __init__(
+ self,
+ looker_model: LookerModel,
+ looker_viewfile_loader: LookerViewFileLoader,
+ connection_definition: LookerConnectionDefinition,
+ source_config: LookMLSourceConfig,
+ reporter: LookMLSourceReport,
+ ):
+ self.looker_model = looker_model
+ self.looker_viewfile_loader = looker_viewfile_loader
+ self.connection_definition = connection_definition
+ self.source_config = source_config
+ self.reporter = reporter
+ self.view_refinement_cache = {}
+ self.explore_refinement_cache = {}
+
+ @staticmethod
+ def is_refinement(view_name: str) -> bool:
+ return view_name.startswith(LookerRefinementResolver.REFINEMENT_PREFIX)
+
+ @staticmethod
+ def merge_column(
+ original_dict: dict, refinement_dict: dict, key: str
+ ) -> List[dict]:
+ """
+ Merge a dimension/measure/other column with one from a refinement.
+ This follows the process documented at https://help.looker.com/hc/en-us/articles/4419773929107-LookML-refinements
+ """
+ merge_column: List[dict] = []
+ original_value: List[dict] = original_dict.get(key, [])
+ refine_value: List[dict] = refinement_dict.get(key, [])
+ # name is a required field, so it will not be None
+ original_column_map = {column[NAME]: column for column in original_value}
+ refine_column_map = {column[NAME]: column for column in refine_value}
+ for existing_column_name in original_column_map:
+ existing_column = original_column_map[existing_column_name]
+ refine_column = refine_column_map.get(existing_column_name)
+ if refine_column is not None:
+ existing_column.update(refine_column)
+
+ merge_column.append(existing_column)
+
+ # merge any remaining columns from refine_column_map
+ for new_column_name in refine_column_map:
+ if new_column_name not in original_column_map:
+ merge_column.append(refine_column_map[new_column_name])
+
+ return merge_column
+
+ @staticmethod
+ def merge_and_set_column(
+ new_raw_view: dict, refinement_view: dict, key: str
+ ) -> None:
+ merged_column = LookerRefinementResolver.merge_column(
+ new_raw_view, refinement_view, key
+ )
+ if merged_column:
+ new_raw_view[key] = merged_column
+
+ @staticmethod
+ def merge_refinements(raw_view: dict, refinement_views: List[dict]) -> dict:
+ """
+ Iterate over refinement_views and merge the parameters of each view with raw_view.
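+ For example (dimension names invented for illustration), if raw_view defines the
+ dimension `id { type: number }` and a refinement defines `id { hidden: yes }`, the
+ merged dimension is `id { type: number hidden: yes }`.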
+ Detail of the merging order can be found at https://cloud.google.com/looker/docs/lookml-refinements
+ """
+ new_raw_view: dict = copy.deepcopy(raw_view)
+
+ for refinement_view in refinement_views:
+ # Merge dimension and measure
+ # TODO: low priority: handle additive parameters
+ # https://cloud.google.com/looker/docs/lookml-refinements#some_parameters_are_additive
+
+ # Merge Dimension
+ LookerRefinementResolver.merge_and_set_column(
+ new_raw_view, refinement_view, LookerRefinementResolver.DIMENSIONS
+ )
+ # Merge Measure
+ LookerRefinementResolver.merge_and_set_column(
+ new_raw_view, refinement_view, LookerRefinementResolver.MEASURES
+ )
+ # Merge Dimension Group
+ LookerRefinementResolver.merge_and_set_column(
+ new_raw_view, refinement_view, LookerRefinementResolver.DIMENSION_GROUPS
+ )
+
+ return new_raw_view
+
+ def get_refinements(self, views: List[dict], view_name: str) -> List[dict]:
+ """
+ The refinement syntax for views and explores is the same.
+ This function can be used to filter view/explore refinements out of a raw dictionary list.
+ """
+ view_refinement_name: str = self.REFINEMENT_PREFIX + view_name
+ refined_views: List[dict] = []
+
+ for raw_view in views:
+ if view_refinement_name == raw_view[NAME]:
+ refined_views.append(raw_view)
+
+ return refined_views
+
+ def get_refinement_from_model_includes(self, view_name: str) -> List[dict]:
+ refined_views: List[dict] = []
+
+ for include in self.looker_model.resolved_includes:
+ included_looker_viewfile = self.looker_viewfile_loader.load_viewfile(
+ include.include,
+ include.project,
+ self.connection_definition,
+ self.reporter,
+ )
+
+ if not included_looker_viewfile:
+ continue
+
+ refined_views.extend(
+ self.get_refinements(included_looker_viewfile.views, view_name)
+ )
+
+ return refined_views
+
+ def should_skip_processing(self, raw_view_name: str) -> bool:
+ if LookerRefinementResolver.is_refinement(raw_view_name):
+ return True
+
+ if self.source_config.process_refinements is False:
+ return True
+
+ return False
+
+ def apply_view_refinement(self, raw_view: dict) -> dict:
+ """
+ Looker processes the lkml files in include order and merges all refinements into the original view.
+ """
+ assert raw_view.get(NAME) is not None
+
+ raw_view_name: str = raw_view[NAME]
+
+ if self.should_skip_processing(raw_view_name):
+ return raw_view
+
+ if raw_view_name in self.view_refinement_cache:
+ logger.debug(f"Returning applied refined view {raw_view_name} from cache")
+ return self.view_refinement_cache[raw_view_name]
+
+ logger.debug(f"Processing refinement for view {raw_view_name}")
+
+ refinement_views: List[dict] = self.get_refinement_from_model_includes(
+ raw_view_name
+ )
+
+ self.view_refinement_cache[raw_view_name] = self.merge_refinements(
+ raw_view, refinement_views
+ )
+
+ return self.view_refinement_cache[raw_view_name]
+
+ @staticmethod
+ def add_extended_explore(
+ raw_explore: dict, refinement_explores: List[Dict]
+ ) -> None:
+ extended_explores: Set[str] = set()
+ for view in refinement_explores:
+ extends = list(
+ itertools.chain.from_iterable(
+ view.get(
+ LookerRefinementResolver.EXTENDS,
+ view.get(LookerRefinementResolver.EXTENDS_ALL, []),
+ )
+ )
+ )
+ extended_explores.update(extends)
+
+ if extended_explores: # if it is not empty, add it to the original explore
+ raw_explore[LookerRefinementResolver.EXTENDS] = list(extended_explores)
+
+ def apply_explore_refinement(self, raw_view: dict) -> dict:
+ """
+ In explore refinements, the `extends` parameter is additive.
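+ For example (explore names invented for illustration), if explore `orders` is refined
+ by `+orders { extends: [base_a] }` and by `+orders { extends: [base_b] }`, the resolved
+ explore ends up extending both base_a and base_b.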
+ Refer looker refinement document: https://cloud.google.com/looker/docs/lookml-refinements#additive + """ + assert raw_view.get(NAME) is not None + + raw_view_name: str = raw_view[NAME] + + if self.should_skip_processing(raw_view_name): + return raw_view + + if raw_view_name in self.explore_refinement_cache: + logger.debug( + f"Returning applied refined explore {raw_view_name} from cache" + ) + return self.explore_refinement_cache[raw_view_name] + + logger.debug(f"Processing refinement for explore {raw_view_name}") + + refinement_explore: List[dict] = self.get_refinements( + self.looker_model.explores, raw_view_name + ) + + self.add_extended_explore(raw_view, refinement_explore) + + self.explore_refinement_cache[raw_view_name] = raw_view + + return self.explore_refinement_cache[raw_view_name] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 7f8ae5ead81a7..6efb8d6fba2a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -1,39 +1,16 @@ -import copy -import glob -import itertools import logging import pathlib -import re import tempfile -from dataclasses import dataclass, field as dataclass_field, replace -from datetime import datetime, timedelta, timezone -from typing import ( - Any, - ClassVar, - Dict, - Iterable, - List, - Optional, - Set, - Tuple, - Type, - Union, -) +from collections import OrderedDict +from dataclasses import dataclass +from datetime import datetime, timezone +from typing import Dict, Iterable, List, Optional, Set, Tuple, Type import lkml import lkml.simple -import pydantic from looker_sdk.error import SDKError -from looker_sdk.sdk.api40.models import DBConnection -from pydantic import root_validator, validator -from pydantic.fields import Field -import datahub.emitter.mce_builder as builder -from datahub.configuration import ConfigModel -from datahub.configuration.common import AllowDenyPattern, ConfigurationError from datahub.configuration.git import GitInfo -from datahub.configuration.source_common import EnvConfigMixin -from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.emitter.mce_builder import make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import gen_containers @@ -56,28 +33,46 @@ from datahub.ingestion.source.looker.lkml_patched import load_lkml from datahub.ingestion.source.looker.looker_common import ( CORPUSER_DATAHUB, - LookerCommonConfig, LookerExplore, LookerUtil, LookerViewId, - ProjectInclude, ViewField, ViewFieldType, ViewFieldValue, + deduplicate_fields, gen_project_key, ) -from datahub.ingestion.source.looker.looker_lib_wrapper import ( - LookerAPI, - LookerAPIConfig, - TransportOptionsConfig, +from datahub.ingestion.source.looker.looker_connection import ( + get_connection_def_based_on_connection_string, +) +from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI +from datahub.ingestion.source.looker.looker_view_id_cache import ( + LookerModel, + LookerViewFileLoader, + LookerViewIdCache, +) +from datahub.ingestion.source.looker.lookml_concept_context import ( + LookerFieldContext, + LookerViewContext, +) +from datahub.ingestion.source.looker.lookml_config import ( + _BASE_PROJECT_NAME, + _MODEL_FILE_EXTENSION, + VIEW_LANGUAGE_LOOKML, + VIEW_LANGUAGE_SQL, + LookerConnectionDefinition, + 
LookMLSourceConfig, + LookMLSourceReport, +) +from datahub.ingestion.source.looker.lookml_refinement import LookerRefinementResolver +from datahub.ingestion.source.looker.view_upstream import ( + AbstractViewUpstream, + create_view_upstream, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, - StaleEntityRemovalSourceReport, - StatefulStaleMetadataRemovalConfig, ) from datahub.ingestion.source.state.stateful_ingestion_base import ( - StatefulIngestionConfigBase, StatefulIngestionSourceBase, ) from datahub.metadata.com.linkedin.pegasus2avro.common import BrowsePaths, Status @@ -100,914 +95,18 @@ FineGrainedLineageUpstreamTypeClass, SubTypesClass, ) -from datahub.utilities.lossy_collections import LossyList +from datahub.sql_parsing.sqlglot_lineage import ColumnRef from datahub.utilities.sql_parser import SQLParser logger = logging.getLogger(__name__) -_BASE_PROJECT_NAME = "__BASE" - -_EXPLORE_FILE_EXTENSION = ".explore.lkml" -_VIEW_FILE_EXTENSION = ".view.lkml" -_MODEL_FILE_EXTENSION = ".model.lkml" - - -def deduplicate_fields(fields: List[ViewField]) -> List[ViewField]: - # Remove duplicates filed from self.fields - # Logic is: If more than a field has same ViewField.name then keep only one filed where ViewField.field_type - # is DIMENSION_GROUP. - # Looker Constraint: - # - Any field declared as dimension or measure can be redefined as dimension_group. - # - Any field declared in dimension can't be redefined in measure and vice-versa. - - dimension_group_field_names: List[str] = [ - field.name - for field in fields - if field.field_type == ViewFieldType.DIMENSION_GROUP - ] - - new_fields: List[ViewField] = [] - - for field in fields: - if ( - field.name in dimension_group_field_names - and field.field_type != ViewFieldType.DIMENSION_GROUP - ): - continue - - new_fields.append(field) - - return new_fields - - -def _get_bigquery_definition( - looker_connection: DBConnection, -) -> Tuple[str, Optional[str], Optional[str]]: - platform = "bigquery" - # bigquery project ids are returned in the host field - db = looker_connection.host - schema = looker_connection.database - return (platform, db, schema) - - -def _get_generic_definition( - looker_connection: DBConnection, platform: Optional[str] = None -) -> Tuple[str, Optional[str], Optional[str]]: - if platform is None: - # We extract the platform from the dialect name - dialect_name = looker_connection.dialect_name - assert dialect_name is not None - # generally the first part of the dialect name before _ is the name of the platform - # versions are encoded as numbers and can be removed - # e.g. spark1 or hive2 or druid_18 - platform = re.sub(r"[0-9]+", "", dialect_name.split("_")[0]) - - assert ( - platform is not None - ), f"Failed to extract a valid platform from connection {looker_connection}" - db = looker_connection.database - schema = looker_connection.schema # ok for this to be None - return (platform, db, schema) - - -class LookerConnectionDefinition(ConfigModel): - platform: str - default_db: str - default_schema: Optional[str] # Optional since some sources are two-level only - platform_instance: Optional[str] = None - platform_env: Optional[str] = Field( - default=None, - description="The environment that the platform is located in. 
Leaving this empty will inherit defaults from the top level Looker configuration", - ) - - @validator("platform_env") - def platform_env_must_be_one_of(cls, v: Optional[str]) -> Optional[str]: - if v is not None: - return EnvConfigMixin.env_must_be_one_of(v) - return v - - @validator("platform", "default_db", "default_schema") - def lower_everything(cls, v): - """We lower case all strings passed in to avoid casing issues later""" - if v is not None: - return v.lower() - - @classmethod - def from_looker_connection( - cls, looker_connection: DBConnection - ) -> "LookerConnectionDefinition": - """Dialect definitions are here: https://docs.looker.com/setup-and-management/database-config""" - extractors: Dict[str, Any] = { - "^bigquery": _get_bigquery_definition, - ".*": _get_generic_definition, - } - - if looker_connection.dialect_name is None: - raise ConfigurationError( - f"Unable to fetch a fully filled out connection for {looker_connection.name}. Please check your API permissions." - ) - for extractor_pattern, extracting_function in extractors.items(): - if re.match(extractor_pattern, looker_connection.dialect_name): - (platform, db, schema) = extracting_function(looker_connection) - return cls(platform=platform, default_db=db, default_schema=schema) - raise ConfigurationError( - f"Could not find an appropriate platform for looker_connection: {looker_connection.name} with dialect: {looker_connection.dialect_name}" - ) - - -class LookMLSourceConfig( - LookerCommonConfig, StatefulIngestionConfigBase, EnvConfigMixin -): - git_info: Optional[GitInfo] = Field( - None, - description="Reference to your git location. If present, supplies handy links to your lookml on the dataset entity page.", - ) - _github_info_deprecated = pydantic_renamed_field("github_info", "git_info") - base_folder: Optional[pydantic.DirectoryPath] = Field( - None, - description="Required if not providing github configuration and deploy keys. A pointer to a local directory (accessible to the ingestion system) where the root of the LookML repo has been checked out (typically via a git clone). This is typically the root folder where the `*.model.lkml` and `*.view.lkml` files are stored. e.g. If you have checked out your LookML repo under `/Users/jdoe/workspace/my-lookml-repo`, then set `base_folder` to `/Users/jdoe/workspace/my-lookml-repo`.", - ) - project_dependencies: Dict[str, Union[pydantic.DirectoryPath, GitInfo]] = Field( - {}, - description="A map of project_name to local directory (accessible to the ingestion system) or Git credentials. " - "Every local_dependencies or private remote_dependency listed in the main project's manifest.lkml file should have a corresponding entry here. " - "If a deploy key is not provided, the ingestion system will use the same deploy key as the main project. 
", - ) - connection_to_platform_map: Optional[Dict[str, LookerConnectionDefinition]] = Field( - None, - description="A mapping of [Looker connection names](https://docs.looker.com/reference/model-params/connection-for-model) to DataHub platform, database, and schema values.", - ) - model_pattern: AllowDenyPattern = Field( - AllowDenyPattern.allow_all(), - description="List of regex patterns for LookML models to include in the extraction.", - ) - view_pattern: AllowDenyPattern = Field( - AllowDenyPattern.allow_all(), - description="List of regex patterns for LookML views to include in the extraction.", - ) - parse_table_names_from_sql: bool = Field(False, description="See note below.") - sql_parser: str = Field( - "datahub.utilities.sql_parser.DefaultSQLParser", description="See note below." - ) - api: Optional[LookerAPIConfig] - project_name: Optional[str] = Field( - None, - description="Required if you don't specify the `api` section. The project name within which all the model files live. See (https://docs.looker.com/data-modeling/getting-started/how-project-works) to understand what the Looker project name should be. The simplest way to see your projects is to click on `Develop` followed by `Manage LookML Projects` in the Looker application.", - ) - transport_options: Optional[TransportOptionsConfig] = Field( - None, - description="Populates the [TransportOptions](https://github.com/looker-open-source/sdk-codegen/blob/94d6047a0d52912ac082eb91616c1e7c379ab262/python/looker_sdk/rtl/transport.py#L70) struct for looker client", - ) - max_file_snippet_length: int = Field( - 512000, # 512KB should be plenty - description="When extracting the view definition from a lookml file, the maximum number of characters to extract.", - ) - emit_reachable_views_only: bool = Field( - True, - description="When enabled, only views that are reachable from explores defined in the model files are emitted", - ) - populate_sql_logic_for_missing_descriptions: bool = Field( - False, - description="When enabled, field descriptions will include the sql logic for computed fields if descriptions are missing", - ) - process_isolation_for_sql_parsing: bool = Field( - False, - description="When enabled, sql parsing will be executed in a separate process to prevent memory leaks.", - ) - stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field( - default=None, description="" - ) - process_refinements: bool = Field( - False, - description="When enabled, looker refinement will be processed to adapt an existing view.", - ) - - @validator("connection_to_platform_map", pre=True) - def convert_string_to_connection_def(cls, conn_map): - # Previous version of config supported strings in connection map. This upconverts strings to ConnectionMap - for key in conn_map: - if isinstance(conn_map[key], str): - platform = conn_map[key] - if "." in platform: - platform_db_split = conn_map[key].split(".") - connection = LookerConnectionDefinition( - platform=platform_db_split[0], - default_db=platform_db_split[1], - default_schema="", - ) - conn_map[key] = connection - else: - logger.warning( - f"Connection map for {key} provides platform {platform} but does not provide a default database name. 
This might result in failed resolution" - ) - conn_map[key] = LookerConnectionDefinition( - platform=platform, default_db="", default_schema="" - ) - return conn_map - - @root_validator(skip_on_failure=True) - def check_either_connection_map_or_connection_provided(cls, values): - """Validate that we must either have a connection map or an api credential""" - if not values.get("connection_to_platform_map", {}) and not values.get( - "api", {} - ): - raise ValueError( - "Neither api not connection_to_platform_map config was found. LookML source requires either api credentials for Looker or a map of connection names to platform identifiers to work correctly" - ) - return values - - @root_validator(skip_on_failure=True) - def check_either_project_name_or_api_provided(cls, values): - """Validate that we must either have a project name or an api credential to fetch project names""" - if not values.get("project_name") and not values.get("api"): - raise ValueError( - "Neither project_name not an API credential was found. LookML source requires either api credentials for Looker or a project_name to accurately name views and models." - ) - return values - - @validator("base_folder", always=True) - def check_base_folder_if_not_provided( - cls, v: Optional[pydantic.DirectoryPath], values: Dict[str, Any] - ) -> Optional[pydantic.DirectoryPath]: - if v is None: - git_info: Optional[GitInfo] = values.get("git_info") - if git_info: - if not git_info.deploy_key: - logger.warning( - "git_info is provided, but no SSH key is present. If the repo is not public, we'll fail to clone it." - ) - else: - raise ValueError("Neither base_folder nor git_info has been provided.") - return v - - -@dataclass -class LookMLSourceReport(StaleEntityRemovalSourceReport): - git_clone_latency: Optional[timedelta] = None - models_discovered: int = 0 - models_dropped: List[str] = dataclass_field(default_factory=LossyList) - views_discovered: int = 0 - views_dropped: List[str] = dataclass_field(default_factory=LossyList) - views_dropped_unreachable: List[str] = dataclass_field(default_factory=LossyList) - query_parse_attempts: int = 0 - query_parse_failures: int = 0 - query_parse_failure_views: List[str] = dataclass_field(default_factory=LossyList) - _looker_api: Optional[LookerAPI] = None - - def report_models_scanned(self) -> None: - self.models_discovered += 1 - - def report_views_scanned(self) -> None: - self.views_discovered += 1 - - def report_models_dropped(self, model: str) -> None: - self.models_dropped.append(model) - - def report_views_dropped(self, view: str) -> None: - self.views_dropped.append(view) - - def report_unreachable_view_dropped(self, view: str) -> None: - self.views_dropped_unreachable.append(view) - - def compute_stats(self) -> None: - if self._looker_api: - self.api_stats = self._looker_api.compute_stats() - return super().compute_stats() - - -@dataclass -class LookerModel: - connection: str - includes: List[str] - explores: List[dict] - resolved_includes: List[ProjectInclude] - - @staticmethod - def from_looker_dict( - looker_model_dict: dict, - base_project_name: str, - root_project_name: Optional[str], - base_projects_folders: Dict[str, pathlib.Path], - path: str, - reporter: LookMLSourceReport, - ) -> "LookerModel": - logger.debug(f"Loading model from {path}") - connection = looker_model_dict["connection"] - includes = looker_model_dict.get("includes", []) - resolved_includes = LookerModel.resolve_includes( - includes, - base_project_name, - root_project_name, - base_projects_folders, - path, - 
reporter, - seen_so_far=set(), - traversal_path=pathlib.Path(path).stem, - ) - logger.debug(f"{path} has resolved_includes: {resolved_includes}") - explores = looker_model_dict.get("explores", []) - - explore_files = [ - x.include - for x in resolved_includes - if x.include.endswith(_EXPLORE_FILE_EXTENSION) - ] - for included_file in explore_files: - try: - parsed = load_lkml(included_file) - included_explores = parsed.get("explores", []) - explores.extend(included_explores) - except Exception as e: - reporter.report_warning( - title="Error Loading Include File", - message="Failed to load included file", - context=f"Include Details: {included_file}", - exc=e, - ) - # continue in this case, as it might be better to load and resolve whatever we can - - return LookerModel( - connection=connection, - includes=includes, - resolved_includes=resolved_includes, - explores=explores, - ) - - @staticmethod - def resolve_includes( - includes: List[str], - project_name: str, - root_project_name: Optional[str], - base_projects_folder: Dict[str, pathlib.Path], - path: str, - reporter: LookMLSourceReport, - seen_so_far: Set[str], - traversal_path: str = "", # a cosmetic parameter to aid debugging - ) -> List[ProjectInclude]: - """Resolve ``include`` statements in LookML model files to a list of ``.lkml`` files. - - For rules on how LookML ``include`` statements are written, see - https://docs.looker.com/data-modeling/getting-started/ide-folders#wildcard_examples - """ - - resolved = [] - for inc in includes: - # Filter out dashboards - we get those through the looker source. - if ( - inc.endswith(".dashboard") - or inc.endswith(".dashboard.lookml") - or inc.endswith(".dashboard.lkml") - ): - logger.debug(f"include '{inc}' is a dashboard, skipping it") - continue - - resolved_project_name = project_name - resolved_project_folder = str(base_projects_folder[project_name]) - - # Massage the looker include into a valid glob wildcard expression - if inc.startswith("//"): - # remote include, let's see if we have the project checked out locally - (remote_project, project_local_path) = inc[2:].split("/", maxsplit=1) - if remote_project in base_projects_folder: - resolved_project_folder = str(base_projects_folder[remote_project]) - glob_expr = f"{resolved_project_folder}/{project_local_path}" - resolved_project_name = remote_project - else: - logger.warning( - f"Resolving {inc} failed. Could not find a locally checked out reference for {remote_project}" - ) - continue - elif inc.startswith("/"): - glob_expr = f"{resolved_project_folder}{inc}" - - # The include path is sometimes '/{project_name}/{path_within_project}' - # instead of '//{project_name}/{path_within_project}' or '/{path_within_project}'. - # - # TODO: I can't seem to find any documentation on this pattern, but we definitely - # have seen it in the wild. Example from Mozilla's public looker-hub repo: - # https://github.com/mozilla/looker-hub/blob/f491ca51ce1add87c338e6723fd49bc6ae4015ca/fenix/explores/activation.explore.lkml#L7 - # As such, we try to handle it but are as defensive as possible. - - non_base_project_name = project_name - if project_name == _BASE_PROJECT_NAME and root_project_name is not None: - non_base_project_name = root_project_name - if non_base_project_name != _BASE_PROJECT_NAME and inc.startswith( - f"/{non_base_project_name}/" - ): - # This might be a local include. Let's make sure that '/{project_name}' doesn't - # exist as normal include in the project. 
- if not pathlib.Path( - f"{resolved_project_folder}/{non_base_project_name}" - ).exists(): - path_within_project = pathlib.Path(*pathlib.Path(inc).parts[2:]) - glob_expr = f"{resolved_project_folder}/{path_within_project}" - else: - # Need to handle a relative path. - glob_expr = str(pathlib.Path(path).parent / inc) - # "**" matches an arbitrary number of directories in LookML - # we also resolve these paths to absolute paths so we can de-dup effectively later on - included_files = [ - str(p.resolve()) - for p in [ - pathlib.Path(p) - for p in sorted( - glob.glob(glob_expr, recursive=True) - + glob.glob(f"{glob_expr}.lkml", recursive=True) - ) - ] - # We don't want to match directories. The '**' glob can be used to - # recurse into directories. - if p.is_file() - ] - logger.debug( - f"traversal_path={traversal_path}, included_files = {included_files}, seen_so_far: {seen_so_far}" - ) - if "*" not in inc and not included_files: - reporter.report_failure( - title="Error Resolving Include", - message=f"Cannot resolve include {inc}", - context=f"Path: {path}", - ) - elif not included_files: - reporter.report_failure( - title="Error Resolving Include", - message=f"Did not resolve anything for wildcard include {inc}", - context=f"Path: {path}", - ) - # only load files that we haven't seen so far - included_files = [x for x in included_files if x not in seen_so_far] - for included_file in included_files: - # Filter out dashboards - we get those through the looker source. - if ( - included_file.endswith(".dashboard") - or included_file.endswith(".dashboard.lookml") - or included_file.endswith(".dashboard.lkml") - ): - logger.debug( - f"include '{included_file}' is a dashboard, skipping it" - ) - continue - - logger.debug( - f"Will be loading {included_file}, traversed here via {traversal_path}" - ) - try: - parsed = load_lkml(included_file) - seen_so_far.add(included_file) - if "includes" in parsed: # we have more includes to resolve! - resolved.extend( - LookerModel.resolve_includes( - parsed["includes"], - resolved_project_name, - root_project_name, - base_projects_folder, - included_file, - reporter, - seen_so_far, - traversal_path=traversal_path - + "." 
- + pathlib.Path(included_file).stem, - ) - ) - except Exception as e: - reporter.report_warning( - title="Error Loading Include", - message="Failed to load include file", - context=f"Include Details: {included_file}", - exc=e, - ) - # continue in this case, as it might be better to load and resolve whatever we can - - resolved.extend( - [ - ProjectInclude(project=resolved_project_name, include=f) - for f in included_files - ] - ) - return resolved - - -@dataclass -class LookerViewFile: - absolute_file_path: str - connection: Optional[LookerConnectionDefinition] - includes: List[str] - resolved_includes: List[ProjectInclude] - views: List[Dict] - raw_file_content: str - - @classmethod - def from_looker_dict( - cls, - absolute_file_path: str, - looker_view_file_dict: dict, - project_name: str, - root_project_name: Optional[str], - base_projects_folder: Dict[str, pathlib.Path], - raw_file_content: str, - reporter: LookMLSourceReport, - ) -> "LookerViewFile": - logger.debug(f"Loading view file at {absolute_file_path}") - includes = looker_view_file_dict.get("includes", []) - resolved_path = str(pathlib.Path(absolute_file_path).resolve()) - seen_so_far = set() - seen_so_far.add(resolved_path) - resolved_includes = LookerModel.resolve_includes( - includes, - project_name, - root_project_name, - base_projects_folder, - absolute_file_path, - reporter, - seen_so_far=seen_so_far, - ) - logger.debug( - f"resolved_includes for {absolute_file_path} is {resolved_includes}" - ) - views = looker_view_file_dict.get("views", []) - - return cls( - absolute_file_path=absolute_file_path, - connection=None, - includes=includes, - resolved_includes=resolved_includes, - views=views, - raw_file_content=raw_file_content, - ) - - -@dataclass -class SQLInfo: - table_names: List[str] - column_names: List[str] - - -class LookerViewFileLoader: - """ - Loads the looker viewfile at a :path and caches the LookerViewFile in memory - This is to avoid reloading the same file off of disk many times during the recursive include resolution process - """ - - def __init__( - self, - root_project_name: Optional[str], - base_projects_folder: Dict[str, pathlib.Path], - reporter: LookMLSourceReport, - ) -> None: - self.viewfile_cache: Dict[str, LookerViewFile] = {} - self._root_project_name = root_project_name - self._base_projects_folder = base_projects_folder - self.reporter = reporter - - def is_view_seen(self, path: str) -> bool: - return path in self.viewfile_cache - - def _load_viewfile( - self, project_name: str, path: str, reporter: LookMLSourceReport - ) -> Optional[LookerViewFile]: - # always fully resolve paths to simplify de-dup - path = str(pathlib.Path(path).resolve()) - allowed_extensions = [_VIEW_FILE_EXTENSION, _EXPLORE_FILE_EXTENSION] - matched_any_extension = [ - match for match in [path.endswith(x) for x in allowed_extensions] if match - ] - if not matched_any_extension: - # not a view file - logger.debug( - f"Skipping file {path} because it doesn't appear to be a view file. 
Matched extensions {allowed_extensions}" - ) - return None - - if self.is_view_seen(str(path)): - return self.viewfile_cache[path] - - try: - with open(path) as file: - raw_file_content = file.read() - except Exception as e: - self.reporter.report_failure( - message="Failed to read view file", - context=f"Path: {path}", - exc=e, - ) - return None - try: - logger.debug(f"Loading viewfile {path}") - parsed = load_lkml(path) - looker_viewfile = LookerViewFile.from_looker_dict( - absolute_file_path=path, - looker_view_file_dict=parsed, - project_name=project_name, - root_project_name=self._root_project_name, - base_projects_folder=self._base_projects_folder, - raw_file_content=raw_file_content, - reporter=reporter, - ) - logger.debug(f"adding viewfile for path {path} to the cache") - self.viewfile_cache[path] = looker_viewfile - return looker_viewfile - except Exception as e: - self.reporter.report_failure( - message="Failed to parse view file", - context=f"Path: {path}", - exc=e, - ) - return None - - def load_viewfile( - self, - path: str, - project_name: str, - connection: Optional[LookerConnectionDefinition], - reporter: LookMLSourceReport, - ) -> Optional[LookerViewFile]: - viewfile = self._load_viewfile( - project_name=project_name, path=path, reporter=reporter - ) - if viewfile is None: - return None - - return replace(viewfile, connection=connection) - - -class LookerRefinementResolver: - """ - Refinements are a way to "edit" an existing view or explore. - Refer: https://cloud.google.com/looker/docs/lookml-refinements - - A refinement to an existing view/explore is only applied if it's refinement is reachable from include files in a model. - For refinement applied order please refer: https://cloud.google.com/looker/docs/lookml-refinements#refinements_are_applied_in_order - """ - - REFINEMENT_PREFIX: ClassVar[str] = "+" - DIMENSIONS: ClassVar[str] = "dimensions" - MEASURES: ClassVar[str] = "measures" - DIMENSION_GROUPS: ClassVar[str] = "dimension_groups" - NAME: ClassVar[str] = "name" - EXTENDS: ClassVar[str] = "extends" - EXTENDS_ALL: ClassVar[str] = "extends__all" - - looker_model: LookerModel - looker_viewfile_loader: LookerViewFileLoader - connection_definition: LookerConnectionDefinition - source_config: LookMLSourceConfig - reporter: LookMLSourceReport - view_refinement_cache: Dict[ - str, dict - ] # Map of view-name as key, and it is raw view dictionary after applying refinement process - explore_refinement_cache: Dict[ - str, dict - ] # Map of explore-name as key, and it is raw view dictionary after applying refinement process - - def __init__( - self, - looker_model: LookerModel, - looker_viewfile_loader: LookerViewFileLoader, - connection_definition: LookerConnectionDefinition, - source_config: LookMLSourceConfig, - reporter: LookMLSourceReport, - ): - self.looker_model = looker_model - self.looker_viewfile_loader = looker_viewfile_loader - self.connection_definition = connection_definition - self.source_config = source_config - self.reporter = reporter - self.view_refinement_cache = {} - self.explore_refinement_cache = {} - - @staticmethod - def is_refinement(view_name: str) -> bool: - return view_name.startswith(LookerRefinementResolver.REFINEMENT_PREFIX) - - @staticmethod - def merge_column( - original_dict: dict, refinement_dict: dict, key: str - ) -> List[dict]: - """ - Merge a dimension/measure/other column with one from a refinement. 
- This follows the process documented at https://help.looker.com/hc/en-us/articles/4419773929107-LookML-refinements - """ - merge_column: List[dict] = [] - original_value: List[dict] = original_dict.get(key, []) - refine_value: List[dict] = refinement_dict.get(key, []) - # name is required field, not going to be None - original_column_map = { - column[LookerRefinementResolver.NAME]: column for column in original_value - } - refine_column_map = { - column[LookerRefinementResolver.NAME]: column for column in refine_value - } - for existing_column_name in original_column_map: - existing_column = original_column_map[existing_column_name] - refine_column = refine_column_map.get(existing_column_name) - if refine_column is not None: - existing_column.update(refine_column) - - merge_column.append(existing_column) - - # merge any remaining column from refine_column_map - for new_column_name in refine_column_map: - if new_column_name not in original_column_map: - merge_column.append(refine_column_map[new_column_name]) - - return merge_column - - @staticmethod - def merge_and_set_column( - new_raw_view: dict, refinement_view: dict, key: str - ) -> None: - merged_column = LookerRefinementResolver.merge_column( - new_raw_view, refinement_view, key - ) - if merged_column: - new_raw_view[key] = merged_column - - @staticmethod - def merge_refinements(raw_view: dict, refinement_views: List[dict]) -> dict: - """ - Iterate over refinement_views and merge parameter of each view with raw_view. - Detail of merging order can be found at https://cloud.google.com/looker/docs/lookml-refinements - """ - new_raw_view: dict = copy.deepcopy(raw_view) - - for refinement_view in refinement_views: - # Merge dimension and measure - # TODO: low priority: handle additive parameters - # https://cloud.google.com/looker/docs/lookml-refinements#some_parameters_are_additive - - # Merge Dimension - LookerRefinementResolver.merge_and_set_column( - new_raw_view, refinement_view, LookerRefinementResolver.DIMENSIONS - ) - # Merge Measure - LookerRefinementResolver.merge_and_set_column( - new_raw_view, refinement_view, LookerRefinementResolver.MEASURES - ) - # Merge Dimension Group - LookerRefinementResolver.merge_and_set_column( - new_raw_view, refinement_view, LookerRefinementResolver.DIMENSION_GROUPS - ) - - return new_raw_view - - def get_refinements(self, views: List[dict], view_name: str) -> List[dict]: - """ - Refinement syntax for view and explore are same. 
- This function can be used to filter out view/explore refinement from raw dictionary list - """ - view_refinement_name: str = self.REFINEMENT_PREFIX + view_name - refined_views: List[dict] = [] - - for raw_view in views: - if view_refinement_name == raw_view[LookerRefinementResolver.NAME]: - refined_views.append(raw_view) - - return refined_views - - def get_refinement_from_model_includes(self, view_name: str) -> List[dict]: - refined_views: List[dict] = [] - - for include in self.looker_model.resolved_includes: - included_looker_viewfile = self.looker_viewfile_loader.load_viewfile( - include.include, - include.project, - self.connection_definition, - self.reporter, - ) - - if not included_looker_viewfile: - continue - - refined_views.extend( - self.get_refinements(included_looker_viewfile.views, view_name) - ) - - return refined_views - - def should_skip_processing(self, raw_view_name: str) -> bool: - if LookerRefinementResolver.is_refinement(raw_view_name): - return True - - if self.source_config.process_refinements is False: - return True - - return False - - def apply_view_refinement(self, raw_view: dict) -> dict: - """ - Looker process the lkml file in include order and merge the all refinement to original view. - """ - assert raw_view.get(LookerRefinementResolver.NAME) is not None - - raw_view_name: str = raw_view[LookerRefinementResolver.NAME] - - if self.should_skip_processing(raw_view_name): - return raw_view - - if raw_view_name in self.view_refinement_cache: - logger.debug(f"Returning applied refined view {raw_view_name} from cache") - return self.view_refinement_cache[raw_view_name] - - logger.debug(f"Processing refinement for view {raw_view_name}") - - refinement_views: List[dict] = self.get_refinement_from_model_includes( - raw_view_name - ) - - self.view_refinement_cache[raw_view_name] = self.merge_refinements( - raw_view, refinement_views - ) - - return self.view_refinement_cache[raw_view_name] - - @staticmethod - def add_extended_explore( - raw_explore: dict, refinement_explores: List[Dict] - ) -> None: - extended_explores: Set[str] = set() - for view in refinement_explores: - extends = list( - itertools.chain.from_iterable( - view.get( - LookerRefinementResolver.EXTENDS, - view.get(LookerRefinementResolver.EXTENDS_ALL, []), - ) - ) - ) - extended_explores.update(extends) - - if extended_explores: # if it is not empty then add to the original view - raw_explore[LookerRefinementResolver.EXTENDS] = list(extended_explores) - - def apply_explore_refinement(self, raw_view: dict) -> dict: - """ - In explore refinement `extends` parameter is additive. 
- Refer looker refinement document: https://cloud.google.com/looker/docs/lookml-refinements#additive - """ - assert raw_view.get(LookerRefinementResolver.NAME) is not None - - raw_view_name: str = raw_view[LookerRefinementResolver.NAME] - - if self.should_skip_processing(raw_view_name): - return raw_view - - if raw_view_name in self.explore_refinement_cache: - logger.debug( - f"Returning applied refined explore {raw_view_name} from cache" - ) - return self.explore_refinement_cache[raw_view_name] - - logger.debug(f"Processing refinement for explore {raw_view_name}") - - refinement_explore: List[dict] = self.get_refinements( - self.looker_model.explores, raw_view_name - ) - - self.add_extended_explore(raw_view, refinement_explore) - - self.explore_refinement_cache[raw_view_name] = raw_view - - return self.explore_refinement_cache[raw_view_name] - - -VIEW_LANGUAGE_LOOKML: str = "lookml" -VIEW_LANGUAGE_SQL: str = "sql" - - -def _find_view_from_resolved_includes( - connection: Optional[LookerConnectionDefinition], - resolved_includes: List[ProjectInclude], - looker_viewfile_loader: LookerViewFileLoader, - target_view_name: str, - reporter: LookMLSourceReport, -) -> Optional[Tuple[ProjectInclude, dict]]: - # It could live in one of the included files. We do not know which file the base view - # lives in, so we try them all! - for include in resolved_includes: - included_looker_viewfile = looker_viewfile_loader.load_viewfile( - include.include, include.project, connection, reporter - ) - if not included_looker_viewfile: - continue - for raw_view in included_looker_viewfile.views: - raw_view_name = raw_view["name"] - # Make sure to skip loading view we are currently trying to resolve - if raw_view_name == target_view_name: - return include, raw_view - - return None - - -_SQL_FUNCTIONS = ["UNNEST"] - @dataclass class LookerView: id: LookerViewId absolute_file_path: str connection: LookerConnectionDefinition - sql_table_names: List[str] - upstream_explores: List[str] + upstream_dataset_urns: List[str] fields: List[ViewField] raw_file_content: str view_details: Optional[ViewProperties] = None @@ -1021,100 +120,6 @@ def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]: raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}") return parser_cls - @classmethod - def _get_sql_info( - cls, sql: str, sql_parser_path: str, use_external_process: bool = True - ) -> SQLInfo: - parser_cls = cls._import_sql_parser_cls(sql_parser_path) - - try: - parser_instance: SQLParser = parser_cls( - sql, use_external_process=use_external_process - ) - except Exception as e: - logger.warning(f"Sql parser failed on {sql} with {e}") - return SQLInfo(table_names=[], column_names=[]) - - sql_table_names: List[str] - try: - sql_table_names = parser_instance.get_tables() - except Exception as e: - logger.warning(f"Sql parser failed on {sql} with {e}") - sql_table_names = [] - - try: - column_names: List[str] = parser_instance.get_columns() - except Exception as e: - logger.warning(f"Sql parser failed on {sql} with {e}") - column_names = [] - - logger.debug(f"Column names parsed = {column_names}") - # Drop table names with # in them - sql_table_names = [t for t in sql_table_names if "#" not in t] - - # Remove quotes from table names - sql_table_names = [t.replace('"', "") for t in sql_table_names] - sql_table_names = [t.replace("`", "") for t in sql_table_names] - # Remove reserved words from table names - sql_table_names = [ - t for t in sql_table_names if t.upper() not in _SQL_FUNCTIONS - ] - - 
return SQLInfo(table_names=sql_table_names, column_names=column_names) - - @classmethod - def _get_fields( - cls, - field_list: List[Dict], - type_cls: ViewFieldType, - extract_column_level_lineage: bool, - populate_sql_logic_in_descriptions: bool, - ) -> List[ViewField]: - fields = [] - for field_dict in field_list: - is_primary_key = field_dict.get("primary_key", "no") == "yes" - name = field_dict["name"] - native_type = field_dict.get("type", "string") - default_description = ( - f"sql:{field_dict['sql']}" - if "sql" in field_dict and populate_sql_logic_in_descriptions - else "" - ) - - description = field_dict.get("description", default_description) - label = field_dict.get("label", "") - upstream_fields = [] - if extract_column_level_lineage: - if field_dict.get("sql") is not None: - for upstream_field_match in re.finditer( - r"\${TABLE}\.[\"]*([\.\w]+)", field_dict["sql"] - ): - matched_field = upstream_field_match.group(1) - # Remove quotes from field names - matched_field = ( - matched_field.replace('"', "").replace("`", "").lower() - ) - upstream_fields.append(matched_field) - else: - # If no SQL is specified, we assume this is referencing an upstream field - # with the same name. This commonly happens for extends and derived tables. - upstream_fields.append(name) - - upstream_fields = sorted(list(set(upstream_fields))) - - field = ViewField( - name=name, - type=native_type, - label=label, - description=description, - is_primary_key=is_primary_key, - field_type=type_cls, - upstream_fields=upstream_fields, - tags=field_dict.get("tags") or [], - ) - fields.append(field) - return fields - @classmethod def determine_view_file_path( cls, base_folder_path: str, absolute_file_path: str @@ -1137,354 +142,114 @@ def determine_view_file_path( def from_looker_dict( cls, project_name: str, - base_folder_path: str, model_name: str, - looker_view: dict, - connection: LookerConnectionDefinition, - looker_viewfile: LookerViewFile, - looker_viewfile_loader: LookerViewFileLoader, - looker_refinement_resolver: LookerRefinementResolver, + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, reporter: LookMLSourceReport, max_file_snippet_length: int, - parse_table_names_from_sql: bool = False, - sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser", + config: LookMLSourceConfig, + ctx: PipelineContext, extract_col_level_lineage: bool = False, populate_sql_logic_in_descriptions: bool = False, - process_isolation_for_sql_parsing: bool = False, ) -> Optional["LookerView"]: - view_name = looker_view["name"] + + view_name = view_context.name() + logger.debug(f"Handling view {view_name} in model {model_name}") - # The sql_table_name might be defined in another view and this view is extending that view, - # so we resolve this field while taking that into account. 
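The column-level lineage extraction removed above hinged on Looker's `${TABLE}` substitution operator: every `${TABLE}.column` reference in a field's `sql` attribute was treated as an upstream column. A runnable illustration of that exact regex (the sample SQL is illustrative):

import re

sql = '${TABLE}."CREATEDON"'
upstream_cols = [
    # strip quotes and lowercase, as the removed helper did
    m.group(1).replace('"', "").replace("`", "").lower()
    for m in re.finditer(r"\${TABLE}\.[\"]*([\.\w]+)", sql)
]
assert upstream_cols == ["createdon"]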
- sql_table_name: Optional[str] = LookerView.get_including_extends( + + looker_view_id: LookerViewId = LookerViewId( + project_name=project_name, + model_name=model_name, view_name=view_name, - looker_view=looker_view, - connection=connection, - looker_viewfile=looker_viewfile, - looker_viewfile_loader=looker_viewfile_loader, - looker_refinement_resolver=looker_refinement_resolver, - field="sql_table_name", - reporter=reporter, + file_path=view_context.view_file_name(), ) - # Some sql_table_name fields contain quotes like: optimizely."group", just remove the quotes - sql_table_name = ( - sql_table_name.replace('"', "").replace("`", "") - if sql_table_name is not None - else None - ) - derived_table = LookerView.get_including_extends( - view_name=view_name, - looker_view=looker_view, - connection=connection, - looker_viewfile=looker_viewfile, - looker_viewfile_loader=looker_viewfile_loader, - looker_refinement_resolver=looker_refinement_resolver, - field="derived_table", + view_upstream: AbstractViewUpstream = create_view_upstream( + view_context=view_context, + looker_view_id_cache=looker_view_id_cache, + config=config, + ctx=ctx, reporter=reporter, ) - dimensions = cls._get_fields( - looker_view.get("dimensions", []), - ViewFieldType.DIMENSION, - extract_col_level_lineage, - populate_sql_logic_in_descriptions=populate_sql_logic_in_descriptions, - ) - dimension_groups = cls._get_fields( - looker_view.get("dimension_groups", []), - ViewFieldType.DIMENSION_GROUP, - extract_col_level_lineage, - populate_sql_logic_in_descriptions=populate_sql_logic_in_descriptions, - ) - measures = cls._get_fields( - looker_view.get("measures", []), - ViewFieldType.MEASURE, - extract_col_level_lineage, - populate_sql_logic_in_descriptions=populate_sql_logic_in_descriptions, - ) - fields: List[ViewField] = dimensions + dimension_groups + measures + field_type_vs_raw_fields = OrderedDict( + { + ViewFieldType.DIMENSION: view_context.dimensions(), + ViewFieldType.DIMENSION_GROUP: view_context.dimension_groups(), + ViewFieldType.MEASURE: view_context.measures(), + } + ) # in order to maintain order in golden file - fields = deduplicate_fields(fields) + view_fields: List[ViewField] = [] - # Prep "default" values for the view, which will be overridden by the logic below. - view_logic = looker_viewfile.raw_file_content[:max_file_snippet_length] - sql_table_names: List[str] = [] - upstream_explores: List[str] = [] - - if derived_table is not None: - # Derived tables can either be a SQL query or a LookML explore. - # See https://cloud.google.com/looker/docs/derived-tables. - - if "sql" in derived_table: - view_logic = derived_table["sql"] - view_lang = VIEW_LANGUAGE_SQL - - # Parse SQL to extract dependencies. - if parse_table_names_from_sql: - ( - fields, - sql_table_names, - ) = cls._extract_metadata_from_derived_table_sql( - reporter, - sql_parser_path, - view_name, - sql_table_name, - view_logic, - fields, - use_external_process=process_isolation_for_sql_parsing, + for field_type, fields in field_type_vs_raw_fields.items(): + for field in fields: + upstream_column_ref: List[ColumnRef] = [] + if extract_col_level_lineage: + upstream_column_ref = view_upstream.get_upstream_column_ref( + field_context=LookerFieldContext(raw_field=field) ) - elif "explore_source" in derived_table: - # This is called a "native derived table". - # See https://cloud.google.com/looker/docs/creating-ndts. - explore_source = derived_table["explore_source"] - - # We want this to render the full lkml block - # e.g. 
explore_source: source_name { ... } - # As such, we use the full derived_table instead of the explore_source. - view_logic = str(lkml.dump(derived_table))[:max_file_snippet_length] - view_lang = VIEW_LANGUAGE_LOOKML - - ( - fields, - upstream_explores, - ) = cls._extract_metadata_from_derived_table_explore( - reporter, view_name, explore_source, fields + view_fields.append( + ViewField.view_fields_from_dict( + field_dict=field, + upstream_column_ref=upstream_column_ref, + type_cls=field_type, + populate_sql_logic_in_descriptions=populate_sql_logic_in_descriptions, + ) ) - materialized = False - for k in derived_table: - if k in ["datagroup_trigger", "sql_trigger_value", "persist_for"]: - materialized = True - if "materialized_view" in derived_table: - materialized = derived_table["materialized_view"] == "yes" + # special case where view is defined as derived sql, however fields are not defined + if ( + len(view_fields) == 0 + and view_context.is_sql_based_derived_view_without_fields_case() + ): + view_fields = view_upstream.create_fields() + + view_fields = deduplicate_fields(view_fields) + + # Prep "default" values for the view, which will be overridden by the logic below. + view_logic = view_context.view_file.raw_file_content[:max_file_snippet_length] + + if view_context.is_sql_based_derived_case(): + view_logic = view_context.sql(transformed=False) + # Parse SQL to extract dependencies. + view_details = ViewProperties( + materialized=False, + viewLogic=view_logic, + viewLanguage=VIEW_LANGUAGE_SQL, + ) + elif view_context.is_native_derived_case(): + # We want this to render the full lkml block + # e.g. explore_source: source_name { ... } + # As such, we use the full derived_table instead of the explore_source. + view_logic = str(lkml.dump(view_context.derived_table()))[ + :max_file_snippet_length + ] + view_lang = VIEW_LANGUAGE_LOOKML + + materialized = view_context.is_materialized_derived_view() view_details = ViewProperties( materialized=materialized, viewLogic=view_logic, viewLanguage=view_lang ) else: - # If not a derived table, then this view essentially wraps an existing - # object in the database. If sql_table_name is set, there is a single - # dependency in the view, on the sql_table_name. 
- # Otherwise, default to the view name as per the docs: - # https://docs.looker.com/reference/view-params/sql_table_name-for-view - sql_table_names = ( - [view_name] if sql_table_name is None else [sql_table_name] - ) view_details = ViewProperties( materialized=False, viewLogic=view_logic, viewLanguage=VIEW_LANGUAGE_LOOKML, ) - file_path = LookerView.determine_view_file_path( - base_folder_path, looker_viewfile.absolute_file_path - ) - return LookerView( - id=LookerViewId( - project_name=project_name, - model_name=model_name, - view_name=view_name, - file_path=file_path, - ), - absolute_file_path=looker_viewfile.absolute_file_path, - connection=connection, - sql_table_names=sql_table_names, - upstream_explores=upstream_explores, - fields=fields, - raw_file_content=looker_viewfile.raw_file_content, + id=looker_view_id, + absolute_file_path=view_context.view_file.absolute_file_path, + connection=view_context.view_connection, + upstream_dataset_urns=view_upstream.get_upstream_dataset_urn(), + fields=view_fields, + raw_file_content=view_context.view_file.raw_file_content, view_details=view_details, ) - @classmethod - def _extract_metadata_from_derived_table_sql( - cls, - reporter: LookMLSourceReport, - sql_parser_path: str, - view_name: str, - sql_table_name: Optional[str], - sql_query: str, - fields: List[ViewField], - use_external_process: bool, - ) -> Tuple[List[ViewField], List[str]]: - sql_table_names: List[str] = [] - - logger.debug(f"Parsing sql from derived table section of view: {view_name}") - reporter.query_parse_attempts += 1 - - # Skip queries that contain liquid variables. We currently don't parse them correctly. - # Docs: https://cloud.google.com/looker/docs/liquid-variable-reference. - # TODO: also support ${EXTENDS} and ${TABLE} - if "{%" in sql_query: - try: - # test if parsing works - sql_info: SQLInfo = cls._get_sql_info( - sql_query, sql_parser_path, use_external_process - ) - if not sql_info.table_names: - raise Exception("Failed to find any tables") - except Exception: - logger.debug( - f"{view_name}: SQL Parsing didn't return any tables, trying a hail-mary" - ) - # A hail-mary simple parse. 
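The block above is the old behaviour this PR retires: derived-table SQL containing liquid tags was either skipped or fed to a regex "hail-mary". The new template-language module instead renders the liquid out of the SQL before parsing, using the python-liquid dependency added in setup.py. A minimal sketch of that idea using plain variable substitution; Looker-specific tags such as `{% condition %}` need the PR's custom tag handling, which is omitted here, and all values are illustrative:

from liquid import Template

sql = "SELECT customer_id FROM order WHERE order.region = '{{ region }}'"
rendered = Template(sql).render(region="ap-south-1")
assert rendered == "SELECT customer_id FROM order WHERE order.region = 'ap-south-1'"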
- for maybe_table_match in re.finditer( - r"FROM\s*([a-zA-Z0-9_.`]+)", sql_query - ): - if maybe_table_match.group(1) not in sql_table_names: - sql_table_names.append(maybe_table_match.group(1)) - return fields, sql_table_names - - # Looker supports sql fragments that omit the SELECT and FROM parts of the query - # Add those in if we detect that it is missing - if not re.search(r"SELECT\s", sql_query, flags=re.I): - # add a SELECT clause at the beginning - sql_query = f"SELECT {sql_query}" - if not re.search(r"FROM\s", sql_query, flags=re.I): - # add a FROM clause at the end - sql_query = f"{sql_query} FROM {sql_table_name if sql_table_name is not None else view_name}" - # Get the list of tables in the query - try: - sql_info = cls._get_sql_info( - sql_query, sql_parser_path, use_external_process - ) - sql_table_names = sql_info.table_names - column_names = sql_info.column_names - - if not fields: - # it seems like the view is defined purely as sql, let's try using the column names to populate the schema - fields = [ - # set types to unknown for now as our sql parser doesn't give us column types yet - ViewField(c, "", "unknown", "", ViewFieldType.UNKNOWN) - for c in sorted(column_names) - ] - # remove fields or sql tables that contain liquid variables - fields = [f for f in fields if "{%" not in f.name] - - if not sql_info.table_names: - reporter.query_parse_failures += 1 - reporter.query_parse_failure_views.append(view_name) - except Exception as e: - reporter.query_parse_failures += 1 - reporter.report_warning( - title="Error Parsing SQL", - message="Failed to parse sql query, lineage will not be accurate.", - context=f"Table Name: {sql_table_name}, Query: {sql_query}", - exc=e, - ) - - sql_table_names = [table for table in sql_table_names if "{%" not in table] - - return fields, sql_table_names - - @classmethod - def _extract_metadata_from_derived_table_explore( - cls, - reporter: LookMLSourceReport, - view_name: str, - explore_source: dict, - fields: List[ViewField], - ) -> Tuple[List[ViewField], List[str]]: - logger.debug( - f"Parsing explore_source from derived table section of view: {view_name}" - ) - - upstream_explores = [explore_source["name"]] - - explore_columns = explore_source.get("columns", []) - # TODO: We currently don't support column-level lineage for derived_column. - # In order to support it, we'd need to parse the `sql` field of the derived_column. - - # The fields in the view are actually references to the fields in the explore. - # As such, we need to perform an extra mapping step to update - # the upstream column names. - for field in fields: - for i, upstream_field in enumerate(field.upstream_fields): - # Find the matching column in the explore. - for explore_column in explore_columns: - if explore_column["name"] == upstream_field: - field.upstream_fields[i] = explore_column.get( - "field", explore_column["name"] - ) - break - - return fields, upstream_explores - - @classmethod - def resolve_extends_view_name( - cls, - connection: LookerConnectionDefinition, - looker_viewfile: LookerViewFile, - looker_viewfile_loader: LookerViewFileLoader, - looker_refinement_resolver: LookerRefinementResolver, - target_view_name: str, - reporter: LookMLSourceReport, - ) -> Optional[dict]: - # The view could live in the same file. - for raw_view in looker_viewfile.views: - raw_view_name = raw_view["name"] - if raw_view_name == target_view_name: - return looker_refinement_resolver.apply_view_refinement(raw_view) - - # Or, it could live in one of the imports. 
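Looker permits derived-table SQL fragments that omit the SELECT and/or FROM clauses; the logic above re-adds them before parsing, falling back to the view name when no sql_table_name is set. A self-contained sketch of that completion step (the table name is an illustrative stand-in):

import re

def complete_fragment(sql_query: str, fallback_table: str) -> str:
    # re-add SELECT if the fragment omits it
    if not re.search(r"SELECT\s", sql_query, flags=re.I):
        sql_query = f"SELECT {sql_query}"
    # re-add FROM, pointing at the resolved table or the view name
    if not re.search(r"FROM\s", sql_query, flags=re.I):
        sql_query = f"{sql_query} FROM {fallback_table}"
    return sql_query

assert (
    complete_fragment("customer_id, sale_price", "order")
    == "SELECT customer_id, sale_price FROM order"
)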
- view = _find_view_from_resolved_includes( - connection, - looker_viewfile.resolved_includes, - looker_viewfile_loader, - target_view_name, - reporter, - ) - if view: - return looker_refinement_resolver.apply_view_refinement(view[1]) - else: - logger.warning( - f"failed to resolve view {target_view_name} included from {looker_viewfile.absolute_file_path}" - ) - return None - - @classmethod - def get_including_extends( - cls, - view_name: str, - looker_view: dict, - connection: LookerConnectionDefinition, - looker_viewfile: LookerViewFile, - looker_viewfile_loader: LookerViewFileLoader, - looker_refinement_resolver: LookerRefinementResolver, - field: str, - reporter: LookMLSourceReport, - ) -> Optional[Any]: - extends = list( - itertools.chain.from_iterable( - looker_view.get("extends", looker_view.get("extends__all", [])) - ) - ) - - # First, check the current view. - if field in looker_view: - return looker_view[field] - - # Then, check the views this extends, following Looker's precedence rules. - for extend in reversed(extends): - assert extend != view_name, "a view cannot extend itself" - extend_view = LookerView.resolve_extends_view_name( - connection, - looker_viewfile, - looker_viewfile_loader, - looker_refinement_resolver, - extend, - reporter, - ) - if not extend_view: - raise NameError( - f"failed to resolve extends view {extend} in view {view_name} of file {looker_viewfile.absolute_file_path}" - ) - if field in extend_view: - return extend_view[field] - - return None - @dataclass class LookerRemoteDependency: @@ -1539,6 +304,7 @@ class LookMLSource(StatefulIngestionSourceBase): def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) self.source_config = config + self.ctx = ctx self.reporter = LookMLSourceReport() # To keep track of projects (containers) which have already been ingested @@ -1549,10 +315,11 @@ def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): self.reporter._looker_api = self.looker_client try: self.looker_client.all_connections() - except SDKError: + except SDKError as err: raise ValueError( - "Failed to retrieve connections from looker client. Please check to ensure that you have manage_models permission enabled on this API key." - ) + "Failed to retrieve connections from looker client. Please check to ensure that you have " + "manage_models permission enabled on this API key." + ) from err def _load_model(self, path: str) -> LookerModel: logger.debug(f"Loading model from file {path}") @@ -1567,145 +334,14 @@ def _load_model(self, path: str) -> LookerModel: ) return looker_model - def _platform_names_have_2_parts(self, platform: str) -> bool: - return platform in {"hive", "mysql", "athena"} - - def _generate_fully_qualified_name( - self, sql_table_name: str, connection_def: LookerConnectionDefinition - ) -> str: - """Returns a fully qualified dataset name, resolved through a connection definition. 
- Input sql_table_name can be in three forms: table, db.table, db.schema.table""" - # TODO: This function should be extracted out into a Platform specific naming class since name translations are required across all connectors - - # Bigquery has "project.db.table" which can be mapped to db.schema.table form - # All other relational db's follow "db.schema.table" - # With the exception of mysql, hive, athena which are "db.table" - - # first detect which one we have - parts = len(sql_table_name.split(".")) - - if parts == 3: - # fully qualified, but if platform is of 2-part, we drop the first level - if self._platform_names_have_2_parts(connection_def.platform): - sql_table_name = ".".join(sql_table_name.split(".")[1:]) - return sql_table_name.lower() - - if parts == 1: - # Bare table form - if self._platform_names_have_2_parts(connection_def.platform): - dataset_name = f"{connection_def.default_db}.{sql_table_name}" - else: - dataset_name = f"{connection_def.default_db}.{connection_def.default_schema}.{sql_table_name}" - return dataset_name.lower() - - if parts == 2: - # if this is a 2 part platform, we are fine - if self._platform_names_have_2_parts(connection_def.platform): - return sql_table_name.lower() - # otherwise we attach the default top-level container - dataset_name = f"{connection_def.default_db}.{sql_table_name}" - return dataset_name.lower() - - self.reporter.report_warning( - title="Malformed Table Name", - message="Table name has more than 3 parts.", - context=f"Table Name: {sql_table_name}", - ) - return sql_table_name.lower() - - def _construct_datalineage_urn( - self, sql_table_name: str, looker_view: LookerView - ) -> str: - logger.debug(f"sql_table_name={sql_table_name}") - connection_def: LookerConnectionDefinition = looker_view.connection - - # Check if table name matches cascading derived tables pattern - # derived tables can be referred to using aliases that look like table_name.SQL_TABLE_NAME - # See https://docs.looker.com/data-modeling/learning-lookml/derived-tables#syntax_for_referencing_a_derived_table - if re.fullmatch(r"\w+\.SQL_TABLE_NAME", sql_table_name, flags=re.I): - sql_table_name = sql_table_name.lower().split(".")[0] - # upstream dataset is a looker view based on current view id's project and model - view_id = LookerViewId( - project_name=looker_view.id.project_name, - model_name=looker_view.id.model_name, - view_name=sql_table_name, - file_path=looker_view.id.file_path, - ) - return view_id.get_urn(self.source_config) - - # Ensure sql_table_name is in canonical form (add in db, schema names) - sql_table_name = self._generate_fully_qualified_name( - sql_table_name, connection_def - ) - - return builder.make_dataset_urn_with_platform_instance( - platform=connection_def.platform, - name=sql_table_name.lower(), - platform_instance=connection_def.platform_instance, - env=connection_def.platform_env or self.source_config.env, - ) - - def _get_connection_def_based_on_connection_string( - self, connection: str - ) -> Optional[LookerConnectionDefinition]: - if self.source_config.connection_to_platform_map is None: - self.source_config.connection_to_platform_map = {} - assert self.source_config.connection_to_platform_map is not None - if connection in self.source_config.connection_to_platform_map: - return self.source_config.connection_to_platform_map[connection] - elif self.looker_client: - try: - looker_connection: DBConnection = self.looker_client.connection( - connection - ) - except SDKError: - logger.error( - f"Failed to retrieve connection {connection} 
from Looker. This usually happens when the credentials provided are not admin credentials." - ) - else: - try: - connection_def: LookerConnectionDefinition = ( - LookerConnectionDefinition.from_looker_connection( - looker_connection - ) - ) - - # Populate the cache (using the config map) to avoid calling looker again for this connection - self.source_config.connection_to_platform_map[ - connection - ] = connection_def - return connection_def - except ConfigurationError: - self.reporter.report_warning( - title="Failed to Resolve Connection", - message="Failed to resolve connection from Looker", - context=f"Connection: {connection}", - ) - - return None - def _get_upstream_lineage( self, looker_view: LookerView ) -> Optional[UpstreamLineage]: - # Merge dataset upstreams with sql table upstreams. - upstream_dataset_urns = [] - for upstream_explore in looker_view.upstream_explores: - # We're creating a "LookerExplore" just to use the urn generator. - upstream_dataset_urn = LookerExplore( - name=upstream_explore, model_name=looker_view.id.model_name - ).get_explore_urn(self.source_config) - upstream_dataset_urns.append(upstream_dataset_urn) - for sql_table_name in looker_view.sql_table_names: - sql_table_name = sql_table_name.replace('"', "").replace("`", "") - upstream_dataset_urn = self._construct_datalineage_urn( - sql_table_name, looker_view - ) - upstream_dataset_urns.append(upstream_dataset_urn) + upstream_dataset_urns = looker_view.upstream_dataset_urns # Generate the upstream + fine grained lineage objects. upstreams = [] observed_lineage_ts = datetime.now(tz=timezone.utc) - fine_grained_lineages: List[FineGrainedLineageClass] = [] for upstream_dataset_urn in upstream_dataset_urns: upstream = UpstreamClass( dataset=upstream_dataset_urn, @@ -1717,32 +353,27 @@ def _get_upstream_lineage( ) upstreams.append(upstream) - if self.source_config.extract_column_level_lineage and ( - looker_view.view_details is not None - and looker_view.view_details.viewLanguage - != VIEW_LANGUAGE_SQL # we currently only map col-level lineage for views without sql - ): - for field in looker_view.fields: - if field.upstream_fields: - fine_grained_lineage = FineGrainedLineageClass( - upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, - upstreams=[ - make_schema_field_urn( - upstream_dataset_urn, upstream_field - ) - for upstream_field in field.upstream_fields - ], - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - make_schema_field_urn( - looker_view.id.get_urn(self.source_config), - field.name, - ) - ], + fine_grained_lineages: List[FineGrainedLineageClass] = [] + + for field in looker_view.fields: + fine_grained_lineages.append( + FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + upstreams=[ + make_schema_field_urn(cll_ref.table, cll_ref.column) + for cll_ref in field.upstream_fields + ], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[ + make_schema_field_urn( + looker_view.id.get_urn(self.source_config), + field.name, ) - fine_grained_lineages.append(fine_grained_lineage) + ], + ) + ) - if upstreams != []: + if upstreams: return UpstreamLineage( upstreams=upstreams, fineGrainedLineages=fine_grained_lineages or None ) @@ -1801,7 +432,6 @@ def _build_dataset_mcps( ) events = [subTypeEvent] if looker_view.view_details is not None: - viewEvent = MetadataChangeProposalWrapper( entityUrn=view_urn, aspect=looker_view.view_details, @@ -1870,7 +500,8 @@ def get_project_name(self, model_name: str) -> str: return 
model.project_name except SDKError: raise ValueError( - f"Could not locate a project name for model {model_name}. Consider configuring a static project name in your config file" + f"Could not locate a project name for model {model_name}. Consider configuring a static project name " + f"in your config file" ) def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManifest]: @@ -2050,6 +681,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 self.source_config.project_name, self.base_projects_folder, self.reporter, + self.source_config.liquid_variable, ) # Some views can be mentioned by multiple 'include' statements and can be included via different connections. @@ -2091,11 +723,14 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 continue assert model.connection is not None - connectionDefinition = self._get_connection_def_based_on_connection_string( - model.connection + connection_definition = get_connection_def_based_on_connection_string( + connection=model.connection, + looker_client=self.looker_client, + source_config=self.source_config, + reporter=self.reporter, ) - if connectionDefinition is None: + if connection_definition is None: self.reporter.report_warning( title="Failed to Load Connection", message="Failed to load connection. Check your API key permissions and/or connection_to_platform_map configuration.", @@ -2108,12 +743,13 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 looker_refinement_resolver: LookerRefinementResolver = ( LookerRefinementResolver( looker_model=model, - connection_definition=connectionDefinition, + connection_definition=connection_definition, looker_viewfile_loader=viewfile_loader, source_config=self.source_config, reporter=self.reporter, ) ) + if self.source_config.emit_reachable_views_only: model_explores_map = {d["name"]: d for d in model.explores} for explore_dict in model.explores: @@ -2152,6 +788,14 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 project_name = self.get_project_name(model_name) + looker_view_id_cache: LookerViewIdCache = LookerViewIdCache( + project_name=project_name, + model_name=model_name, + looker_model=model, + looker_viewfile_loader=viewfile_loader, + reporter=self.reporter, + ) + logger.debug(f"Model: {model_name}; Includes: {model.resolved_includes}") for include in model.resolved_includes: @@ -2163,7 +807,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 looker_viewfile = viewfile_loader.load_viewfile( path=include.include, project_name=include.project, - connection=connectionDefinition, + connection=connection_definition, reporter=self.reporter, ) @@ -2195,7 +839,8 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 else project_name ) - # if project is base project then it is available as self.base_projects_folder[_BASE_PROJECT_NAME] + # if project is base project then it is available as self.base_projects_folder[ + # _BASE_PROJECT_NAME] base_folder_path: str = str( self.base_projects_folder.get( current_project_name, @@ -2203,22 +848,27 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 ) ) + view_context: LookerViewContext = LookerViewContext( + raw_view=raw_view, + view_file=looker_viewfile, + view_connection=connection_definition, + view_file_loader=viewfile_loader, + looker_refinement_resolver=looker_refinement_resolver, + base_folder_path=base_folder_path, + reporter=self.reporter, + ) + 
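The `view_context` built above is handed to `LookerView.from_looker_dict` just below; from there, `_get_upstream_lineage` now derives fine-grained lineage directly from each field's `ColumnRef` upstreams instead of re-deriving them from raw names. A small demonstration of the urn conversion involved; the imports are datahub's real helpers, while the urn values are illustrative:

from datahub.emitter.mce_builder import make_schema_field_urn
from datahub.sql_parsing.sqlglot_lineage import ColumnRef

ref = ColumnRef(
    table="urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD)",
    column="country",
)
# a ColumnRef maps 1:1 onto a schemaField urn for fineGrainedLineages
assert make_schema_field_urn(ref.table, ref.column) == (
    "urn:li:schemaField:"
    "(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),country)"
)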
maybe_looker_view = LookerView.from_looker_dict( project_name=current_project_name, - base_folder_path=base_folder_path, model_name=model_name, - looker_view=raw_view, - connection=connectionDefinition, - looker_viewfile=looker_viewfile, - looker_viewfile_loader=viewfile_loader, - looker_refinement_resolver=looker_refinement_resolver, + view_context=view_context, + looker_view_id_cache=looker_view_id_cache, reporter=self.reporter, max_file_snippet_length=self.source_config.max_file_snippet_length, - parse_table_names_from_sql=self.source_config.parse_table_names_from_sql, - sql_parser_path=self.source_config.sql_parser, extract_col_level_lineage=self.source_config.extract_column_level_lineage, populate_sql_logic_in_descriptions=self.source_config.populate_sql_logic_for_missing_descriptions, - process_isolation_for_sql_parsing=self.source_config.process_isolation_for_sql_parsing, + config=self.source_config, + ctx=self.ctx, ) except Exception as e: self.reporter.report_warning( @@ -2227,6 +877,9 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 context=f"View Details: {raw_view}", exc=e, ) + + logger.debug(e, exc_info=e) + continue if maybe_looker_view: @@ -2277,7 +930,8 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 prev_model_connection, ) = view_connection_mapping if prev_model_connection != model.connection: - # this view has previously been discovered and emitted using a different connection + # this view has previously been discovered and emitted using a different + # connection logger.warning( f"view {maybe_looker_view.id.view_name} from model {model_name}, connection {model.connection} was previously processed via model {prev_model_name}, connection {prev_model_connection} and will likely lead to incorrect lineage to the underlying tables" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/str_functions.py b/metadata-ingestion/src/datahub/ingestion/source/looker/str_functions.py new file mode 100644 index 0000000000000..5426d2b8ab952 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/str_functions.py @@ -0,0 +1,23 @@ +""" +Here write down functions which are operating on string. Like replacing some character and so on +""" +import re + + +def remove_suffix(original: str, suffix: str) -> str: + # This can be removed in favour of original.removesuffix for python>3.8 + if original.endswith(suffix): + return original[: -len(suffix)] + return original + + +def remove_extra_spaces_and_newlines(original: str) -> str: + """ + python-liquid library is not removing extra spaces and new lines from template and hence spaces and newlines + are appearing in urn. This function can be used to remove such characters from urn or text. 
+ """ + return re.sub(r"\s*\n\s*", "", original) + + +def replace_quotes(value: str) -> str: + return value.replace('"', "").replace("`", "") diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/urn_functions.py b/metadata-ingestion/src/datahub/ingestion/source/looker/urn_functions.py new file mode 100644 index 0000000000000..7286beb1f977a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/urn_functions.py @@ -0,0 +1,18 @@ +def get_qualified_table_name(urn: str) -> str: + part: str = urn.split(",")[-2] + + if len(part.split(".")) >= 4: + return ".".join( + part.split(".")[-3:] + ) # return only db.schema.table skip platform instance as higher code is + # failing if encounter platform-instance in qualified table name + else: + return part + + +def get_table_name(urn: str) -> str: + qualified_table_name: str = get_qualified_table_name( + urn=urn, + ) + + return qualified_table_name.split(".")[-1] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py new file mode 100644 index 0000000000000..390e71ef9d4bd --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -0,0 +1,636 @@ +import logging +import re +from abc import ABC, abstractmethod +from functools import lru_cache +from typing import Dict, List, Optional + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.looker.looker_common import ( + LookerExplore, + LookerViewId, + ViewField, + ViewFieldType, +) +from datahub.ingestion.source.looker.looker_view_id_cache import LookerViewIdCache +from datahub.ingestion.source.looker.lookml_concept_context import ( + LookerFieldContext, + LookerViewContext, +) +from datahub.ingestion.source.looker.lookml_config import ( + DERIVED_VIEW_SUFFIX, + NAME, + LookerConnectionDefinition, + LookMLSourceConfig, + LookMLSourceReport, +) +from datahub.ingestion.source.looker.urn_functions import get_qualified_table_name +from datahub.sql_parsing.sqlglot_lineage import ( + ColumnLineageInfo, + ColumnRef, + SqlParsingResult, + Urn, + create_lineage_sql_parsed_result, +) + +logger = logging.getLogger(__name__) + + +def is_derived_view(view_name: str) -> bool: + if DERIVED_VIEW_SUFFIX in view_name.lower(): + return True + + return False + + +def get_derived_looker_view_id( + qualified_table_name: str, + looker_view_id_cache: "LookerViewIdCache", + base_folder_path: str, +) -> Optional[LookerViewId]: + # qualified_table_name can be in either of below format + # 1) db.schema.employee_income_source.sql_table_name + # 2) db.employee_income_source.sql_table_name + # 3) employee_income_source.sql_table_name + # In any of the form we need the text coming before ".sql_table_name" and after last "." 
+ parts: List[str] = re.split( + DERIVED_VIEW_SUFFIX, qualified_table_name, flags=re.IGNORECASE + ) + view_name: str = parts[0].split(".")[-1] + + looker_view_id: Optional[LookerViewId] = looker_view_id_cache.get_looker_view_id( + view_name=view_name, + base_folder_path=base_folder_path, + ) + + return looker_view_id + + +def resolve_derived_view_urn_of_col_ref( + column_refs: List[ColumnRef], + looker_view_id_cache: "LookerViewIdCache", + base_folder_path: str, + config: LookMLSourceConfig, +) -> List[ColumnRef]: + + new_column_refs: List[ColumnRef] = [] + for col_ref in column_refs: + if is_derived_view(col_ref.table.lower()): + new_urns: List[str] = fix_derived_view_urn( + urns=[col_ref.table], + looker_view_id_cache=looker_view_id_cache, + base_folder_path=base_folder_path, + config=config, + ) + if not new_urns: + logger.warning( + f"Not able to resolve to derived view looker id for {col_ref.table}" + ) + continue + + new_column_refs.append(ColumnRef(table=new_urns[0], column=col_ref.column)) + else: + new_column_refs.append(col_ref) + + return new_column_refs + + +def fix_derived_view_urn( + urns: List[str], + looker_view_id_cache: "LookerViewIdCache", + base_folder_path: str, + config: LookMLSourceConfig, +) -> List[str]: + # Regenerate view urn if .sql_table_name is present in urn + new_urns: List[str] = [] + for urn in urns: + if is_derived_view(urn): + looker_view_id = get_derived_looker_view_id( + qualified_table_name=get_qualified_table_name(urn), + looker_view_id_cache=looker_view_id_cache, + base_folder_path=base_folder_path, + ) + + if looker_view_id is None: + logger.warning( + f"Not able to resolve to derived view looker id for {urn}" + ) + continue + + new_urns.append(looker_view_id.get_urn(config=config)) + else: + new_urns.append(urn) + + return new_urns + + +def _platform_names_have_2_parts(platform: str) -> bool: + return platform in {"hive", "mysql", "athena"} + + +def _drop_hive_dot(urn: str) -> str: + """ + This is special handling for hive platform where "hive." is coming in urn's id because of the way SQL + is written in lookml. + + Example: urn:li:dataset:(urn:li:dataPlatform:hive,hive.my_database.my_table,PROD) + + Here we need to transform hive.my_database.my_table to my_database.my_table + """ + if urn.startswith("urn:li:dataset:(urn:li:dataPlatform:hive"): + return re.sub(r"hive\.", "", urn) + + return urn + + +def _drop_hive_dot_from_upstream(upstreams: List[ColumnRef]) -> List[ColumnRef]: + return [ + ColumnRef(table=_drop_hive_dot(column_ref.table), column=column_ref.column) + for column_ref in upstreams + ] + + +def _generate_fully_qualified_name( + sql_table_name: str, + connection_def: LookerConnectionDefinition, + reporter: LookMLSourceReport, +) -> str: + """Returns a fully qualified dataset name, resolved through a connection definition. 
+ Input sql_table_name can be in three forms: table, db.table, db.schema.table""" + # TODO: This function should be extracted out into a Platform specific naming class since name translations + # are required across all connectors + + # Bigquery has "project.db.table" which can be mapped to db.schema.table form + # All other relational db's follow "db.schema.table" + # With the exception of mysql, hive, athena which are "db.table" + + # first detect which one we have + parts = len(sql_table_name.split(".")) + + if parts == 3: + # fully qualified, but if platform is of 2-part, we drop the first level + if _platform_names_have_2_parts(connection_def.platform): + sql_table_name = ".".join(sql_table_name.split(".")[1:]) + return sql_table_name.lower() + + if parts == 1: + # Bare table form + if _platform_names_have_2_parts(connection_def.platform): + dataset_name = f"{connection_def.default_db}.{sql_table_name}" + else: + dataset_name = f"{connection_def.default_db}.{connection_def.default_schema}.{sql_table_name}" + return dataset_name.lower() + + if parts == 2: + # if this is a 2 part platform, we are fine + if _platform_names_have_2_parts(connection_def.platform): + return sql_table_name.lower() + # otherwise we attach the default top-level container + dataset_name = f"{connection_def.default_db}.{sql_table_name}" + return dataset_name.lower() + + reporter.report_warning( + title="Malformed Table Name", + message="Table name has more than 3 parts.", + context=f"Table Name: {sql_table_name}", + ) + return sql_table_name.lower() + + +class AbstractViewUpstream(ABC): + """ + Implementation of this interface extracts the view upstream as per the way the view is bound to datasets. + For detail explanation please refer lookml_concept_context.LookerViewContext documentation. 
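The three qualification rules above are easiest to see with concrete inputs. A hedged, self-contained illustration with a minimal stand-in for LookerConnectionDefinition (the real class carries more fields) and illustrative connection values:

from dataclasses import dataclass

@dataclass
class Conn:
    platform: str
    default_db: str
    default_schema: str

def qualify(name: str, conn: Conn) -> str:
    # hive, mysql, athena use db.table; everything else uses db.schema.table
    two_part = conn.platform in {"hive", "mysql", "athena"}
    parts = name.split(".")
    if len(parts) == 3:
        return ".".join(parts[1:]).lower() if two_part else name.lower()
    if len(parts) == 1:
        return (
            f"{conn.default_db}.{name}"
            if two_part
            else f"{conn.default_db}.{conn.default_schema}.{name}"
        ).lower()
    if len(parts) == 2:
        return name.lower() if two_part else f"{conn.default_db}.{name}".lower()
    return name.lower()

pg = Conn("postgres", "warehouse", "public")
hive = Conn("hive", "default", "")
assert qualify("orders", pg) == "warehouse.public.orders"
assert qualify("sales.orders", hive) == "sales.orders"
assert qualify("proj.sales.orders", hive) == "sales.orders"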
+ """ + + view_context: LookerViewContext + looker_view_id_cache: LookerViewIdCache + config: LookMLSourceConfig + ctx: PipelineContext + + def __init__( + self, + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, + config: LookMLSourceConfig, + ctx: PipelineContext, + ): + self.view_context = view_context + self.looker_view_id_cache = looker_view_id_cache + self.config = config + self.ctx = ctx + + @abstractmethod + def get_upstream_column_ref( + self, field_context: LookerFieldContext + ) -> List[ColumnRef]: + pass + + @abstractmethod + def get_upstream_dataset_urn(self) -> List[Urn]: + pass + + def create_fields(self) -> List[ViewField]: + return [] # it is for the special case + + +class SqlBasedDerivedViewUpstream(AbstractViewUpstream): + """ + Handle the case where upstream dataset is defined in derived_table.sql + """ + + def __init__( + self, + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, + config: LookMLSourceConfig, + ctx: PipelineContext, + ): + super().__init__(view_context, looker_view_id_cache, config, ctx) + # These are the function where we need to catch the response once calculated + self._get_spr = lru_cache(maxsize=1)(self.__get_spr) + self._get_upstream_dataset_urn = lru_cache(maxsize=1)( + self.__get_upstream_dataset_urn + ) + + def __get_spr(self) -> Optional[SqlParsingResult]: + # for backward compatibility + if not self.config.parse_table_names_from_sql: + return None + + spr = create_lineage_sql_parsed_result( + query=self.view_context.sql(), + default_schema=self.view_context.view_connection.default_schema, + default_db=self.view_context.view_connection.default_db, + platform=self.view_context.view_connection.platform, + platform_instance=self.view_context.view_connection.platform_instance, + env=self.view_context.view_connection.platform_env or self.config.env, + graph=self.ctx.graph, + ) + + if ( + spr.debug_info.table_error is not None + or spr.debug_info.column_error is not None + ): + logging.debug( + f"Failed to parsed the sql query. 
table_error={spr.debug_info.table_error} and " + f"column_error={spr.debug_info.column_error}" + ) + return None + + return spr + + def __get_upstream_dataset_urn(self) -> List[Urn]: + sql_parsing_result: Optional[SqlParsingResult] = self._get_spr() + + if sql_parsing_result is None: + return [] + + upstream_dataset_urns: List[str] = [ + _drop_hive_dot(urn) for urn in sql_parsing_result.in_tables + ] + + # fix any derived view reference present in urn + upstream_dataset_urns = fix_derived_view_urn( + urns=upstream_dataset_urns, + looker_view_id_cache=self.looker_view_id_cache, + base_folder_path=self.view_context.base_folder_path, + config=self.config, + ) + + return upstream_dataset_urns + + def create_fields(self) -> List[ViewField]: + spr: Optional[SqlParsingResult] = self._get_spr() + + if spr is None: + return [] + + fields: List[ViewField] = [] + + column_lineages: List[ColumnLineageInfo] = ( + spr.column_lineage if spr.column_lineage is not None else [] + ) + + for cll in column_lineages: + fields.append( + ViewField( + name=cll.downstream.column, + label="", + type=cll.downstream.native_column_type + if cll.downstream.native_column_type is not None + else "unknown", + description="", + field_type=ViewFieldType.UNKNOWN, + upstream_fields=_drop_hive_dot_from_upstream(cll.upstreams), + ) + ) + + return fields + + def get_upstream_column_ref( + self, field_context: LookerFieldContext + ) -> List[ColumnRef]: + sql_parsing_result: Optional[SqlParsingResult] = self._get_spr() + + if sql_parsing_result is None: + return [] + + upstreams_column_refs: List[ColumnRef] = [] + if sql_parsing_result.column_lineage: + for cll in sql_parsing_result.column_lineage: + if cll.downstream.column == field_context.name(): + upstreams_column_refs = cll.upstreams + break + + # field might get skip either because of Parser not able to identify the column from GMS + # in-case of "select * from look_ml_view.SQL_TABLE_NAME" or extra field are defined in the looker view which is + # referring to upstream table + if self._get_upstream_dataset_urn() and not upstreams_column_refs: + upstreams_column_refs = [ + ColumnRef( + table=self._get_upstream_dataset_urn()[ + 0 + ], # 0th index has table of from clause + column=column, + ) + for column in field_context.column_name_in_sql_attribute() + ] + + # fix any derived view reference present in urn + upstreams_column_refs = resolve_derived_view_urn_of_col_ref( + column_refs=upstreams_column_refs, + looker_view_id_cache=self.looker_view_id_cache, + base_folder_path=self.view_context.base_folder_path, + config=self.config, + ) + + return upstreams_column_refs + + def get_upstream_dataset_urn(self) -> List[Urn]: + return self._get_upstream_dataset_urn() + + +class NativeDerivedViewUpstream(AbstractViewUpstream): + """ + Handle the case where upstream dataset is defined as derived_table.explore_source + """ + + upstream_dataset_urns: List[str] + explore_column_mapping: Dict + + def __init__( + self, + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, + config: LookMLSourceConfig, + ctx: PipelineContext, + ): + super().__init__(view_context, looker_view_id_cache, config, ctx) + self._get_upstream_dataset_urn = lru_cache(maxsize=1)( + self.__get_upstream_dataset_urn + ) + self._get_explore_column_mapping = lru_cache(maxsize=1)( + self.__get_explore_column_mapping + ) + + def __get_upstream_dataset_urn(self) -> List[str]: + current_view_id: Optional[ + LookerViewId + ] = self.looker_view_id_cache.get_looker_view_id( + 
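Both derived-view upstream classes memoise their private getters by wrapping the bound method with `lru_cache(maxsize=1)` in `__init__`, so the SQL parse and urn resolution run at most once per view instance. A self-contained check of that pattern; the class and values are illustrative:

from functools import lru_cache

class Expensive:
    def __init__(self) -> None:
        self.calls = 0
        # wrap the bound method so the cache lives on this instance only
        self._compute = lru_cache(maxsize=1)(self.__compute)

    def __compute(self) -> int:
        self.calls += 1
        return 42

e = Expensive()
assert e._compute() == 42 and e._compute() == 42
assert e.calls == 1  # second call served from the per-instance cache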
view_name=self.view_context.name(), + base_folder_path=self.view_context.base_folder_path, + ) + + # Current view will always be present in cache. The assert will silence the lint + assert current_view_id + + # We're creating a "LookerExplore" just to use the urn generator. + upstream_dataset_urns: List[str] = [ + LookerExplore( + name=self.view_context.explore_source()[NAME], + model_name=current_view_id.model_name, + ).get_explore_urn(self.config) + ] + + return upstream_dataset_urns + + def __get_explore_column_mapping(self) -> Dict: + explore_columns: Dict = self.view_context.explore_source().get("columns", {}) + + explore_column_mapping = {} + + for column in explore_columns: + explore_column_mapping[column[NAME]] = column + + return explore_column_mapping + + def get_upstream_column_ref( + self, field_context: LookerFieldContext + ) -> List[ColumnRef]: + upstream_column_refs: List[ColumnRef] = [] + + if not self._get_upstream_dataset_urn(): + # No upstream explore dataset found + logging.debug( + f"upstream explore not found for field {field_context.name()} of view {self.view_context.name()}" + ) + return upstream_column_refs + + explore_urn: str = self._get_upstream_dataset_urn()[0] + + for column in field_context.column_name_in_sql_attribute(): + if column in self._get_explore_column_mapping(): + explore_column: Dict = self._get_explore_column_mapping()[column] + upstream_column_refs.append( + ColumnRef( + column=explore_column.get("field", explore_column[NAME]), + table=explore_urn, + ) + ) + + return upstream_column_refs + + def get_upstream_dataset_urn(self) -> List[Urn]: + return self._get_upstream_dataset_urn() + + +class RegularViewUpstream(AbstractViewUpstream): + """ + Handle the case where upstream dataset name is equal to view-name + """ + + upstream_dataset_urn: Optional[str] + + def __init__( + self, + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, + config: LookMLSourceConfig, + ctx: PipelineContext, + ): + super().__init__(view_context, looker_view_id_cache, config, ctx) + self.upstream_dataset_urn = None + + self._get_upstream_dataset_urn = lru_cache(maxsize=1)( + self.__get_upstream_dataset_urn + ) + + def __get_upstream_dataset_urn(self) -> Urn: + # In regular case view's upstream dataset is either same as view-name or mentioned in "sql_table_name" field + # view_context.sql_table_name() handle this condition to return dataset name + qualified_table_name: str = _generate_fully_qualified_name( + sql_table_name=self.view_context.sql_table_name(), + connection_def=self.view_context.view_connection, + reporter=self.view_context.reporter, + ) + + self.upstream_dataset_urn = make_dataset_urn_with_platform_instance( + platform=self.view_context.view_connection.platform, + name=qualified_table_name.lower(), + platform_instance=self.view_context.view_connection.platform_instance, + env=self.view_context.view_connection.platform_env or self.config.env, + ) + + return self.upstream_dataset_urn + + def get_upstream_column_ref( + self, field_context: LookerFieldContext + ) -> List[ColumnRef]: + upstream_column_ref: List[ColumnRef] = [] + + for column_name in field_context.column_name_in_sql_attribute(): + upstream_column_ref.append( + ColumnRef(table=self._get_upstream_dataset_urn(), column=column_name) + ) + + return upstream_column_ref + + def get_upstream_dataset_urn(self) -> List[Urn]: + return [self._get_upstream_dataset_urn()] + + +class DotSqlTableNameViewUpstream(AbstractViewUpstream): + """ + Handle the case where upstream dataset name is 
mentioned as sql_table_name: ${view-name.SQL_TABLE_NAME} + """ + + upstream_dataset_urn: List[Urn] + + def __init__( + self, + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, + config: LookMLSourceConfig, + ctx: PipelineContext, + ): + super().__init__(view_context, looker_view_id_cache, config, ctx) + self.upstream_dataset_urn = [] + + self._get_upstream_dataset_urn = lru_cache(maxsize=1)( + self.__get_upstream_dataset_urn + ) + + def __get_upstream_dataset_urn(self) -> List[Urn]: + # In this case view_context.sql_table_name() refers to derived view name + looker_view_id = get_derived_looker_view_id( + qualified_table_name=_generate_fully_qualified_name( + self.view_context.sql_table_name(), + self.view_context.view_connection, + self.view_context.reporter, + ), + base_folder_path=self.view_context.base_folder_path, + looker_view_id_cache=self.looker_view_id_cache, + ) + + if looker_view_id is not None: + self.upstream_dataset_urn = [ + looker_view_id.get_urn( + config=self.config, + ) + ] + + return self.upstream_dataset_urn + + def get_upstream_column_ref( + self, field_context: LookerFieldContext + ) -> List[ColumnRef]: + upstream_column_ref: List[ColumnRef] = [] + if not self._get_upstream_dataset_urn(): + return upstream_column_ref + + for column_name in field_context.column_name_in_sql_attribute(): + upstream_column_ref.append( + ColumnRef(table=self._get_upstream_dataset_urn()[0], column=column_name) + ) + + return upstream_column_ref + + def get_upstream_dataset_urn(self) -> List[Urn]: + return self._get_upstream_dataset_urn() + + +class EmptyImplementation(AbstractViewUpstream): + def get_upstream_column_ref( + self, field_context: LookerFieldContext + ) -> List[ColumnRef]: + return [] + + def get_upstream_dataset_urn(self) -> List[Urn]: + return [] + + +def create_view_upstream( + view_context: LookerViewContext, + looker_view_id_cache: LookerViewIdCache, + config: LookMLSourceConfig, + ctx: PipelineContext, + reporter: LookMLSourceReport, +) -> AbstractViewUpstream: + if view_context.is_regular_case(): + return RegularViewUpstream( + view_context=view_context, + config=config, + ctx=ctx, + looker_view_id_cache=looker_view_id_cache, + ) + + if view_context.is_sql_table_name_referring_to_view(): + return DotSqlTableNameViewUpstream( + view_context=view_context, + config=config, + ctx=ctx, + looker_view_id_cache=looker_view_id_cache, + ) + + if ( + view_context.is_sql_based_derived_case() + or view_context.is_sql_based_derived_view_without_fields_case() + ): + return SqlBasedDerivedViewUpstream( + view_context=view_context, + config=config, + ctx=ctx, + looker_view_id_cache=looker_view_id_cache, + ) + + if view_context.is_native_derived_case(): + return NativeDerivedViewUpstream( + view_context=view_context, + config=config, + ctx=ctx, + looker_view_id_cache=looker_view_id_cache, + ) + + reporter.report_warning( + title="Implementation Not Found", + message="No implementation found to resolve upstream of the view", + context=view_context.view_file_name(), + ) + + return EmptyImplementation( + view_context=view_context, + config=config, + ctx=ctx, + looker_view_id_cache=looker_view_id_cache, + ) diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 3e049f8b2ef4e..e1cedee33dcb6 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -1050,23 +1050,34 @@ def test_upstream_cll(pytestconfig, 
tmp_path, mock_time, mock_datahub_graph): ), ], ) + config = mock.MagicMock() + + config.view_naming_pattern.replace_variables.return_value = "dataset_lineages" + config.platform_name = "snowflake" + config.platform_instance = "sales" + config.env = "DEV" looker_explore: Optional[LookerExplore] = looker_common.LookerExplore.from_api( model="fake", explore_name="my_explore_name", client=mocked_client, reporter=mock.MagicMock(), - source_config=mock.MagicMock(), + source_config=config, ) assert looker_explore is not None assert looker_explore.name == "my_explore_name" assert looker_explore.fields is not None assert len(looker_explore.fields) == 3 + assert ( - looker_explore.fields[2].upstream_fields[0] == "dataset_lineages.createdon" + looker_explore.fields[2].upstream_fields[0].table + == "urn:li:dataset:(urn:li:dataPlatform:snowflake," + "sales.dataset_lineages,DEV)" ) + assert looker_explore.fields[2].upstream_fields[0].column == "createdon" + @freeze_time(FROZEN_TIME) def test_explore_tags(pytestconfig, tmp_path, mock_time, mock_datahub_graph): diff --git a/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json index 149610768af51..ca9a1503a6854 100644 --- a/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/duplicate_field_ingestion_golden.json @@ -167,7 +167,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD)", "type": "VIEW" } ], @@ -175,7 +175,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),entity)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),entity)" ], "downstreamType": "FIELD", "downstreams": [ @@ -186,7 +186,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),metadata)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),metadata)" ], "downstreamType": "FIELD", "downstreams": [ @@ -197,7 +197,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),urn)" ], "downstreamType": "FIELD", "downstreams": [ @@ -208,7 +208,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),version)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),version)" ], "downstreamType": "FIELD", "downstreams": [ @@ -219,7 +219,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),createdon)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),createdon)" ], "downstreamType": "FIELD", "downstreams": [ @@ -230,7 +230,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),count)" + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),count)" ], "downstreamType": "FIELD", "downstreams": [ diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index 1a789af60a855..d870c6dee4065 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -167,9 +167,66 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -428,6 +485,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + 
"downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -662,7 +765,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.include_able,PROD)", "type": "VIEW" } ] @@ -788,7 +891,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.events,PROD)", "type": "VIEW" } ] @@ -914,7 +1017,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.events,PROD)", "type": "VIEW" } ], @@ -922,7 +1025,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD),additional_measure)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.events,PROD),additional_measure)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1098,7 +1201,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" } ] @@ -1224,7 +1327,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.include_able,PROD)", "type": "VIEW" } ] @@ -1299,7 +1402,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1350,9 +1453,44 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + 
"downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1377,7 +1515,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1391,7 +1529,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": "aliased_platform", "nullable": false, "description": "", "label": "", @@ -1405,7 +1543,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", "nullable": false, "description": "", "label": "", @@ -1494,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1545,12 +1683,88 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,order,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } ] } }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -1671,7 +1885,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - 
"dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.ecommerce.ability,PROD)", "type": "VIEW" } ], @@ -1679,7 +1893,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD),pk)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.ecommerce.ability,PROD),pk)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1690,7 +1904,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD),count)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.ecommerce.ability,PROD),count)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1887,7 +2101,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD)", "type": "VIEW" } ], @@ -1895,7 +2109,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD),id)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1906,7 +2120,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),owner_name)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD),owner_name)" ], "downstreamType": "FIELD", "downstreams": [ @@ -2385,7 +2599,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.flightstats.accidents,PROD)", "type": "VIEW" } ], @@ -2393,7 +2607,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.flightstats.accidents,PROD),id)" ], "downstreamType": "FIELD", "downstreams": [ diff --git a/metadata-ingestion/tests/integration/lookml/field_tag_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/field_tag_ingestion_golden.json index fdd37139880bd..19352a85249ba 100644 --- a/metadata-ingestion/tests/integration/lookml/field_tag_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/field_tag_ingestion_golden.json @@ -167,7 +167,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD)", "type": "VIEW" } ], @@ -175,7 +175,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),entity)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),entity)" ], "downstreamType": "FIELD", "downstreams": [ @@ -186,7 +186,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),metadata)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),metadata)" ], "downstreamType": "FIELD", "downstreams": [ @@ 
-197,7 +197,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),urn)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),urn)" ], "downstreamType": "FIELD", "downstreams": [ @@ -208,7 +208,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),version)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),version)" ], "downstreamType": "FIELD", "downstreams": [ @@ -219,7 +219,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),createdon)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),createdon)" ], "downstreamType": "FIELD", "downstreams": [ @@ -230,7 +230,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.public.dataset_lineages,PROD),count)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.public.dataset_lineages,PROD),count)" ], "downstreamType": "FIELD", "downstreams": [ diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples/liquid.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples/liquid.view.lkml index bfeebfba09aa6..8d26db7931b0d 100644 --- a/metadata-ingestion/tests/integration/lookml/lkml_samples/liquid.view.lkml +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples/liquid.view.lkml @@ -1,14 +1,14 @@ view: customer_facts { derived_table: { -sql: - SELECT - customer_id, - SUM(sale_price) AS lifetime_spend - FROM - order - WHERE - {% condition order_region %} order.region {% endcondition %} - GROUP BY 1 - ;; - } + sql: + SELECT + customer_id, + SUM(sale_price) AS lifetime_spend + FROM + order + WHERE + {% condition order_region %} order.region {% endcondition %} + GROUP BY 1 + ;; + } } diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples/nested/fragment_derived.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples/nested/fragment_derived.view.lkml index 284bf4172cfdc..d8397cc355e0e 100644 --- a/metadata-ingestion/tests/integration/lookml/lkml_samples/nested/fragment_derived.view.lkml +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples/nested/fragment_derived.view.lkml @@ -1,9 +1,9 @@ view: fragment_derived_view { derived_table: { - sql: date DATE encode ZSTD, - platform VARCHAR(20) encode ZSTD AS aliased_platform, - country VARCHAR(20) encode ZSTD + sql: SELECT date AS DATE, + platform AS aliased_platform, + country ;; } } diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/included_view_file.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/included_view_file.view.lkml index 6e1ccda22cac7..06439ad14ef88 100644 --- a/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/included_view_file.view.lkml +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/included_view_file.view.lkml @@ -1,3 +1,3 @@ view: include_able_view { - sql_table_name: looker_schema.include_able ;; + sql_table_name: "looker_schema"."include_able" ;; } diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/liquid.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/liquid.view.lkml index bfeebfba09aa6..4ce62579d08ed 
100644 --- a/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/liquid.view.lkml +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/liquid.view.lkml @@ -1,6 +1,6 @@ view: customer_facts { derived_table: { -sql: + sql: SELECT customer_id, SUM(sale_price) AS lifetime_spend diff --git a/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/nested/fragment_derived.view.lkml b/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/nested/fragment_derived.view.lkml index 284bf4172cfdc..d8397cc355e0e 100644 --- a/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/nested/fragment_derived.view.lkml +++ b/metadata-ingestion/tests/integration/lookml/lkml_samples_hive/nested/fragment_derived.view.lkml @@ -1,9 +1,9 @@ view: fragment_derived_view { derived_table: { - sql: date DATE encode ZSTD, - platform VARCHAR(20) encode ZSTD AS aliased_platform, - country VARCHAR(20) encode ZSTD + sql: SELECT date AS DATE, + platform AS aliased_platform, + country ;; } } diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 05c950f9e1051..8813ea532fa2b 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -170,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -428,6 +485,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + 
"upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1299,7 +1402,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1353,6 +1456,41 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1377,7 +1515,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1391,7 +1529,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": "aliased_platform", "nullable": false, "description": "", "label": "", @@ -1405,7 +1543,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", 
"nullable": false, "description": "", "label": "", @@ -1494,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1548,9 +1686,85 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:bigquery,project-foo.default-db.order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } ] } }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 23384d6070d20..4bc1a0f2f7da5 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -170,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -428,6 +485,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1299,7 +1402,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1353,6 +1456,41 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + 
"downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1377,7 +1515,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1391,7 +1529,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": "aliased_platform", "nullable": false, "description": "", "label": "", @@ -1405,7 +1543,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", "nullable": false, "description": "", "label": "", @@ -1494,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1548,9 +1686,85 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,default-hive-db.order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } ] } }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} 
+ } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 7e323170e58da..3fd37c4722185 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -159,6 +159,77 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } + ] + } + }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { "schemaName": "my_view", @@ -403,6 +474,66 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } + ] + } + }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { "schemaName": "my_derived_view", @@ -1271,7 +1402,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1314,6 +1445,121 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "fragment_derived_view", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + 
"hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "date", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "aliased_platform", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "country", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -1386,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1429,20 +1675,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", - "type": "VIEW" - } - ] - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index 579a984b88243..3fd37c4722185 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -170,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -428,6 +485,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1299,7 +1402,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1353,6 +1456,41 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1377,7 +1515,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1391,7 +1529,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": "aliased_platform", "nullable": false, "description": "", "label": "", @@ -1405,7 +1543,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", "nullable": false, "description": "", "label": "", @@ -1494,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1537,20 +1675,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", - "type": "VIEW" - } - ] - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index d1487a62e95a8..bb8a379fdde22 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -170,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -428,6 +485,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1299,7 +1402,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1353,6 +1456,41 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.fragment_derived_view,DEV),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1377,7 +1515,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1391,7 +1529,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": "aliased_platform", "nullable": false, "description": "", "label": "", @@ -1405,7 +1543,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", "nullable": false, "description": "", "label": "", @@ -1494,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1537,20 +1675,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)", - "type": "VIEW" - } - ] - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index d7793fbed8ef0..b8a2bcc020c34 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -170,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -429,6 +486,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1306,7 +1409,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1360,6 +1463,41 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1384,7 +1522,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1398,7 +1536,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": "aliased_platform", "nullable": false, "description": "", "label": "", @@ -1412,7 +1550,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", "nullable": false, "description": "", "label": "", @@ -1502,7 +1640,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1545,20 +1683,6 @@ "removed": false } }, - { - "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { - "upstreams": [ - { - "auditStamp": { - "time": 1586847600000, - "actor": "urn:li:corpuser:datahub" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", - "type": "VIEW" - } - ] - } - }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 5c43d1dc5ceaf..8c3504a736490 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -142,30 +142,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { @@ -194,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -350,13 +383,19 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } ] } }, @@ -370,12 +409,12 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "subTypes", "aspect": { "json": { - "materialized": false, - "viewLogic": "view: owners {\n dimension: id {\n primary_key: yes\n sql: ${TABLE}.id ;;\n }\n dimension: owner_name {\n sql: ${TABLE}.owner_name ;;\n }\n}", - "viewLanguage": "lookml" + "typeNames": [ + "View" + ] } }, "systemMetadata": { @@ -388,10 +427,12 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "viewProperties", "aspect": { "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + "materialized": false, + "viewLogic": "view: owners {\n dimension: id {\n primary_key: yes\n sql: ${TABLE}.id ;;\n }\n dimension: owner_name {\n sql: ${TABLE}.owner_name ;;\n }\n}", + "viewLanguage": "lookml" } }, "systemMetadata": { @@ -404,18 +445,10 @@ "entityType": "dataset", "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" } }, "systemMetadata": { @@ -568,13 +601,19 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.owners,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } ] } }, @@ -588,12 +627,12 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "subTypes", "aspect": { "json": { - "materialized": false, - "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", - "viewLanguage": "sql" + "typeNames": [ + "View" + ] } }, "systemMetadata": { @@ -606,10 +645,12 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "viewProperties", "aspect": { "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + "materialized": false, + "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", + "viewLanguage": "sql" } }, "systemMetadata": { @@ -622,18 +663,10 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - } - ] + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" } }, "systemMetadata": { @@ -670,6 +703,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,rs_warehouse.default_db.default_schema.my_table,DEV),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -824,6 +914,30 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_view2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_same_name_views_different_file_path.json b/metadata-ingestion/tests/integration/lookml/lookml_same_name_views_different_file_path.json index a5357ccada8a3..b86f7703e6f5e 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_same_name_views_different_file_path.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_same_name_views_different_file_path.json @@ -142,36 +142,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - }, - { - "id": "path1" - }, - { - "id": "foo" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1586847600000, - "runId": "lookml-test", - "lastRunId": "no-run-id-provided" - } -}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { @@ -200,6 +170,63 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ 
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -356,13 +383,25 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path1.foo.view.my_view,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "browsePathsV2", "aspect": { "json": { - "typeNames": [ - "View" + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + }, + { + "id": "path1" + }, + { + "id": "foo" + } ] } }, @@ -376,12 +415,12 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD)", "changeType": "UPSERT", - "aspectName": "viewProperties", + "aspectName": "subTypes", "aspect": { "json": { - "materialized": false, - "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", - "viewLanguage": "sql" + "typeNames": [ + "View" + ] } }, "systemMetadata": { @@ -394,10 +433,12 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "viewProperties", "aspect": { "json": { - "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + "materialized": false, + "viewLogic": "SELECT\n is_latest,\n country,\n city,\n timestamp,\n measurement\n FROM\n my_table", + "viewLanguage": "sql" } }, "systemMetadata": { @@ -410,24 +451,10 @@ "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "Develop" - }, - { - "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", - "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" - }, - { - "id": "path2" - }, - { - 
"id": "foo" - } - ] + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" } }, "systemMetadata": { @@ -464,6 +491,52 @@ "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.my_table,DEV),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -597,6 +670,36 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.path2.foo.view.my_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + }, + { + "id": "path2" + }, + { + "id": "foo" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index 25d6511d172a6..7265ee3c6c62b 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -167,9 +167,66 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..my_table,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),country)" 
+ ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),is_latest)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),is_latest)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,my_table,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -428,9 +485,55 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_view,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),city)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD),city)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),timestamp)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD),timestamp)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.foo.view.my_view,PROD),measurement)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.bar.view.my_derived_view,PROD),average_measurement)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -668,7 +771,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.include_able,PROD)", "type": "VIEW" } ] 
@@ -797,7 +900,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.events,PROD)", "type": "VIEW" } ] @@ -926,7 +1029,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.events,PROD)", "type": "VIEW" } ], @@ -934,7 +1037,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.events,PROD),additional_measure)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.events,PROD),additional_measure)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1113,7 +1216,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..autodetect_sql_name_based_on_view_name,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,..autodetect_sql_name_based_on_view_name,PROD)", "type": "VIEW" } ] @@ -1242,7 +1345,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.looker_schema.include_able,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.looker_schema.include_able,PROD)", "type": "VIEW" } ] @@ -1320,7 +1423,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "date DATE encode ZSTD, \n platform VARCHAR(20) encode ZSTD AS aliased_platform, \n country VARCHAR(20) encode ZSTD", + "viewLogic": "SELECT date AS DATE,\n platform AS aliased_platform,\n country", "viewLanguage": "sql" } }, @@ -1371,9 +1474,44 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..fragment_derived_view,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD),date)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD),date)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD),platform)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD),aliased_platform)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,fragment_derived_view,PROD),country)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.nested.fragment_derived.view.fragment_derived_view,PROD),country)" + ], + "confidenceScore": 1.0 + } ] } }, @@ -1398,7 +1536,7 @@ }, "fields": [ { - "fieldPath": "aliased_platform", + "fieldPath": "date", "nullable": false, "description": "", "label": "", @@ -1412,7 +1550,7 @@ "isPartOfKey": false }, { - "fieldPath": "country", + "fieldPath": 
"aliased_platform", "nullable": false, "description": "", "label": "", @@ -1426,7 +1564,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", + "fieldPath": "country", "nullable": false, "description": "", "label": "", @@ -1518,7 +1656,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", "viewLanguage": "sql" } }, @@ -1569,12 +1707,88 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..order,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,order,PROD)", "type": "VIEW" } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.liquid.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.liquid.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } ] } }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { @@ -1698,7 +1912,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.ecommerce.ability,PROD)", "type": "VIEW" } ], @@ -1706,7 +1920,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD),pk)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.ecommerce.ability,PROD),pk)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1717,7 +1931,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.ecommerce.ability,PROD),count)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.ecommerce.ability,PROD),count)" ], "downstreamType": 
"FIELD", "downstreams": [ @@ -1917,7 +2131,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD)", "type": "VIEW" } ], @@ -1925,7 +2139,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD),id)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1936,7 +2150,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),owner_name)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD),owner_name)" ], "downstreamType": "FIELD", "downstreams": [ @@ -1947,7 +2161,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,..owners,PROD),owner_name)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,..owners,PROD),owner_name)" ], "downstreamType": "FIELD", "downstreams": [ @@ -2453,7 +2667,7 @@ "time": 1586847600000, "actor": "urn:li:corpuser:datahub" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,.flightstats.accidents,PROD)", "type": "VIEW" } ], @@ -2461,7 +2675,7 @@ { "upstreamType": "FIELD_SET", "upstreams": [ - "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:conn,.flightstats.accidents,PROD),id)" + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,.flightstats.accidents,PROD),id)" ], "downstreamType": "FIELD", "downstreams": [ diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 1099a29ba3b8c..9e051995d0b94 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -12,6 +12,10 @@ from datahub.configuration.common import PipelineExecutionError from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.file import read_metadata_file +from datahub.ingestion.source.looker.looker_template_language import ( + SpecialVariable, + resolve_liquid_variable, +) from datahub.ingestion.source.looker.lookml_source import ( LookerModel, LookerRefinementResolver, @@ -40,12 +44,13 @@ def get_default_recipe(output_file_path, base_folder_path): "type": "lookml", "config": { "base_folder": base_folder_path, - "connection_to_platform_map": {"my_connection": "conn"}, + "connection_to_platform_map": {"my_connection": "postgres"}, "parse_table_names_from_sql": True, "tag_measures_and_dimensions": False, "project_name": "lkml_samples", "model_pattern": {"deny": ["data2"]}, "emit_reachable_views_only": False, + "liquid_variable": {"order_region": "ap-south-1"}, }, }, "sink": { @@ -63,9 +68,9 @@ def test_lookml_ingest(pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" mce_out_file = "expected_output.json" - # Note this config below is known to create "bad" lineage since the config author has not provided enough information - # to resolve relative table names (which are not fully qualified) - # We keep this check just to validate that ingestion doesn't croak on this config + # Note this config below is known to create "bad" lineage since the config author has not 
provided enough
+ # information to resolve relative table names (which are not fully qualified). We keep this check just to validate
+ # that ingestion doesn't croak on this config
pipeline = Pipeline.create(
get_default_recipe(
@@ -488,6 +493,9 @@ def ingestion_test(
"model_pattern": {"deny": ["data2"]},
"emit_reachable_views_only": False,
"process_refinements": False,
+ "liquid_variable": {
+ "order_region": "ap-south-1",
+ },
},
},
"sink": {
@@ -890,6 +898,117 @@ def test_duplicate_field_ingest(pytestconfig, tmp_path, mock_time):
)
+@freeze_time(FROZEN_TIME)
+def test_view_to_view_lineage_and_liquid_template(pytestconfig, tmp_path, mock_time):
+ test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
+ mce_out_file = "vv_lineage_liquid_template_golden.json"
+
+ new_recipe = get_default_recipe(
+ f"{tmp_path}/{mce_out_file}",
+ f"{test_resources_dir}/vv-lineage-and-liquid-templates",
+ )
+
+ new_recipe["source"]["config"]["liquid_variable"] = {
+ "_user_attributes": {
+ "looker_env": "dev",
+ "dev_database_prefix": "employee",
+ "dev_schema_prefix": "public",
+ },
+ "dw_eff_dt_date": {
+ "_is_selected": True,
+ },
+ "source_region": "ap-south-1",
+ }
+
+ pipeline = Pipeline.create(new_recipe)
+ pipeline.run()
+ pipeline.pretty_print_summary()
+ pipeline.raise_from_status(raise_warnings=True)
+
+ golden_path = test_resources_dir / "vv_lineage_liquid_template_golden.json"
+ mce_helpers.check_golden_file(
+ pytestconfig,
+ output_path=tmp_path / mce_out_file,
+ golden_path=golden_path,
+ )
+
+
+@freeze_time(FROZEN_TIME)
+def test_special_liquid_variables():
+ text: str = """
+ SELECT
+ employee_id,
+ employee_name,
+ {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %}
+ prod_core.data.r_metric_summary_v2
+ {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._in_query %}
+ prod_core.data.r_metric_summary_v3
+ {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_filtered %}
+ prod_core.data.r_metric_summary_v4
+ {% else %}
+ 'default_table' as source
+ {% endif %},
+ employee_income
+ FROM source_table
+ """
+ input_liquid_variable: dict = {}
+
+ expected_liquid_variable: dict = {
+ **input_liquid_variable,
+ "dw_eff_dt_date": {"_is_selected": True},
+ "finance_dw_eff_dt_date": {"_is_selected": True},
+ "dw_eff_dt_week": {"_is_selected": True},
+ "finance_dw_eff_dt_week": {
+ "_in_query": True,
+ "_is_filtered": True,
+ },
+ }
+
+ actual_liquid_variable = SpecialVariable(
+ input_liquid_variable
+ ).liquid_variable_with_default(text)
+ assert (
+ expected_liquid_variable == actual_liquid_variable
+ ) # Here new keys with default values should get added
+
+ # change input
+ input_liquid_variable = {
+ "finance_dw_eff_dt_week": {"_is_filtered": False},
+ }
+
+ expected_liquid_variable = {
+ **input_liquid_variable,
+ "dw_eff_dt_date": {"_is_selected": True},
+ "finance_dw_eff_dt_date": {"_is_selected": True},
+ "dw_eff_dt_week": {"_is_selected": True},
+ "finance_dw_eff_dt_week": {
+ "_in_query": True,
+ "_is_filtered": False,
+ },
+ }
+
+ actual_liquid_variable = SpecialVariable(
+ input_liquid_variable
+ ).liquid_variable_with_default(text)
+ assert (
+ expected_liquid_variable == actual_liquid_variable
+ ) # should not overwrite the actual value present in
+ # input_liquid_variable
+
+ # Match template after resolution of liquid variables
+ actual_text = resolve_liquid_variable(
+ text=text,
+ liquid_variable=input_liquid_variable,
+ )
+
+ expected_text: str = (
+ "\n SELECT\n employee_id,\n employee_name,\n \n " +
"prod_core.data.r_metric_summary_v2\n ,\n employee_income\n FROM " + "source_table\n " + ) + assert actual_text == expected_text + + @freeze_time(FROZEN_TIME) def test_field_tag_ingest(pytestconfig, tmp_path, mock_time): test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml" diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/activity_logs.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/activity_logs.view.lkml new file mode 100644 index 0000000000000..f0e2dec6e4678 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/activity_logs.view.lkml @@ -0,0 +1,18 @@ +view: activity_logs { + sql_table_name: + {% if _user_attributes['looker_env'] == 'dev' %} + {{ _user_attributes['dev_database_prefix'] }}analytics.{{ _user_attributes['dev_schema_prefix'] }}staging_app.stg_app__activity_logs + {% elsif _user_attributes['looker_env'] == 'prod' %} + analytics.staging_app.stg_app__activity_logs + {% else %} + analytics.staging_app.stg_app__activity_logs + {% endif %} + ;; + + dimension: generated_message_id { + group_label: "IDs" + primary_key: yes + type: number + sql: ${TABLE}."GENERATED_MESSAGE_ID" ;; + } +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml new file mode 100644 index 0000000000000..ea55512c5ca06 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml @@ -0,0 +1,22 @@ +connection: "my_connection" + +include: "activity_logs.view.lkml" +include: "employee_income_source.view.lkml" +include: "employee_total_income.view.lkml" +include: "top_10_employee_income_source.view.lkml" +include: "employee_tax_report.view.lkml" + +explore: activity_logs { +} + +explore: employee_income_source { +} + +explore: employee_total_income { +} + +explore: top_10_employee_income_source { +} + +explore: employee_tax_report { +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source.view.lkml new file mode 100644 index 0000000000000..f4a443ab11537 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_income_source.view.lkml @@ -0,0 +1,40 @@ +view: employee_income_source { + derived_table: { + sql: SELECT + employee_id, + employee_name, + {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %} + prod_core.data.r_metric_summary_v2 + {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %} + prod_core.data.r_metric_summary_v3 + {% else %} + 'default_table' as source + {% endif %}, + employee_income + FROM source_table + WHERE + {% condition source_region %} source_table.region {% endcondition %} + ;; + } + + dimension: id { + type: number + sql: ${TABLE}.employee_id;; + } + + dimension: name { + type: string + sql: ${TABLE}.employee_name;; + } + + dimension: source { + type: string + sql: ${TABLE}.source ;; + } + + dimension: income { + type: number + sql: ${TABLE}.employee_income ;; + } + +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_tax_report.view.lkml 
b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_tax_report.view.lkml new file mode 100644 index 0000000000000..6608921e6f095 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_tax_report.view.lkml @@ -0,0 +1,18 @@ +view: employee_tax_report { + sql_table_name: data-warehouse.finance.form-16;; + + dimension: id { + type: number + sql: ${TABLE}.id;; + } + + dimension: name { + type: string + sql: ${TABLE}.name;; + } + + measure: taxable_income { + type: sum + sql: ${TABLE}.tax;; + } +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_total_income.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_total_income.view.lkml new file mode 100644 index 0000000000000..18a1ab660b3a1 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_total_income.view.lkml @@ -0,0 +1,18 @@ +view: employee_total_income { + sql_table_name: ${employee_income_source.SQL_TABLE_NAME} ;; + + dimension: id { + type: number + sql: ${TABLE}.id;; + } + + dimension: name { + type: string + sql: ${TABLE}.name;; + } + + measure: total_income { + type: sum + sql: ${TABLE}.income;; + } +} diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/top_10_employee_income_source.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/top_10_employee_income_source.view.lkml new file mode 100644 index 0000000000000..5371ed5e3ca56 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/top_10_employee_income_source.view.lkml @@ -0,0 +1,26 @@ +view: top_10_employee_income_source { + derived_table: { + sql: SELECT id, + name, + source + FROM ${employee_income_source.SQL_TABLE_NAME} + ORDER BY source desc + LIMIT 10 + ;; + } + + dimension: id { + type: number + sql: ${TABLE}.id ;; + } + + dimension: name { + type: string + sql: ${TABLE}.name ;; + } + + dimension: source { + type: string + sql: ${TABLE}.source ;; + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json new file mode 100644 index 0000000000000..75cd50c5c6059 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json @@ -0,0 +1,1335 @@ +[ +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "containerProperties", + "aspect": { + "json": { + "customProperties": { + "platform": "looker", + "env": "PROD", + "project_name": "lkml_samples" + }, + "name": "lkml_samples" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:looker" + } 
+ }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "LookML Project" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Folders" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: activity_logs {\n sql_table_name:\n {% if _user_attributes['looker_env'] == 'dev' %}\n {{ _user_attributes['dev_database_prefix'] }}analytics.{{ _user_attributes['dev_schema_prefix'] }}staging_app.stg_app__activity_logs\n {% elsif _user_attributes['looker_env'] == 'prod' %}\n analytics.staging_app.stg_app__activity_logs\n {% else %}\n analytics.staging_app.stg_app__activity_logs\n {% endif %}\n ;;\n\n dimension: generated_message_id {\n group_label: \"IDs\"\n primary_key: yes\n type: number\n sql: ${TABLE}.\"GENERATED_MESSAGE_ID\" ;;\n }\n}\n", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.activity_logs,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,employeeanalytics.publicstaging_app.stg_app__activity_logs,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,employeeanalytics.publicstaging_app.stg_app__activity_logs,PROD),generated_message_id)" + ], + 
"downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.activity_logs,PROD),generated_message_id)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "activity_logs", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "generated_message_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": true + } + ], + "primaryKeys": [ + "generated_message_id" + ] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "activity_logs.view.lkml", + "looker.model": "data" + }, + "name": "activity_logs", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.activity_logs,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT\n employee_id,\n employee_name,\n \n prod_core.data.r_metric_summary_v2\n ,\n employee_income\n FROM source_table\n WHERE\n source_table.region='ap-south-1'", + "viewLanguage": "sql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),employee_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),employee_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),source)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),source)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),employee_income)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),income)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "employee_income_source", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "source", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "income", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": 
"number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "employee_income_source.view.lkml", + "looker.model": "data" + }, + "name": "employee_income_source", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: employee_total_income {\n sql_table_name: ${employee_income_source.SQL_TABLE_NAME} ;;\n\n dimension: id {\n type: number\n sql: ${TABLE}.id;;\n }\n\n dimension: name {\n type: string\n sql: ${TABLE}.name;;\n }\n\n measure: total_income {\n type: sum\n sql: ${TABLE}.income;;\n }\n}\n", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),income)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD),total_income)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "employee_total_income", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "total_income", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "sum", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Measure" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "employee_total_income.view.lkml", + "looker.model": "data" + }, + "name": "employee_total_income", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_total_income,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + 
"systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT id,\n name,\n source\n FROM ${employee_income_source.SQL_TABLE_NAME}\n ORDER BY source desc\n LIMIT 10", + "viewLanguage": "sql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_income_source,PROD),source)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD),source)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "top_10_employee_income_source", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "source", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "top_10_employee_income_source.view.lkml", + "looker.model": "data" + }, + "name": "top_10_employee_income_source", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.top_10_employee_income_source,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "view: employee_tax_report {\n sql_table_name: data-warehouse.finance.form-16;;\n\n dimension: id {\n type: number\n sql: ${TABLE}.id;;\n }\n\n dimension: name {\n type: string\n sql: ${TABLE}.name;;\n }\n\n measure: taxable_income {\n type: sum\n sql: ${TABLE}.tax;;\n }\n}\n", + "viewLanguage": "lookml" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD)", + "aspects": [ + { + 
"com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,data-warehouse.finance.form-16,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,data-warehouse.finance.form-16,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,data-warehouse.finance.form-16,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,data-warehouse.finance.form-16,PROD),tax)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD),taxable_income)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "employee_tax_report", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "taxable_income", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "sum", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Measure" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "employee_tax_report.view.lkml", + "looker.model": "data" + }, + "name": "employee_tax_report", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_tax_report,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Measure" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file