diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index 52d2e4a8f56e3b..a1fab13fd4bbb7 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -417,15 +417,35 @@ def _field_from_complex_type( inner_field_path, ) elif datahub_field_type == ArrayTypeClass: - field_path = field_path.expand_type("array", schema) - # default items schema is string + field_path = field_path.expand_type(discriminated_type, schema) + yield SchemaField( + fieldPath=field_path.as_string(), + type=type_override or SchemaFieldDataTypeClass(type=ArrayTypeClass()), + nativeDataType=native_type_override + or JsonSchemaTranslator._get_discriminated_type_from_schema(schema), + description=JsonSchemaTranslator._get_description_from_any_schema( + schema + ), + nullable=nullable, + jsonProps=JsonSchemaTranslator._get_jsonprops_for_any_schema( + schema, required=required + ), + isPartOfKey=field_path.is_key_schema, + ) + items_schema = schema.get("items", {"type": "string"}) items_type = JsonSchemaTranslator._get_type_from_schema(items_schema) - field_path._set_parent_type_if_not_exists( - DataHubType(type=ArrayTypeClass, nested_type=items_type) + field_name = items_schema.get("title", None) + if not field_name: + field_name = items_type + inner_field_path = field_path.clone_plus( + FieldElement(type=[], name=field_name, schema_types=[]) ) yield from JsonSchemaTranslator.get_fields( - items_type, items_schema, required=False, base_field_path=field_path + items_type, + items_schema, + required=False, + base_field_path=inner_field_path, ) elif datahub_field_type == MapTypeClass: diff --git a/metadata-ingestion/tests/unit/schema/test_json_schema_util.py b/metadata-ingestion/tests/unit/schema/test_json_schema_util.py index 5e095fc0df8dce..34ccc3d4fb9225 100644 --- a/metadata-ingestion/tests/unit/schema/test_json_schema_util.py +++ b/metadata-ingestion/tests/unit/schema/test_json_schema_util.py @@ -153,15 +153,20 @@ def test_json_schema_with_recursion(): }, } fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) + expected_field_paths = [ { "path": "[version=2.0].[type=TreeNode].[type=integer].value", "type": NumberTypeClass, }, { - "path": "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode].children", + "path": "[version=2.0].[type=TreeNode].[type=array].children", "type": ArrayTypeClass, }, + { + "path": "[version=2.0].[type=TreeNode].[type=array].children.[type=TreeNode].TreeNode", + "type": RecordTypeClass, + }, ] assert_field_paths_match(fields, expected_field_paths) assert_fields_are_valid(fields) @@ -372,8 +377,10 @@ def test_nested_arrays(): fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) expected_field_paths: List[str] = [ - "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar", - "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=integer].a", + "[version=2.0].[type=NestedArray].[type=array].ar", + "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array", + "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo", + "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo.[type=integer].a", ] assert_field_paths_match(fields, expected_field_paths) assert isinstance(fields[0].type.type, ArrayTypeClass) @@ -496,14 +503,17 @@ def test_needs_disambiguation_nested_union_of_records_with_same_field_name(): }, } fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) + expected_field_paths: List[str] = [ "[version=2.0].[type=ABFooUnion].[type=union].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=integer].f", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo", + "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=integer].f", ] assert_field_paths_match(fields, expected_field_paths) @@ -578,8 +588,10 @@ def test_key_schema_handling(): "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=number].f", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=number].f", ] assert_field_paths_match(fields, expected_field_paths) for f in fields: @@ -664,7 +676,8 @@ def test_simple_array(): fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) expected_field_paths: List[str] = [ - "[version=2.0].[type=ObjectWithArray].[type=array].[type=string].ar", + "[version=2.0].[type=ObjectWithArray].[type=array].ar", + "[version=2.0].[type=ObjectWithArray].[type=array].ar.[type=string].string", ] assert_field_paths_match(fields, expected_field_paths) assert isinstance(fields[0].type.type, ArrayTypeClass) @@ -846,3 +859,31 @@ def test_top_level_trival_allof(): assert json.loads(fields[1].jsonProps or "{}")["required"] is False assert json.loads(fields[2].jsonProps or "{}")["required"] is True assert json.loads(fields[3].jsonProps or "{}")["required"] is False + + +def test_description_extraction(): + schema = { + "$id": "test", + "$schema": "http://json-schema.org/draft-07/schema#", + "properties": { + "bar": { + "type": "array", + "items": {"type": "string"}, + "description": "XYZ", + } + }, + } + fields = list(JsonSchemaTranslator.get_fields_from_schema(schema)) + expected_field_paths: List[str] = [ + "[version=2.0].[type=object].[type=array].bar", + "[version=2.0].[type=object].[type=array].bar.[type=string].string", + ] + assert_field_paths_match(fields, expected_field_paths) + assert_fields_are_valid(fields) + # Additional check for the description extraction + array_field = next( + field + for field in fields + if field.fieldPath == "[version=2.0].[type=object].[type=array].bar" + ) + assert array_field.description == "XYZ"