Skip to content

Commit

Permalink
feat(doc): Generating json schema for ingestion recipes (datahub-proj…
Browse files Browse the repository at this point in the history
  • Loading branch information
treff7es authored and maggiehays committed Aug 1, 2022
1 parent c0ce5a2 commit fc7fc8a
Show file tree
Hide file tree
Showing 5 changed files with 247 additions and 1 deletion.
3 changes: 3 additions & 0 deletions docs-website/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Generated GraphQL
/graphql/combined.graphql

# Generated JSON Schema
/static/schemas/datahub_ingestion_schema.json

# Dependencies
/node_modules

Expand Down
7 changes: 6 additions & 1 deletion docs-website/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ task generateGraphQLSchema(type: Exec) {
commandLine './generateGraphQLSchema.sh'
}

// Generates the ingestion-recipe JSON schema for the docs site by running
// genJsonSchema/generateJsonSchema.sh; depends on :metadata-ingestion:docGen,
// which produces the per-source config schemas the script merges.
task generateJsonSchema(type: Exec, dependsOn: [':metadata-ingestion:docGen']) {
workingDir "$projectDir/genJsonSchema"
commandLine './generateJsonSchema.sh'
}

/*
Wrappers around Yarn Tasks.
*/
Expand All @@ -67,7 +72,7 @@ task generateGraphQLDocumentation(type: YarnTask, dependsOn: [yarnInstall, gener
args = ['docusaurus', 'docs:generate:graphql']
}

task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLDocumentation, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen'] ) {
task yarnGenerate(type: YarnTask, dependsOn: [yarnInstall, generateGraphQLDocumentation, generateJsonSchema, ':metadata-ingestion:modelDocGen', ':metadata-ingestion:docGen'] ) {
inputs.files(projectMdFiles)
outputs.cacheIf { true }
args = ['run', 'generate']
Expand Down
229 changes: 229 additions & 0 deletions docs-website/genJsonSchema/gen_json_schema.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import glob
import json
import os
import sys
from typing import Any, Dict, List


def get_base() -> Any:
    """Assemble the root JSON schema (draft-04) for a DataHub ingestion recipe.

    The skeleton declares the four built-in sinks (console, file,
    datahub-rest, datahub-kafka) under ``definitions`` and leaves
    ``properties.source.anyOf`` empty; the calling script appends one
    ``$ref`` per generated source config schema.

    Returns:
        The schema skeleton as a plain (JSON-serializable) dict.
    """
    # --- sink definitions -------------------------------------------------
    console_sink = {
        "type": "object",
        "properties": {
            "type": {
                "type": "string",
                "enum": ["console"],
            }
        },
        "required": ["type"],
    }

    file_sink = {
        "type": "object",
        "properties": {
            "type": {"type": "string", "enum": ["file"]},
            "config": {"$ref": "#/definitions/file_sink_config"},
        },
        "required": ["type", "config"],
    }

    file_sink_config = {
        "type": "object",
        "properties": {
            "filename": {
                "description": "Path to file to write to.",
                "type": "string",
            }
        },
        "required": ["filename"],
        "additionalProperties": False,
    }

    datahub_rest_sink = {
        "type": "object",
        "properties": {
            "type": {"type": "string", "enum": ["datahub-rest"]},
            "config": {"$ref": "#/definitions/datahub_rest_sink_config"},
        },
        "required": ["type", "config"],
        "additionalProperties": False,
    }

    datahub_rest_sink_config = {
        "type": "object",
        "properties": {
            "ca_certificate_path": {
                "type": "string",
                "description": "Path to CA certificate for HTTPS communications.",
            },
            "max_threads": {
                "type": "number",
                "description": "Experimental: Max parallelism for REST API calls",
                "default": 1,
            },
            "retry_status_codes": {
                "type": "array",
                "items": {"type": "number"},
                "description": "Retry HTTP request also on these status codes",
                "default": [429, 502, 503, 504],
            },
            "server": {
                "type": "string",
                "description": "URL of DataHub GMS endpoint.",
            },
            "timeout_sec": {
                "type": "number",
                "description": "Per-HTTP request timeout.",
                "default": 30,
            },
            "token": {
                "type": "string",
                "description": "Bearer token used for authentication.",
            },
            "extra_headers": {
                "type": "string",
                "description": "Extra headers which will be added to the request.",
            },
        },
        "required": ["server"],
        "additionalProperties": False,
    }

    datahub_kafka_sink = {
        "type": "object",
        "properties": {
            "type": {"type": "string", "enum": ["datahub-kafka"]},
            "config": {"$ref": "#/definitions/datahub_kafka_sink_config"},
        },
        "required": ["type", "config"],
        "additionalProperties": False,
    }

    kafka_connection = {
        "type": "object",
        "properties": {
            "bootstrap": {
                "type": "string",
                "description": "Kafka bootstrap URL.",
            },
            "producer_config": {
                "type": "object",
                "description": "Passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.SerializingProducer",
            },
            "schema_registry_url": {
                "type": "string",
                "description": "URL of schema registry being used.",
            },
            "schema_registry_config": {
                "type": "object",
                "description": "Passed to https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#confluent_kafka.schema_registry.SchemaRegistryClient",
            },
        },
        "additionalProperties": False,
        "required": ["bootstrap", "schema_registry_url"],
    }

    kafka_topic_routes = {
        "type": "object",
        "properties": {
            "MetadataChangeEvent": {
                "type": "string",
                "description": "Overridden Kafka topic name for the MetadataChangeEvent",
                "default": "MetadataChangeEvent",
            },
            "MetadataChangeProposal": {
                "type": "string",
                "description": "Overridden Kafka topic name for the MetadataChangeProposal",
                "default": "MetadataChangeProposal",
            },
        },
        "additionalProperties": False,
    }

    datahub_kafka_sink_config = {
        "type": "object",
        "properties": {
            "connection": kafka_connection,
            "topic_routes": kafka_topic_routes,
        },
        "required": ["connection"],
        "additionalProperties": False,
    }

    # --- recipe-level sections -------------------------------------------
    transformer_item = {
        "type": "object",
        "description": "Transformer configs see at https://datahubproject.io/docs/metadata-ingestion/transformers",
        "properties": {
            "type": {"type": "string", "description": "Transformer type"},
            "config": {
                "type": "object",
                "description": "Transformer config",
            },
        },
        "required": ["type"],
        "additionalProperties": False,
    }

    sink_section = {
        "description": "sink",
        "anyOf": [
            {"$ref": "#/definitions/datahub_kafka_sink"},
            {"$ref": "#/definitions/datahub_rest_sink"},
            {"$ref": "#/definitions/console_sink"},
            {"$ref": "#/definitions/file_sink"},
        ],
    }

    # Compose in the same key order as before: insertion order is preserved
    # by dict and therefore by the JSON file the caller eventually dumps.
    return {
        "$schema": "http://json-schema.org/draft-04/schema#",
        "id": "https://json.schemastore.org/datahub-ingestion",
        "title": "Datahub Ingestion",
        "description": "Root schema of Datahub Ingestion",
        "definitions": {
            "console_sink": console_sink,
            "file_sink": file_sink,
            "file_sink_config": file_sink_config,
            "datahub_rest_sink": datahub_rest_sink,
            "datahub_rest_sink_config": datahub_rest_sink_config,
            "datahub_kafka_sink": datahub_kafka_sink,
            "datahub_kafka_sink_config": datahub_kafka_sink_config,
        },
        "type": "object",
        "properties": {
            "source": {"anyOf": []},
            "transformers": {"type": "array", "items": transformer_item},
            "sink": sink_section,
        },
        "required": ["source", "sink"],
    }


def main() -> None:
    """Merge the generated per-source config schemas into one recipe schema.

    Reads every file in ``sys.argv[1]``, wraps each source's config schema in
    a ``{"type": ..., "config": ...}`` object, hoists nested ``definitions``
    to the top level, and writes the combined schema to ``sys.argv[2]``.

    Exits with status 1 on a usage error so the calling build fails.
    """
    if len(sys.argv) != 3:
        print(
            """\
Usage:
    gen_json_schema.py config_schema_dir output_file
"""
        )
        # A usage error must not report success to the calling build
        # (the original exited 0 here).
        sys.exit(1)

    config_schemas_dir: str = sys.argv[1]
    output_file: str = sys.argv[2]

    configs: Dict[str, Any] = {}
    definitions: Dict[str, Any] = {}
    refs: List[Dict] = []

    for jfile in glob.glob(f"{config_schemas_dir}/*"):
        # "<source>_config.json" -> "<source>"  (everything before the first "_")
        config_name: str = os.path.splitext(os.path.basename(jfile))[0].split("_")[0]
        print(f"ConfigName: {config_name}")

        # Close the handle deterministically (the original leaked it).
        with open(jfile) as f:
            data = json.load(f)

        # Wrapper object so a recipe's source is {"type": "<name>", "config": {...}}.
        configs[config_name] = {
            "type": "object",
            "properties": {
                "type": {"type": "string", "enum": [config_name]},
                "config": {"$ref": f"#/definitions/{config_name}_config"},
            },
            "required": ["type", "config"],
        }

        # Hoist nested definitions to the top level so internal $refs resolve
        # once everything is merged into a single document.
        if "definitions" in data:
            definitions.update(data["definitions"])
            data.pop("definitions", None)

        configs[f"{config_name}_config"] = data
        refs.append({"$ref": f"#/definitions/{config_name}"})

    base = get_base()
    base["definitions"].update(configs)
    base["definitions"].update(definitions)

    # Each discovered source becomes an allowed alternative for "source".
    base["properties"]["source"]["anyOf"] = base["properties"]["source"]["anyOf"] + refs

    with open(output_file, "w") as outfile:
        json.dump(base, outfile, indent=4)


if __name__ == "__main__":
    main()
9 changes: 9 additions & 0 deletions docs-website/genJsonSchema/generateJsonSchema.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/sh
# Regenerate the DataHub ingestion-recipe JSON schema consumed by the docs site.
# Fail the build if schema generation fails (original always exited 0).
set -e

SCHEMA_FILE_NAME=datahub_ingestion_schema.json
SCHEMA_ROOT_DIR=../static/schemas

# The schemas directory is generated (gitignored), so it may not exist on a
# clean checkout; create it, then drop any stale schema. rm -f replaces the
# racy test-then-remove and quoting guards against paths with spaces.
mkdir -p "${SCHEMA_ROOT_DIR}"
rm -f "${SCHEMA_ROOT_DIR}/${SCHEMA_FILE_NAME}"

echo "Generating Json schema..."
python gen_json_schema.py ../../docs/generated/ingestion/config_schemas/ "${SCHEMA_ROOT_DIR}/${SCHEMA_FILE_NAME}"
Empty file.

0 comments on commit fc7fc8a

Please sign in to comment.