From def047d0bf31887fb3336b25bd0d01f702a21230 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 11 Jun 2024 02:24:21 -0700 Subject: [PATCH 1/2] docs(ingestion): Rename csv / s3 / file source and sink --- .gitignore | 3 ++ docs-website/build.gradle | 4 +++ docs/cli.md | 16 +++++----- docs/troubleshooting/quickstart.md | 2 +- metadata-ingestion/cli-ingestion.md | 2 +- .../csv-enricher_recipe.yml | 0 .../metadata-file_recipe.yml} | 0 metadata-ingestion/docs/sources/s3/README.md | 30 ++++++++++--------- .../docs/sources/s3/s3_recipe.yml | 11 +++++-- metadata-ingestion/setup.py | 3 +- .../sink_docs/{file.md => metadata-file.md} | 5 ++-- metadata-ingestion/sink_overview.md | 2 +- .../datahub/ingestion/source/csv_enricher.py | 9 ++++-- .../src/datahub/ingestion/source/file.py | 21 +++++++++---- .../src/datahub/ingestion/source/s3/source.py | 2 +- metadata-integration/java/as-a-library.md | 4 +-- 16 files changed, 71 insertions(+), 43 deletions(-) rename metadata-ingestion/docs/sources/{csv => csv-enricher}/csv-enricher_recipe.yml (100%) rename metadata-ingestion/docs/sources/{file/file_recipe.yml => metadata-file/metadata-file_recipe.yml} (100%) rename metadata-ingestion/sink_docs/{file.md => metadata-file.md} (93%) diff --git a/.gitignore b/.gitignore index 1fcca8751131fc..fdb9ed91da4328 100644 --- a/.gitignore +++ b/.gitignore @@ -126,3 +126,6 @@ metadata-service/war/bin/ metadata-utils/bin/ test-models/bin/ +datahub-executor/ +datahub-integrations-service/ +metadata-ingestion-modules/ diff --git a/docs-website/build.gradle b/docs-website/build.gradle index f3bedd2516319a..798047a562ffd2 100644 --- a/docs-website/build.gradle +++ b/docs-website/build.gradle @@ -148,8 +148,12 @@ clean { delete 'tmp' delete 'build' delete 'just' + delete 'sphinx/venv' + delete 'sphinx/_build' + delete 'versioned_docs' delete fileTree(dir: 'genDocs', exclude: '.gitignore') delete fileTree(dir: 'docs', exclude: '.gitignore') + delete fileTree(dir: 'genStatic', exclude: '.gitignore') delete 'graphql/combined.graphql' yarnClear } diff --git a/docs/cli.md b/docs/cli.md index 411cb2d1ab77f0..32036a11dfb3d0 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -655,8 +655,8 @@ We use a plugin architecture so that you can install only the dependencies you a Please see our [Integrations page](https://datahubproject.io/integrations) if you want to filter on the features offered by each source. 
| Plugin Name | Install Command | Provides | -| ---------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | --------------------------------------- | -| [file](./generated/ingestion/sources/file.md) | _included by default_ | File source and sink | +|------------------------------------------------------------------------------------------------| ---------------------------------------------------------- | --------------------------------------- | +| [metadata-file](./generated/ingestion/sources/metadata-file.md) | _included by default_ | File source and sink | | [athena](./generated/ingestion/sources/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source | | [bigquery](./generated/ingestion/sources/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source | | [datahub-lineage-file](./generated/ingestion/sources/file-based-lineage.md) | _no additional dependencies_ | Lineage File source | @@ -696,12 +696,12 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo ### Sinks -| Plugin Name | Install Command | Provides | -| ----------------------------------------------------------- | -------------------------------------------- | -------------------------- | -| [file](../metadata-ingestion/sink_docs/file.md) | _included by default_ | File source and sink | -| [console](../metadata-ingestion/sink_docs/console.md) | _included by default_ | Console sink | -| [datahub-rest](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API | -| [datahub-kafka](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka | +| Plugin Name | Install Command | Provides | +|-------------------------------------------------------------------| -------------------------------------------- | -------------------------- | +| [metadata-file](../metadata-ingestion/sink_docs/metadata-file.md) | _included by default_ | File source and sink | +| [console](../metadata-ingestion/sink_docs/console.md) | _included by default_ | Console sink | +| [datahub-rest](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API | +| [datahub-kafka](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka | These plugins can be mixed and matched as desired. For example: diff --git a/docs/troubleshooting/quickstart.md b/docs/troubleshooting/quickstart.md index 9da5aa443069e1..cafc1e30c50520 100644 --- a/docs/troubleshooting/quickstart.md +++ b/docs/troubleshooting/quickstart.md @@ -246,7 +246,7 @@ ALTER TABLE metadata_aspect_v2 CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_ ## I've modified the default user.props file to include a custom username and password, but I don't see the new user(s) inside the Users & Groups tab. Why not? Currently, `user.props` is a file used by the JAAS PropertyFileLoginModule solely for the purpose of **Authentication**. The file is not used as an source from which to -ingest additional metadata about the user. For that, you'll need to ingest some custom information about your new user using the Rest.li APIs or the [File-based ingestion source](../generated/ingestion/sources/file.md). +ingest additional metadata about the user. 
For that, you'll need to ingest some custom information about your new user using the Rest.li APIs or the [Metadata File ingestion source](../generated/ingestion/sources/metadata-file.md). For an example of a file that ingests user information, check out [single_mce.json](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/mce_files/single_mce.json), which ingests a single user object into DataHub. Notice that the "urn" field provided will need to align with the custom username you've provided in user.props file. For example, if your user.props file contains: diff --git a/metadata-ingestion/cli-ingestion.md b/metadata-ingestion/cli-ingestion.md index b15dd2a5019959..64a2b370dd93cf 100644 --- a/metadata-ingestion/cli-ingestion.md +++ b/metadata-ingestion/cli-ingestion.md @@ -58,7 +58,7 @@ Please refer the following pages for advanced guids on CLI ingestion. - [Reference for `datahub ingest` command](../docs/cli.md#ingest) - [UI Ingestion Guide](../docs/ui-ingestion.md) -:::Tip Compatibility +:::tip Compatibility DataHub server uses a 3 digit versioning scheme, while the CLI uses a 4 digit scheme. For example, if you're using DataHub server version 0.10.0, you should use CLI version 0.10.0.x, where x is a patch version. We do this because we do CLI releases at a much higher frequency than server releases, usually every few days vs twice a month. diff --git a/metadata-ingestion/docs/sources/csv/csv-enricher_recipe.yml b/metadata-ingestion/docs/sources/csv-enricher/csv-enricher_recipe.yml similarity index 100% rename from metadata-ingestion/docs/sources/csv/csv-enricher_recipe.yml rename to metadata-ingestion/docs/sources/csv-enricher/csv-enricher_recipe.yml diff --git a/metadata-ingestion/docs/sources/file/file_recipe.yml b/metadata-ingestion/docs/sources/metadata-file/metadata-file_recipe.yml similarity index 100% rename from metadata-ingestion/docs/sources/file/file_recipe.yml rename to metadata-ingestion/docs/sources/metadata-file/metadata-file_recipe.yml diff --git a/metadata-ingestion/docs/sources/s3/README.md b/metadata-ingestion/docs/sources/s3/README.md index 7944f78280a428..b0d354a9b3c2ac 100644 --- a/metadata-ingestion/docs/sources/s3/README.md +++ b/metadata-ingestion/docs/sources/s3/README.md @@ -1,19 +1,11 @@ -This connector ingests S3 datasets into DataHub. It allows mapping an individual file or a folder of files to a dataset in DataHub. +This connector ingests AWS S3 datasets into DataHub. It allows mapping an individual file or a folder of files to a dataset in DataHub. To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details. 
-### Concept Mapping - -This ingestion source maps the following Source System Concepts to DataHub Concepts: - -| Source Concept | DataHub Concept | Notes | -| ---------------------------------------- |--------------------------------------------------------------------------------------------| ------------------- | -| `"s3"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/) | | -| s3 object / Folder containing s3 objects | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | | -| s3 bucket | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `S3 bucket` | -| s3 folder | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `Folder` | +:::tip +This connector can also be used to ingest local files. +Just replace `s3://` in your path_specs with an absolute path to files on the machine running ingestion. +::: -This connector supports both local files as well as those stored on AWS S3 (which must be identified using the prefix `s3://`). -[a] ### Supported file types Supported file types are as follows: @@ -30,6 +22,16 @@ Schemas for schemaless formats (CSV, TSV, JSONL, JSON) are inferred. For CSV, TS JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance. We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object. +### Concept Mapping + +This ingestion source maps the following Source System Concepts to DataHub Concepts: + +| Source Concept | DataHub Concept | Notes | +| ---------------------------------------- |--------------------------------------------------------------------------------------------| ------------------- | +| `"s3"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/) | | +| s3 object / Folder containing s3 objects | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | | +| s3 bucket | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `S3 bucket` | +| s3 folder | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `Folder` | ### Profiling @@ -42,4 +44,4 @@ This plugin extracts: - histograms or frequencies of unique values Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details). If profiling, make sure that permissions for **s3a://** access are set because Spark and Hadoop use the s3a:// protocol to interface with AWS (schema inference outside of profiling requires s3:// access). -Enabling profiling will slow down ingestion runs. \ No newline at end of file +Enabling profiling will slow down ingestion runs. 
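As a rough illustration of the profiling behavior the README above describes, the sketch below shows what an S3 recipe with profiling turned on might look like. It only reuses configuration keys that already appear elsewhere in this patch (`path_specs.include`, `aws_config`, `profiling.enabled`); the bucket path and credentials are placeholders, and Spark 3.0.3 / Hadoop 3.2 plus s3a:// permissions are assumed to be set up separately, as the README notes.

```yaml
# Hypothetical sketch: S3 ingestion with profiling enabled.
# Requires a local Spark 3.0.3 / Hadoop 3.2 installation and s3a:// access for the profiler.
source:
  type: s3
  config:
    path_specs:
      - include: "s3://example-bucket/tables/*/*.parquet"  # placeholder path, not from this patch
    aws_config:
      aws_access_key_id: "*****"
      aws_secret_access_key: "*****"
    profiling:
      enabled: true  # slows down ingestion runs, per the note above
```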
diff --git a/metadata-ingestion/docs/sources/s3/s3_recipe.yml b/metadata-ingestion/docs/sources/s3/s3_recipe.yml index 693b9528373ab2..301e811b769260 100644 --- a/metadata-ingestion/docs/sources/s3/s3_recipe.yml +++ b/metadata-ingestion/docs/sources/s3/s3_recipe.yml @@ -1,9 +1,9 @@ +# Ingest data from S3 source: type: s3 config: path_specs: - - - include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*" + - include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*" aws_config: aws_access_key_id: ***** @@ -13,4 +13,9 @@ source: profiling: enabled: false -# sink configs +# Ingest data from local filesystem +source: + type: s3 + config: + path_specs: + - include: "/absolute/path/*.csv" diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 4702c9d540ec0e..07a60c913b0efd 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -259,7 +259,8 @@ delta_lake = { *s3_base, - "deltalake>=0.6.3, != 0.6.4", + "deltalake>=0.6.3, != 0.6.4, < 0.18.0; platform_system == 'Darwin' and platform_machine == 'arm64'", + "deltalake>=0.6.3, != 0.6.4; platform_system != 'Darwin' or platform_machine != 'arm64'", } powerbi_report_server = {"requests", "requests_ntlm"} diff --git a/metadata-ingestion/sink_docs/file.md b/metadata-ingestion/sink_docs/metadata-file.md similarity index 93% rename from metadata-ingestion/sink_docs/file.md rename to metadata-ingestion/sink_docs/metadata-file.md index 2991afacbd93d2..7cac8d55422438 100644 --- a/metadata-ingestion/sink_docs/file.md +++ b/metadata-ingestion/sink_docs/metadata-file.md @@ -1,4 +1,4 @@ -# File +# Metadata File For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md). @@ -10,7 +10,7 @@ Works with `acryl-datahub` out of the box. Outputs metadata to a file. This can be used to decouple metadata sourcing from the process of pushing it into DataHub, and is particularly useful for debugging purposes. -Note that the [file source](../../docs/generated/ingestion/sources/file.md) can read files generated by this sink. +Note that the [file source](../../docs/generated/ingestion/sources/metadata-file.md) can read files generated by this sink. ## Quickstart recipe @@ -35,4 +35,3 @@ Note that a `.` is used to denote nested fields in the YAML recipe. | Field | Required | Default | Description | | -------- | -------- | ------- | ------------------------- | | filename | ✅ | | Path to file to write to. 
| - diff --git a/metadata-ingestion/sink_overview.md b/metadata-ingestion/sink_overview.md index c71ba1f97932cf..95f18a0a6cb944 100644 --- a/metadata-ingestion/sink_overview.md +++ b/metadata-ingestion/sink_overview.md @@ -25,7 +25,7 @@ When configuring ingestion for DataHub, you're likely to be sending the metadata For debugging purposes or troubleshooting, the following sinks can be useful: -- [File](sink_docs/file.md) +- [Metadata File](sink_docs/metadata-file.md) - [Console](sink_docs/console.md) ## Default Sink diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index d998c37d32ed2a..feee89ba579837 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -93,13 +93,18 @@ class CSVEnricherReport(SourceReport): num_domain_workunits_produced: int = 0 -@platform_name("CSV") +@platform_name("CSV Enricher") @config_class(CSVEnricherConfig) @support_status(SupportStatus.INCUBATING) class CSVEnricherSource(Source): """ + :::tip Looking to ingest a CSV data file into DataHub, as an asset? + Use the [Local File](./s3.md) ingestion source. + The CSV enricher is used for enriching entities already ingested into DataHub. + ::: + This plugin is used to bulk upload metadata to Datahub. - It will apply glossary terms, tags, decription, owners and domain at the entity level. It can also be used to apply tags, + It will apply glossary terms, tags, description, owners and domain at the entity level. It can also be used to apply tags, glossary terms, and documentation at the column level. These values are read from a CSV file. You have the option to either overwrite or append existing values. diff --git a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index 49cc314426eb55..3e8c88b725de50 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -56,11 +56,17 @@ class FileSourceConfig(ConfigModel): message="filename is deprecated. Use path instead.", ) path: str = Field( - description="File path to folder or file to ingest, or URL to a remote file. If pointed to a folder, all files with extension {file_extension} (default json) within that folder will be processed." + description=( + "File path to folder or file to ingest, or URL to a remote file. " + "If pointed to a folder, all files with extension {file_extension} (default json) within that folder will be processed." + ) ) file_extension: str = Field( ".json", - description="When providing a folder to use to read files, set this field to control file extensions that you want the source to process. * is a special value that means process every file regardless of extension", + description=( + "When providing a folder to use to read files, set this field to control file extensions that you want the source to process. " + "* is a special value that means process every file regardless of extension" + ), ) read_mode: FileReadMode = FileReadMode.AUTO aspect: Optional[str] = Field( @@ -69,7 +75,10 @@ class FileSourceConfig(ConfigModel): ) count_all_before_starting: bool = Field( default=True, - description="When enabled, counts total number of records in the file before starting. Used for accurate estimation of completion time. 
Turn it off if startup time is too high.",
+        description=(
+            "When enabled, counts total number of records in the file before starting. "
+            "Used for accurate estimation of completion time. Turn it off if startup time is too high."
+        ),
     )
 
     _minsize_for_streaming_mode_in_bytes: int = (
@@ -163,12 +172,14 @@ def compute_stats(self) -> None:
             self.percentage_completion = f"{percentage_completion:.2f}%"
 
 
-@platform_name("File")
+@platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 class GenericFileSource(TestableSource):
     """
-    This plugin pulls metadata from a previously generated file. The [file sink](../../../../metadata-ingestion/sink_docs/file.md) can produce such files, and a number of samples are included in the [examples/mce_files](../../../../metadata-ingestion/examples/mce_files) directory.
+    This plugin pulls metadata from a previously generated file.
+    The [metadata file sink](../../../../metadata-ingestion/sink_docs/metadata-file.md) can produce such files, and a number of
+    samples are included in the [examples/mce_files](../../../../metadata-ingestion/examples/mce_files) directory.
     """
 
     def __init__(self, ctx: PipelineContext, config: FileSourceConfig):
diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
index 8bc075f720cc55..c35f500df1b8c7 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py
@@ -217,7 +217,7 @@ class TableData:
     number_of_files: int
 
 
-@platform_name("S3 Data Lake", id="s3")
+@platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
diff --git a/metadata-integration/java/as-a-library.md b/metadata-integration/java/as-a-library.md
index 59ee6595132fa1..e38cfef080f5f1 100644
--- a/metadata-integration/java/as-a-library.md
+++ b/metadata-integration/java/as-a-library.md
@@ -169,7 +169,7 @@ If you're interested in looking at the Kafka emitter code, it is available [here
 
 ## File Emitter
 
-The File emitter writes metadata change proposal events (MCPs) into a JSON file that can be later handed off to the Python [File source](docs/generated/ingestion/sources/file.md) for ingestion. This works analogous to the [File sink](../../metadata-ingestion/sink_docs/file.md) in Python. This mechanism can be used when the system producing metadata events doesn't have direct connection to DataHub's REST server or Kafka brokers. The generated JSON file can be transferred later and then ingested into DataHub using the [File source](docs/generated/ingestion/sources/file.md).
+The File emitter writes metadata change proposal events (MCPs) into a JSON file that can later be handed off to the Python [Metadata File source](docs/generated/ingestion/sources/metadata-file.md) for ingestion. This works analogously to the [Metadata File sink](../../metadata-ingestion/sink_docs/metadata-file.md) in Python. This mechanism can be used when the system producing metadata events doesn't have a direct connection to DataHub's REST server or Kafka brokers. The generated JSON file can be transferred later and then ingested into DataHub using the [Metadata File source](docs/generated/ingestion/sources/metadata-file.md).
 
 ### Usage
 
@@ -223,5 +223,3 @@ The File emitter only supports writing to the local filesystem currently. 
If you Emitter API-s are also supported for: - [Python](../../metadata-ingestion/as-a-library.md) - - From 10ec82cf1d2aab85644a1057ebf1e575f4fd00d1 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 11 Jun 2024 10:51:00 -0700 Subject: [PATCH 2/2] fix links; pr feedback --- .gitignore | 2 +- docs-website/filterTagIndexes.json | 4 ++-- docs/how/updating-datahub.md | 2 +- metadata-ingestion/setup.py | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index fdb9ed91da4328..43c627f9ed244f 100644 --- a/.gitignore +++ b/.gitignore @@ -128,4 +128,4 @@ test-models/bin/ datahub-executor/ datahub-integrations-service/ -metadata-ingestion-modules/ +metadata-ingestion-modules/acryl-cloud diff --git a/docs-website/filterTagIndexes.json b/docs-website/filterTagIndexes.json index 0c1f541cf53d34..8caff3497a2002 100644 --- a/docs-website/filterTagIndexes.json +++ b/docs-website/filterTagIndexes.json @@ -67,7 +67,7 @@ } }, { - "Path": "docs/generated/ingestion/sources/csv", + "Path": "docs/generated/ingestion/sources/csv-enricher", "imgPath": "img/datahub-logo-color-mark.svg", "Title": "CSV", "Description": "An ingestion source for enriching metadata provided in CSV format provided by DataHub", @@ -177,7 +177,7 @@ } }, { - "Path": "docs/generated/ingestion/sources/file", + "Path": "docs/generated/ingestion/sources/metadata-file", "imgPath": "img/datahub-logo-color-mark.svg", "Title": "File", "Description": "An ingestion source for single files provided by DataHub", diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index c29c20e7d48a3c..bd559d1a24d273 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -484,7 +484,7 @@ Helm with `--atomic`: In general, it is recommended to not use the `--atomic` se ### Breaking Changes -- The `should_overwrite` flag in `csv-enricher` has been replaced with `write_semantics` to match the format used for other sources. See the [documentation](https://datahubproject.io/docs/generated/ingestion/sources/csv/) for more details +- The `should_overwrite` flag in `csv-enricher` has been replaced with `write_semantics` to match the format used for other sources. See the [documentation](https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher/) for more details - Closing an authorization hole in creating tags adding a Platform Privilege called `Create Tags` for creating tags. This is assigned to `datahub` root user, along with default All Users policy. Notice: You may need to add this privilege (or `Manage Tags`) to existing users that need the ability to create tags on the platform. - #5329 Below profiling config parameters are now supported in `BigQuery`: diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 07a60c913b0efd..ade1e1a6ee5ba4 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -259,6 +259,7 @@ delta_lake = { *s3_base, + # Version 0.18.0 broken on ARM Macs: https://github.com/delta-io/delta-rs/issues/2577 "deltalake>=0.6.3, != 0.6.4, < 0.18.0; platform_system == 'Darwin' and platform_machine == 'arm64'", "deltalake>=0.6.3, != 0.6.4; platform_system != 'Darwin' or platform_machine != 'arm64'", }
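To make the renamed "Metadata File" source and sink pages concrete, here is a minimal recipe sketch tying the two together. This is an illustration under stated assumptions: the patch series only renames documentation pages, so the plugin type identifiers are assumed to remain `file`, and both paths below are made-up placeholders. The source's `path` and `file_extension` options and the sink's `filename` option are the ones shown in the docs touched by this patch.

```yaml
# Hypothetical end-to-end recipe: read a previously generated metadata file and write another one.
# Assumes the `file` source/sink type identifiers are unchanged by this docs-only rename.
source:
  type: file
  config:
    path: ./exported_metadata.json  # placeholder; a folder works too, filtered by file_extension (default .json)

sink:
  type: file
  config:
    filename: ./rewritten_metadata.json  # placeholder output path
```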