feat(ingest): s3 - add support for multiple pathspecs in one recipe (#…
treff7es authored May 5, 2022
1 parent 5b1fb4b commit 56ee4d9
Showing 40 changed files with 5,477 additions and 4,373 deletions.
3 changes: 1 addition & 2 deletions metadata-ingestion/archived/source_docs/data_lake.md
@@ -4,8 +4,7 @@ For context on getting started with ingestion, check out our [metadata ingestion

:::caution

This source is in **Beta** and under active development. Not yet considered ready for production.

This source is going to be deprecated. Please use [S3 Data Lake](./s3_data_lake.md) instead.
:::

## Setup
4 changes: 2 additions & 2 deletions metadata-ingestion/archived/source_docs/s3.md
@@ -1,8 +1,8 @@
# S3
# S3 Crawling with Glue

:::note

Our [data-lake connector](./data_lake.md) allows you to ingest S3 files directly with the option to compute profiles as well. The following guide describes how to ingest S3 datasets through cataloging them in AWS Glue.
Our [S3 data lake](./s3_data_lake.md) source allows you to ingest S3 files directly with the option to compute profiles as well. The following guide describes how to ingest S3 datasets through cataloging them in AWS Glue.

:::

33 changes: 28 additions & 5 deletions metadata-ingestion/archived/source_docs/s3_data_lake.md
@@ -74,11 +74,13 @@ sink:
Note that a `.` is used to denote nested fields in the YAML recipe.

| Field | Required | Default | Description |
| ---------------------------------------------------- | ------------------------ | ----------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path_spec.include` || | Path to table (s3 or local file system). Name variable {table} is used to mark the folder with the dataset. In the absence of {table}, a file-level dataset will be created. Check the examples below for more details. |
| `path_spec.exclude` | | | List of paths in glob pattern which will be excluded while scanning for datasets. |
| `path_spec.table_name` | | {table} | Display name of the dataset. Combination of named variables from the include path and strings. |
| `path_spec.file_types` | | ["csv", "tsv", "json", "parquet", "avro"] | Only files with the extensions specified here (a subset of the default value) will be scanned to create datasets. Other files will be omitted. |
|------------------------------------------------------|--------------------------|-------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `path_specs` || | List of PathSpec; see below. |
| `path_specs[].include` || | Path to table (s3 or local file system). Name variable {table} is used to mark the folder with the dataset. In the absence of {table}, a file-level dataset will be created. Check the examples below for more details. |
| `path_specs[].exclude` | | | List of paths in glob pattern which will be excluded while scanning for datasets. |
| `path_specs[].table_name` | | {table} | Display name of the dataset. Combination of named variables from the include path and strings. |
| `path_specs[].file_types` | | ["csv", "tsv", "json", "parquet", "avro"] | Only files with the extensions specified here (a subset of the default value) will be scanned to create datasets. Other files will be omitted. |
| `path_specs[].default_extension` | | | For files without an extension, the specified file type is assumed. If not set, files without extensions will be skipped. |
| `env` | | `PROD` | Environment to use in namespace when constructing URNs. |
| `platform` | | Autodetected | Platform to use in namespace when constructing URNs. If left blank, local paths will correspond to `file` and S3 paths will correspond to `s3`. |
| `platform_instance` | | | Platform instance for datasets and containers |
@@ -109,6 +111,27 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| `profiling.include_field_histogram` | | `False` | Whether to profile for the histogram for numeric fields. |
| `profiling.include_field_sample_values` | | `True` | Whether to profile for the sample values for all columns. |
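
As a rough sketch of how several of these options combine in one recipe (the bucket, folder names, and values below are made up for illustration), a single `path_specs` entry with excludes, a display name, file types, and profiling enabled might look like:

```yaml
source:
  type: s3
  config:
    path_specs:
      -
        include: "s3://example-bucket/data/{table}/{partition_key[0]}={partition[0]}/*.parquet"
        exclude:
          - "**/tests/**"               # glob patterns skipped during scanning
        table_name: "{table}"           # display name built from named variables in the include path
        file_types:
          - "parquet"                   # only scan parquet files; others are omitted
        default_extension: "parquet"    # treat extension-less files as parquet instead of skipping them
    profiling:
      enabled: true
      include_field_histogram: false    # histograms for numeric fields off (the default)
      include_field_sample_values: true # keep per-column sample values (the default)
```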

## Sample recipe

```yaml
source:
  type: s3
  config:
    env: "PROD"
    path_specs:
      -
        include: "s3://mybucket/folder1/{table}/{partition_key[0]}/{partition_key[1]}/{partition_key[2]}/*.csv"
      -
        include: "s3://mybucket/folder2/{table}/{partition_key[0]}/{partition_key[1]}/{partition_key[2]}/*.parquet"
    aws_config:
      aws_region: "us-east-1"

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```

## Valid path_spec.include
```python
8 changes: 4 additions & 4 deletions metadata-ingestion/docs/sources/s3/s3.md
@@ -1,20 +1,20 @@
## Valid path_spec.include
## Valid path_specs.include

```python
s3://my-bucket/foo/tests/bar.avro # single file table
s3://my-bucket/foo/tests/*.* # multiple file-level tables
s3://my-bucket/foo/tests/{table}/*.avro # table without partition
s3://my-bucket/foo/tests/{table}/*/*.avro # table where partitions are not specified
s3://my-bucket/foo/tests/{table}/*.* # table where neither partitions nor data type are specified
s3://my-bucket/{dept}/tests/{table}/*.avro # specifying key wards to be used in display name
s3://my-bucket/{dept}/tests/{table}/*.avro # specifying keywords to be used in display name
s3://my-bucket/{dept}/tests/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.avro # specify partition key and value format
s3://my-bucket/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.avro # specify partition value only format
s3://my-bucket/{dept}/tests/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # for all extensions
s3://my-bucket/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 2 levels down in bucket
s3://my-bucket/*/*/{table}/{partition[0]}/{partition[1]}/{partition[2]}/*.* # table is present at 3 levels down in bucket
```
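
To make the partition key/value form above more concrete, here is a small sketch of how such a pattern would sit in a recipe (the bucket name and object key are hypothetical):

```yaml
source:
  type: s3
  config:
    path_specs:
      -
        # {table} names the dataset; {partition_key[i]}={partition[i]} captures partition key/value pairs
        include: "s3://my-bucket/{dept}/tests/{table}/{partition_key[0]}={partition[0]}/{partition_key[1]}={partition[1]}/*.avro"
```

Under this spec, an object such as `s3://my-bucket/sales/tests/orders/year=2021/month=03/part-0001.avro` (a made-up key) should be picked up as a table named `orders` with partitions `year=2021` and `month=03`.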

## Valid path_spec.exclude
## Valid path_specs.exclude
- **/tests/**
- s3://my-bucket/hr/**
- **/tests/*.csv
@@ -40,7 +40,7 @@ If you would like to write a more complicated function for resolving file names,

:::caution

Specify as long a fixed prefix (without /*/) as possible in `path_spec.include`. This will reduce the scanning time and cost, especially on AWS S3.
Specify as long a fixed prefix (without /*/) as possible in `path_specs.include`. This will reduce the scanning time and cost, especially on AWS S3.

:::
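
As an illustration of this caution (bucket and folder names are invented), prefer includes whose fixed prefix reaches as deep as possible before the first wildcard:

```yaml
path_specs:
  -
    # Cheaper: long fixed prefix, so only this subtree of the bucket is listed
    include: "s3://my-bucket/warehouse/prod/events/{table}/{partition[0]}/*.parquet"
  -
    # Costlier: the leading /*/ forces scanning every top-level folder in the bucket
    include: "s3://my-bucket/*/{table}/{partition[0]}/*.parquet"
```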

9 changes: 5 additions & 4 deletions metadata-ingestion/docs/sources/s3/s3_recipe.yml
@@ -1,8 +1,10 @@
source:
  type: s3
  config:
    path_spec:
      include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*"
    path_specs:
      -
        include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*"

    aws_config:
      aws_access_key_id: *****
      aws_secret_access_key: *****
@@ -11,5 +13,4 @@ source:
    profiling:
      enabled: false

sink:
  # sink configs
  # sink configs
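
The sink in this recipe is left as a placeholder; as a point of reference, a minimal sink along the lines of the sample recipe earlier in this commit would be:

```yaml
sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"   # DataHub REST endpoint, as in the sample recipe above
```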