GoogleCloudPlatform · adlersantos · Sep 9, 2022 · Aug 17, 2022 · Aug 17, 2022 · Aug 18, 2022
diff --git a/datasets/cloud_datasets/infra/cloud_datasets_dataset.tf b/datasets/cloud_datasets/infra/cloud_datasets_dataset.tf
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+# BigQuery dataset for the Cloud Datasets program's metadata (not a public dataset).
+resource "google_bigquery_dataset" "_cloud_datasets" {
+  project     = var.project_id
+  dataset_id  = "_cloud_datasets"
+  description = "A dataset dedicated to Google Cloud Datasets Program and its metadata (not a public dataset)"
+}
+# Exports the dataset_id of the _cloud_datasets dataset declared in this file.
+output "bigquery_dataset-_cloud_datasets-dataset_id" {
+  value = google_bigquery_dataset._cloud_datasets.dataset_id
+}
diff --git a/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf b/datasets/cloud_datasets/infra/pdp_extract_tabular_metadata_pipeline.tf
@@ -0,0 +1,232 @@
+/**
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+# Dataset-level metadata table; each column is documented in the schema below.
+# dataset_id references the dataset resource, which gives Terraform the
+# creation order implicitly, so no explicit depends_on block is needed.
+resource "google_bigquery_table" "_cloud_datasets_tabular_datasets" {
+  project     = var.project_id
+  dataset_id  = google_bigquery_dataset._cloud_datasets.dataset_id
+  table_id    = "tabular_datasets"
+  description = "This table contains all the metadata for all the tabular datasets in the Cloud Datasets program"
+  schema      = <<EOF
+    [
+  {
+      "name": "extracted_at",
+      "description": "The date and time when this row was extracted from BigQuery",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "created_at",
+      "description": "The date and time when the dataset was created",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "modified_at",
+      "description": "The date and time when the dataset was last modified",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "project_id",
+      "description": "The GCP project where the public dataset is stored",
+      "type": "STRING"
+  },
+  {
+      "name": "dataset_id",
+      "description": "The BigQuery dataset ID",
+      "type": "STRING"
+  },
+  {
+      "name": "description",
+      "description": "The dataset description",
+      "type": "STRING"
+  },
+  {
+      "name": "num_tables",
+      "description": "Number of tables contained in this dataset",
+      "type": "INTEGER"
+  }
+]
+    EOF
+}
+# Exports the table_id attribute of the tabular_datasets table resource.
+output "bigquery_table-_cloud_datasets_tabular_datasets-table_id" {
+  value = google_bigquery_table._cloud_datasets_tabular_datasets.table_id
+}
+# Exports the provider-assigned id attribute of the tabular_datasets table resource.
+output "bigquery_table-_cloud_datasets_tabular_datasets-id" {
+  value = google_bigquery_table._cloud_datasets_tabular_datasets.id
+}
+
+# Table-level metadata table; each column is documented in the schema below.
+# dataset_id references the dataset resource, which gives Terraform the
+# creation order implicitly, so no explicit depends_on block is needed.
+resource "google_bigquery_table" "_cloud_datasets_tables" {
+  project     = var.project_id
+  dataset_id  = google_bigquery_dataset._cloud_datasets.dataset_id
+  table_id    = "tables"
+  description = "This table contains all the metadata for all the tables in the Cloud Datasets program"
+  schema      = <<EOF
+    [
+  {
+      "name": "extracted_at",
+      "description": "The date and time when this row was extracted from BigQuery",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "created_at",
+      "description": "The date and time when the table was created",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "modified_at",
+      "description": "The date and time when the table was last modified",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "project_id",
+      "description": "The GCP project where the public dataset is stored",
+      "type": "STRING"
+  },
+  {
+      "name": "dataset_id",
+      "description": "The BigQuery dataset ID",
+      "type": "STRING"
+  },
+  {
+      "name": "table_id",
+      "description": "The BigQuery table ID",
+      "type": "STRING"
+  },
+  {
+      "name": "description",
+      "description": "The table description",
+      "type": "STRING"
+  },
+  {
+      "name": "type",
+      "description": "The type of the table",
+      "type": "STRING"
+  },
+  {
+      "name": "num_bytes",
+      "description": "The number of bytes the table allocated on disk",
+      "type": "INTEGER"
+  },
+  {
+      "name": "num_rows",
+      "description": "The number of rows in the table",
+      "type": "INTEGER"
+  },
+  {
+      "name": "num_columns",
+      "description": "The number of columns in the table",
+      "type": "INTEGER"
+  },
+  {
+      "name": "described_columns",
+      "description": "The number of columns in the table with a description",
+      "type": "INTEGER"
+  }
+]
+    EOF
+}
+# Exports the table_id attribute of the tables table resource.
+output "bigquery_table-_cloud_datasets_tables-table_id" {
+  value = google_bigquery_table._cloud_datasets_tables.table_id
+}
+# Exports the provider-assigned id attribute of the tables table resource.
+output "bigquery_table-_cloud_datasets_tables-id" {
+  value = google_bigquery_table._cloud_datasets_tables.id
+}
+
+# Field-level metadata table; each column is documented in the schema below.
+# dataset_id references the dataset resource, which gives Terraform the
+# creation order implicitly, so no explicit depends_on block is needed.
+resource "google_bigquery_table" "_cloud_datasets_tables_fields" {
+  project     = var.project_id
+  dataset_id  = google_bigquery_dataset._cloud_datasets.dataset_id
+  table_id    = "tables_fields"
+  description = "This table contains all the metadata for all the fields in all the tables in the Cloud Datasets program"
+  schema      = <<EOF
+    [
+  {
+      "name": "extracted_at",
+      "description": "The date and time when this row was extracted from BigQuery",
+      "type": "TIMESTAMP"
+  },
+  {
+      "name": "project_id",
+      "description": "The GCP project where the public dataset is stored",
+      "type": "STRING"
+  },
+  {
+      "name": "dataset_id",
+      "description": "The BigQuery dataset ID",
+      "type": "STRING"
+  },
+  {
+      "name": "table_id",
+      "description": "The BigQuery table ID",
+      "type": "STRING"
+  },
+  {
+      "name": "name",
+      "description": "The name of the field",
+      "type": "STRING"
+  },
+  {
+      "name": "description",
+      "description": "The description for the field",
+      "type": "STRING"
+  },
+  {
+      "name": "field_type",
+      "description": "The type of the field",
+      "type": "STRING"
+  },
+  {
+      "name": "mode",
+      "description": "The mode of the field",
+      "type": "STRING"
+  },
+  {
+      "name": "precision",
+      "description": "Precision for the NUMERIC field",
+      "type": "INTEGER"
+  },
+  {
+      "name": "scale",
+      "description": "Scale for the NUMERIC field",
+      "type": "INTEGER"
+  },
+  {
+      "name": "max_length",
+      "description": "Maximum length for the STRING or BYTES field",
+      "type": "INTEGER"
+  }
+]
+    EOF
+}
+# Exports the table_id attribute of the tables_fields table resource.
+output "bigquery_table-_cloud_datasets_tables_fields-table_id" {
+  value = google_bigquery_table._cloud_datasets_tables_fields.table_id
+}
+# Exports the provider-assigned id attribute of the tables_fields table resource.
+output "bigquery_table-_cloud_datasets_tables_fields-id" {
+  value = google_bigquery_table._cloud_datasets_tables_fields.id
+}
diff --git a/datasets/cloud_datasets/infra/provider.tf b/datasets/cloud_datasets/infra/provider.tf
@@ -0,0 +1,27 @@
+/**
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+# Google provider configuration; region and project come from input variables.
+provider "google" {
+  region  = var.region
+  project = var.project_id
+}
+# OpenID userinfo lookup; its email attribute is exported by the output below.
+data "google_client_openid_userinfo" "me" {}
+# Exports the email returned by the google_client_openid_userinfo data source.
+output "impersonating-account" {
+  value = data.google_client_openid_userinfo.me.email
+}
diff --git a/datasets/cloud_datasets/infra/variables.tf b/datasets/cloud_datasets/infra/variables.tf
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2022 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+# Module inputs. None declare a type; only iam_policies has a default.
+variable "project_id" {}         # read by the provider block and the BigQuery resources
+variable "bucket_name_prefix" {} # NOTE(review): no reference visible in this change
+variable "impersonating_acct" {} # NOTE(review): no reference visible in this change
+variable "region" {}             # read by the provider block
+variable "env" {}                # NOTE(review): no reference visible in this change
+variable "iam_policies" {
+  default = {} # optional input; empty object when the caller omits it
+}
+
diff --git a/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/Dockerfile b/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/Dockerfile
@@ -0,0 +1,38 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Base image for this build (disabled alternative base kept for reference:
+# gcr.io/google.com/cloudsdktool/cloud-sdk:slim)
+FROM python:3.8
+
+# Any non-empty value turns off Python output buffering so logs reach Cloud logs
+ENV PYTHONUNBUFFERED=True
+
+# Copy the requirements file into the image (current directory of the base image)
+COPY requirements.txt ./
+
+# Install the packages specified in the requirements file; skip the pip cache
+RUN python3 -m pip install --no-cache-dir -r requirements.txt
+
+# WORKDIR sets the working directory for the RUN, CMD, ENTRYPOINT, COPY and ADD
+# instructions that follow it, and creates the directory if it does not exist.
+# requirements.txt was copied before this point, so it is not under /custom;
+# only the script copied below lives there.
+WORKDIR /custom
+
+# Copy the data processing script into the image as /custom/script.py
+COPY ./script.py .
+
+# Run the data processing script when the container starts
+CMD ["python3", "script.py"]
diff --git a/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/requirements.txt b/datasets/cloud_datasets/pipelines/_images/pdp_extract_tabular_metadata/requirements.txt
@@ -0,0 +1,3 @@
+pandas
+google-cloud-storage
+google-cloud-bigquery