Skip to content

Commit

Permalink
Generic dataset module and specific s3_datasets module - part 1 (Rename datasets as s3_datasets) (#1250)
Browse files Browse the repository at this point in the history

### Feature or Bugfix
- Refactoring

### Detail
- Rename `datasets` module to `s3_datasets` module

This PR is the first step to extract a generic datasets_base module that
implements the undifferentiated concepts of Dataset in data.all.
s3_datasets will use this base module to provide the specific
implementation for S3 datasets.

### Relates
- #1123 
- #955 

### Security
Please answer the questions below briefly where applicable, or write
`N/A`. The questions are based on the
[OWASP Top 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this
includes
fetching data from storage outside the application (e.g. a database, an
S3 bucket)?
  - Is the input sanitized?
  - What precautions are you taking before deserializing the data you
    consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires
authorization?
  - How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use standard, proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
dlpzx authored May 7, 2024
1 parent 74a303c commit 40defe8
Show file tree
Hide file tree
Showing 181 changed files with 364 additions and 353 deletions.
6 changes: 3 additions & 3 deletions backend/dataall/modules/dataset_sharing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ def is_supported(modes: Set[ImportMode]) -> bool:
@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.notifications import NotificationsModuleInterface
from dataall.modules.datasets import DatasetApiModuleInterface
from dataall.modules.s3_datasets import DatasetApiModuleInterface

return [DatasetApiModuleInterface, NotificationsModuleInterface]

def __init__(self):
from dataall.modules.dataset_sharing import api
from dataall.modules.dataset_sharing.services.managed_share_policy_service import SharePolicyService
from dataall.modules.datasets.services.dataset_service import DatasetService
from dataall.modules.s3_datasets.services.dataset_service import DatasetService
from dataall.modules.dataset_sharing.services.dataset_sharing_service import DatasetSharingService

EnvironmentResourceManager.register(ShareEnvironmentResource())
Expand All @@ -42,7 +42,7 @@ def is_supported(modes: List[ImportMode]):
@staticmethod
def depends_on() -> List[Type['ModuleInterface']]:
from dataall.modules.notifications import NotificationsModuleInterface
from dataall.modules.datasets import DatasetAsyncHandlersModuleInterface
from dataall.modules.s3_datasets import DatasetAsyncHandlersModuleInterface

return [DatasetAsyncHandlersModuleInterface, NotificationsModuleInterface]

Expand Down
6 changes: 3 additions & 3 deletions backend/dataall/modules/dataset_sharing/api/resolvers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from dataall.modules.dataset_sharing.services.share_object_service import ShareObjectService
from dataall.modules.dataset_sharing.services.dataset_sharing_service import DatasetSharingService
from dataall.modules.dataset_sharing.aws.glue_client import GlueClient
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -335,6 +335,6 @@ def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str,
return DatasetSharingService.list_shared_tables_by_env_dataset(datasetUri, envUri)


@is_feature_enabled('modules.datasets.features.aws_actions')
@is_feature_enabled('modules.s3_datasets.features.aws_actions')
def get_dataset_shared_assume_role_url(context: Context, source, datasetUri: str = None):
return DatasetSharingService.get_dataset_shared_assume_role_url(uri=datasetUri)
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
PrincipalType,
)
from dataall.modules.dataset_sharing.db.share_object_models import ShareObjectItem, ShareObject
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset, DatasetBucket
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset, DatasetBucket

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
ShareItemStatus,
PrincipalType,
)
from dataall.modules.datasets.db.dataset_models import DatasetLock
from dataall.modules.s3_datasets.db.dataset_models import DatasetLock

log = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from dataall.core.environment.db.environment_models import Environment
from dataall.modules.dataset_sharing.db.share_object_models import ShareObject
from dataall.modules.datasets.db.dataset_models import DatasetTable, DatasetStorageLocation, DatasetBucket
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, DatasetStorageLocation, DatasetBucket
from dataall.base.utils.alarm_service import AlarmService

log = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
)
from dataall.modules.dataset_sharing.services.share_permissions import SHARE_OBJECT_APPROVER
from dataall.modules.dataset_sharing.services.share_item_service import ShareItemService
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.services.dataset_permissions import (
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.services.dataset_permissions import (
MANAGE_DATASETS,
UPDATE_DATASET,
DELETE_DATASET,
Expand All @@ -21,9 +21,9 @@
CREDENTIALS_DATASET,
)

from dataall.modules.datasets.db.dataset_models import Dataset
from dataall.modules.datasets.services.datasets_enums import DatasetRole
from dataall.modules.datasets.services.dataset_service import DatasetServiceInterface
from dataall.modules.s3_datasets.db.dataset_models import Dataset
from dataall.modules.s3_datasets.services.datasets_enums import DatasetRole
from dataall.modules.s3_datasets.services.dataset_service import DatasetServiceInterface


import logging
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
LIST_ENVIRONMENT_SHARED_WITH_OBJECTS,
APPROVE_SHARE_OBJECT,
)
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.db.dataset_models import Dataset
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import Dataset

log = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
ShareItemActions,
ShareItemHealthStatus,
)
from dataall.modules.datasets.db.dataset_models import DatasetTable, Dataset
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, Dataset
from dataall.modules.dataset_sharing.services.dataset_sharing_alarm_service import DatasetSharingAlarmService
from dataall.modules.dataset_sharing.db.share_object_models import ShareObjectItem, ShareObject
from dataall.modules.dataset_sharing.services.share_managers.share_manager_utils import ShareErrorFormatter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
EMPTY_STATEMENT_SID,
)
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import PrincipalType
from dataall.modules.datasets.db.dataset_models import DatasetStorageLocation, Dataset
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, Dataset

logger = logging.getLogger(__name__)
ACCESS_POINT_CREATION_TIME = 30
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
EMPTY_STATEMENT_SID,
)
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import PrincipalType
from dataall.modules.datasets.db.dataset_models import Dataset, DatasetBucket
from dataall.modules.s3_datasets.db.dataset_models import Dataset, DatasetBucket
from dataall.modules.dataset_sharing.db.share_object_repositories import ShareObjectRepository

logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from dataall.core.tasks.db.task_models import Task
from dataall.core.tasks.service_handlers import Worker
from dataall.modules.dataset_sharing.db.share_object_models import ShareObject
from dataall.modules.datasets.db.dataset_models import Dataset
from dataall.modules.s3_datasets.db.dataset_models import Dataset
from dataall.base.context import get_context
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import ShareObjectStatus
from dataall.modules.notifications.db.notification_repositories import NotificationRepository
Expand Down Expand Up @@ -139,7 +139,9 @@ def _create_notification_task(self, subject, msg):
- dataset.stewards
- share.owner (person that opened the request) OR share.groupUri (if group_notifications=true)
"""
share_notification_config = config.get_property('modules.datasets.features.share_notifications', default=None)
share_notification_config = config.get_property(
'modules.s3_datasets.features.share_notifications', default=None
)
if share_notification_config:
for share_notification_config_type in share_notification_config.keys():
n_config = share_notification_config[share_notification_config_type]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@
GET_SHARE_OBJECT,
)
from dataall.modules.dataset_sharing.aws.glue_client import GlueClient
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.db.dataset_models import DatasetTable, Dataset, DatasetStorageLocation
from dataall.modules.datasets.services.dataset_permissions import DATASET_TABLE_READ, DATASET_FOLDER_READ
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, Dataset, DatasetStorageLocation
from dataall.modules.s3_datasets.services.dataset_permissions import DATASET_TABLE_READ, DATASET_FOLDER_READ
from dataall.base.aws.iam import IAM

import logging
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from dataall.modules.dataset_sharing.services.share_managers import LFShareManager
from dataall.modules.dataset_sharing.aws.ram_client import RamClient
from dataall.modules.dataset_sharing.services.share_object_service import ShareObjectService
from dataall.modules.datasets.db.dataset_models import DatasetTable, Dataset
from dataall.modules.s3_datasets.db.dataset_models import DatasetTable, Dataset
from dataall.modules.dataset_sharing.db.share_object_models import ShareObject
from dataall.modules.dataset_sharing.db.share_object_repositories import ShareObjectRepository, ShareItemSM
from dataall.modules.dataset_sharing.services.share_managers.share_manager_utils import ShareErrorFormatter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dataall.modules.dataset_sharing.services.share_exceptions import PrincipalRoleNotFound
from dataall.modules.dataset_sharing.services.share_managers import S3AccessPointShareManager
from dataall.modules.dataset_sharing.services.share_object_service import ShareObjectService
from dataall.modules.datasets.db.dataset_models import DatasetStorageLocation, Dataset
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, Dataset
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import (
ShareItemHealthStatus,
ShareItemStatus,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dataall.modules.dataset_sharing.services.share_exceptions import PrincipalRoleNotFound
from dataall.modules.dataset_sharing.services.share_managers import S3BucketShareManager
from dataall.modules.dataset_sharing.services.share_object_service import ShareObjectService
from dataall.modules.datasets.db.dataset_models import Dataset, DatasetBucket
from dataall.modules.s3_datasets.db.dataset_models import Dataset, DatasetBucket
from dataall.modules.dataset_sharing.services.dataset_sharing_enums import (
ShareItemHealthStatus,
ShareItemStatus,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
from dataall.modules.dataset_sharing.db.share_object_models import ShareObjectItem
from dataall.modules.dataset_sharing.db.share_object_repositories import ShareObjectRepository
from dataall.modules.dataset_sharing.services.share_notification_service import ShareNotificationService
from dataall.modules.datasets.aws.sns_dataset_client import SnsDatasetClient
from dataall.modules.datasets.db.dataset_location_repositories import DatasetLocationRepository
from dataall.modules.datasets.db.dataset_table_repositories import DatasetTableRepository
from dataall.modules.s3_datasets.aws.sns_dataset_client import SnsDatasetClient
from dataall.modules.s3_datasets.db.dataset_location_repositories import DatasetLocationRepository
from dataall.modules.s3_datasets.db.dataset_table_repositories import DatasetTableRepository
from dataall.modules.dataset_sharing.tasks.subscriptions import poll_queues
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset

root = logging.getLogger()
root.setLevel(logging.INFO)
Expand Down
3 changes: 0 additions & 3 deletions backend/dataall/modules/datasets/api/dataset/__init__.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ def __init__(self):
from dataall.modules.feed.api.registry import FeedRegistry, FeedDefinition
from dataall.modules.catalog.indexers.registry import GlossaryRegistry, GlossaryDefinition
from dataall.core.environment.services.environment_resource_manager import EnvironmentResourceManager
from dataall.modules.datasets.indexers.dataset_indexer import DatasetIndexer
from dataall.modules.datasets.indexers.location_indexer import DatasetLocationIndexer
from dataall.modules.datasets.indexers.table_indexer import DatasetTableIndexer
from dataall.modules.s3_datasets.indexers.dataset_indexer import DatasetIndexer
from dataall.modules.s3_datasets.indexers.location_indexer import DatasetLocationIndexer
from dataall.modules.s3_datasets.indexers.table_indexer import DatasetTableIndexer

import dataall.modules.datasets.api
from dataall.modules.datasets.services.dataset_permissions import GET_DATASET, UPDATE_DATASET
from dataall.modules.datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset
import dataall.modules.s3_datasets.api
from dataall.modules.s3_datasets.services.dataset_permissions import GET_DATASET, UPDATE_DATASET
from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset

FeedRegistry.register(FeedDefinition('DatasetStorageLocation', DatasetStorageLocation))
FeedRegistry.register(FeedDefinition('DatasetTable', DatasetTable))
Expand Down Expand Up @@ -86,11 +86,11 @@ def is_supported(modes: Set[ImportMode]):
return ImportMode.HANDLERS in modes

def __init__(self):
import dataall.modules.datasets.handlers
import dataall.modules.datasets.db.dataset_models
import dataall.modules.datasets.db.dataset_repositories
import dataall.modules.datasets.services.dataset_permissions
import dataall.modules.datasets.services.datasets_enums
import dataall.modules.s3_datasets.handlers
import dataall.modules.s3_datasets.db.dataset_models
import dataall.modules.s3_datasets.db.dataset_repositories
import dataall.modules.s3_datasets.services.dataset_permissions
import dataall.modules.s3_datasets.services.datasets_enums

log.info('Dataset handlers have been imported')

Expand All @@ -103,10 +103,10 @@ def is_supported(modes: Set[ImportMode]):
return ImportMode.CDK in modes

def __init__(self):
import dataall.modules.datasets.cdk
import dataall.modules.s3_datasets.cdk
from dataall.core.environment.cdk.environment_stack import EnvironmentSetup
from dataall.modules.datasets.cdk.dataset_glue_profiler_extension import DatasetGlueProfilerExtension
from dataall.modules.datasets.cdk.dataset_custom_resources_extension import DatasetCustomResourcesExtension
from dataall.modules.s3_datasets.cdk.dataset_glue_profiler_extension import DatasetGlueProfilerExtension
from dataall.modules.s3_datasets.cdk.dataset_custom_resources_extension import DatasetCustomResourcesExtension

EnvironmentSetup.register(DatasetGlueProfilerExtension)
EnvironmentSetup.register(DatasetCustomResourcesExtension)
Expand All @@ -120,7 +120,7 @@ def is_supported(modes: Set[ImportMode]) -> bool:
return ImportMode.STACK_UPDATER_TASK in modes

def __init__(self):
from dataall.modules.datasets.tasks.dataset_stack_finder import DatasetStackFinder
from dataall.modules.s3_datasets.tasks.dataset_stack_finder import DatasetStackFinder

DatasetStackFinder()
log.info('Dataset stack updater task has been loaded')
Expand All @@ -138,7 +138,7 @@ def depends_on() -> List[Type['ModuleInterface']]:
return [CatalogIndexerModuleInterface]

def __init__(self):
from dataall.modules.datasets.indexers.dataset_catalog_indexer import DatasetCatalogIndexer
from dataall.modules.s3_datasets.indexers.dataset_catalog_indexer import DatasetCatalogIndexer

DatasetCatalogIndexer()
log.info('Dataset catalog indexer task has been loaded')
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""The GraphQL schema of datasets and related functionality"""

from dataall.modules.datasets.api import table_column, profiling, storage_location, table, dataset
from dataall.modules.s3_datasets.api import table_column, profiling, storage_location, table, dataset

__all__ = ['table_column', 'profiling', 'storage_location', 'table', 'dataset']
3 changes: 3 additions & 0 deletions backend/dataall/modules/s3_datasets/api/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from dataall.modules.s3_datasets.api.dataset import input_types, mutations, queries, resolvers, types

__all__ = ['resolvers', 'types', 'input_types', 'queries', 'mutations']
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from dataall.base.api import gql
from dataall.base.api.constants import SortDirection
from dataall.modules.datasets.services.datasets_enums import DatasetSortField
from dataall.modules.s3_datasets.services.datasets_enums import DatasetSortField


NewDatasetInput = gql.InputType(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from dataall.base.api import gql
from dataall.modules.datasets.api.dataset.input_types import (
from dataall.modules.s3_datasets.api.dataset.input_types import (
ModifyDatasetInput,
NewDatasetInput,
ImportDatasetInput,
)
from dataall.modules.datasets.api.dataset.resolvers import (
from dataall.modules.s3_datasets.api.dataset.resolvers import (
create_dataset,
update_dataset,
generate_dataset_access_token,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from dataall.base.api import gql
from dataall.modules.datasets.api.dataset.input_types import DatasetFilter
from dataall.modules.datasets.api.dataset.resolvers import (
from dataall.modules.s3_datasets.api.dataset.input_types import DatasetFilter
from dataall.modules.s3_datasets.api.dataset.resolvers import (
get_dataset,
list_all_user_datasets,
list_owned_datasets,
Expand All @@ -9,7 +9,7 @@
list_datasets_owned_by_env_group,
list_datasets_created_in_environment,
)
from dataall.modules.datasets.api.dataset.types import DatasetSearchResult
from dataall.modules.s3_datasets.api.dataset.types import DatasetSearchResult

getDataset = gql.QueryField(
name='getDataset',
Expand Down
Loading

0 comments on commit 40defe8

Please sign in to comment.