From 69231d2130c15055bc45301a252bd44fe9f1e603 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 26 Oct 2023 16:43:47 -0700
Subject: [PATCH 1/4] feat(sqlparser): performance improvements

In particular, this will improve performance for queries with a large
number of output columns. Simpler queries will also see modest speedups.

In my tests, a simple `SELECT` with ~6000 columns saw a ~15-20x speedup.

Should have no behavioral impact.
---
 metadata-ingestion/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 7f7826abe20952..2c0d6a432b5cd0 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -108,7 +108,7 @@
 sqlglot_lib = {
     # Using an Acryl fork of sqlglot.
     # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
-    "acryl-sqlglot==18.5.2.dev45",
+    "acryl-sqlglot==18.17.1.dev12",
 }
 
 sql_common = (

From 1471410fad2e4cf549d6865818aab9411aa45336 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Fri, 27 Oct 2023 14:31:42 -0700
Subject: [PATCH 2/4] update sqlglot again + misc docker fixes

---
 metadata-ingestion/setup.py                        |  2 +-
 .../src/datahub/cli/docker_cli.py                  |  5 ++
 .../src/datahub/upgrade/upgrade.py                 | 12 ++---
 .../src/datahub/utilities/sqlglot_lineage.py       |  1 +
 .../test_teradata_strange_operators.json           | 46 +++++++++++++++++++
 .../unit/sql_parsing/test_sqlglot_lineage.py       | 14 ++++++
 6 files changed, 71 insertions(+), 9 deletions(-)
 create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index 2c0d6a432b5cd0..e8830224fe46eb 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -108,7 +108,7 @@
 sqlglot_lib = {
     # Using an Acryl fork of sqlglot.
     # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1
-    "acryl-sqlglot==18.17.1.dev12",
+    "acryl-sqlglot==18.17.1.dev16",
 }
 
 sql_common = (
diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py
index 4afccfe711e347..77e3285d359efc 100644
--- a/metadata-ingestion/src/datahub/cli/docker_cli.py
+++ b/metadata-ingestion/src/datahub/cli/docker_cli.py
@@ -5,6 +5,7 @@
 import os
 import pathlib
 import platform
+import signal
 import subprocess
 import sys
 import tempfile
@@ -770,6 +771,10 @@ def quickstart(  # noqa: C901
                 logger.debug("docker compose up still running, sending SIGKILL")
                 up_process.kill()
                 up_process.wait()
+            else:
+                # If the docker process got a keyboard interrupt, raise one here.
+                if up_process.returncode in {128 + signal.SIGINT, -signal.SIGINT}:
+                    raise KeyboardInterrupt
 
         # Check docker health every few seconds.
         status = check_docker_quickstart()
diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py
index 30f19b8b84f354..acc7954ad25a63 100644
--- a/metadata-ingestion/src/datahub/upgrade/upgrade.py
+++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py
@@ -1,6 +1,5 @@
 import asyncio
 import contextlib
-import functools
 import logging
 import sys
 from datetime import datetime, timedelta, timezone
@@ -374,17 +373,14 @@ def check_upgrade(func: Callable[..., T]) -> Callable[..., T]:
     @wraps(func)
     def async_wrapper(*args: Any, **kwargs: Any) -> Any:
         async def run_inner_func():
-            loop = asyncio.get_event_loop()
-            return await loop.run_in_executor(
-                None, functools.partial(func, *args, **kwargs)
-            )
+            return func(*args, **kwargs)
 
         async def run_func_check_upgrade():
             version_stats_future = asyncio.ensure_future(retrieve_version_stats())
-            the_one_future = asyncio.ensure_future(run_inner_func())
-            ret = await the_one_future
+            main_func_future = asyncio.ensure_future(run_inner_func())
+            ret = await main_func_future
 
-            # the one future has returned
+            # the main future has returned
             # we check the other futures quickly
             try:
                 version_stats = await asyncio.wait_for(version_stats_future, 0.5)
diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index 1d74b205698140..074a28e5dca6cd 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -93,6 +93,7 @@ def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType:
         sqlglot.exp.Update: QueryType.UPDATE,
         sqlglot.exp.Delete: QueryType.DELETE,
         sqlglot.exp.Merge: QueryType.MERGE,
+        sqlglot.exp.Subqueryable: QueryType.SELECT,  # unions, etc. are also selects
     }
 
     for cls, query_type in mapping.items():
diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json
new file mode 100644
index 00000000000000..4b21a2512ccd11
--- /dev/null
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json
@@ -0,0 +1,46 @@
+{
+  "query_type": "SELECT",
+  "in_tables": [
+    "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)",
+    "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)"
+  ],
+  "out_tables": [],
+  "column_lineage": [
+    {
+      "downstream": {
+        "table": null,
+        "column": "col1",
+        "column_type": null,
+        "native_column_type": null
+      },
+      "upstreams": [
+        {
+          "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)",
+          "column": "col1"
+        },
+        {
+          "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)",
+          "column": "col1"
+        }
+      ]
+    },
+    {
+      "downstream": {
+        "table": null,
+        "column": "col2",
+        "column_type": null,
+        "native_column_type": null
+      },
+      "upstreams": [
+        {
+          "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)",
+          "column": "col2"
+        },
+        {
+          "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)",
+          "column": "col2"
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
index dfc5b486abd35f..2e7d72512b1630 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
+++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py
@@ -675,6 +675,20 @@ def test_teradata_default_normalization():
     )
 
 
+def test_teradata_strange_operators():
+    assert_sql_result(
+        """
+select col1, col2 from dbc.table1
+where col1 eq 'value1'
+minus
+select col1, col2 from dbc.table2
+""",
+        dialect="teradata",
+        default_schema="dbc",
+        expected_file=RESOURCE_DIR / "test_teradata_strange_operators.json",
+    )
+
+
 def test_snowflake_update_hardcoded():
     assert_sql_result(
         """

From e60e31ebb34e39902fbe2150dc02440194346414 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 30 Oct 2023 14:21:01 -0700
Subject: [PATCH 3/4] fix issue with union

---
 .../tests/unit/sql_parsing/goldens/test_select_from_union.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json
index 902aa010c8afc4..5d1d421f49a2aa 100644
--- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json
+++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json
@@ -1,5 +1,5 @@
 {
-  "query_type": "UNKNOWN",
+  "query_type": "SELECT",
   "in_tables": [
     "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf10.orders,PROD)",
     "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf100.orders,PROD)"

From 87d0b06232843726973d744b40539ce0ec5d4151 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 30 Oct 2023 14:24:56 -0700
Subject: [PATCH 4/4] tweak

---
 metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
index 8b4de46435ff0b..6413275ac63a6f 100644
--- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py
@@ -821,10 +821,8 @@ def _extract_select_from_update(
     )
 
     # Update statements always implicitly have the updated table in context.
-    # TODO: Retain table name alias.
+    # TODO: Retain table name alias, if one was present.
    if select_statement.args.get("from"):
-        # select_statement = sqlglot.parse_one(select_statement.sql(dialect=dialect))
-
         select_statement = select_statement.join(
             statement.this, append=True, join_kind="cross"
         )
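
Note on the query-type change: the net effect of the sqlglot_lineage.py edit in patch 2
(and the golden-file update in patch 3) is that set operations such as UNION are now
classified as SELECT instead of UNKNOWN. The sketch below is not part of the patch; it
assumes sqlglot ~18.x, where exp.Union is a subclass of exp.Subqueryable (the class
hierarchy differs in later sqlglot releases), and it uses plain strings in place of
DataHub's QueryType enum. The table names are illustrative only.

    import sqlglot
    from sqlglot import exp

    # Trimmed-down version of the mapping in get_query_type_of_sql(), with
    # strings standing in for the QueryType enum values.
    mapping = {
        exp.Update: "UPDATE",
        exp.Delete: "DELETE",
        exp.Merge: "MERGE",
        exp.Subqueryable: "SELECT",  # unions, etc. are also selects
    }

    statement = sqlglot.parse_one(
        "SELECT id FROM tpch_sf10.orders UNION SELECT id FROM tpch_sf100.orders"
    )
    print(type(statement))  # <class 'sqlglot.expressions.Union'>

    for cls, query_type in mapping.items():
        if isinstance(statement, cls):
            # exp.Union subclasses exp.Subqueryable, so this prints "SELECT".
            # Without the new mapping entry, a UNION matches nothing and the
            # classifier falls back to UNKNOWN, as the old golden file showed.
            print(query_type)
            break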