tobymao · tobymao · Sep 17, 2023 · Sep 16, 2023 · Sep 16, 2023
diff --git a/sqlglot/dialects/bigquery.py b/sqlglot/dialects/bigquery.py
@@ -462,6 +462,7 @@ class Generator(generator.Generator):
                     _unqualify_unnest,
                     transforms.eliminate_distinct_on,
                     _alias_ordered_group,
+                    transforms.eliminate_semi_and_anti_joins,
                 ]
             ),
             exp.SHA2: lambda self, e: self.func(

diff --git a/sqlglot/dialects/clickhouse.py b/sqlglot/dialects/clickhouse.py
@@ -118,7 +118,7 @@ class Parser(parser.Parser):
             TokenType.ARRAY,
         }
 
-        TABLE_ALIAS_TOKENS = {*parser.Parser.TABLE_ALIAS_TOKENS} - {
+        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
             TokenType.ANY,
             TokenType.SEMI,
             TokenType.ANTI,

diff --git a/sqlglot/dialects/drill.py b/sqlglot/dialects/drill.py
@@ -135,7 +135,9 @@ class Generator(generator.Generator):
             exp.StrPosition: str_position_sql,
             exp.StrToDate: _str_to_date,
             exp.Pow: rename_func("POW"),
-            exp.Select: transforms.preprocess([transforms.eliminate_distinct_on]),
+            exp.Select: transforms.preprocess(
+                [transforms.eliminate_distinct_on, transforms.eliminate_semi_and_anti_joins]
+            ),
             exp.StrToTime: lambda self, e: f"TO_TIMESTAMP({self.sql(e, 'this')}, {self.format_time(e)})",
             exp.TimeStrToDate: lambda self, e: f"CAST({self.sql(e, 'this')} AS DATE)",
             exp.TimeStrToTime: timestrtotime_sql,

diff --git a/sqlglot/dialects/duckdb.py b/sqlglot/dialects/duckdb.py
@@ -191,6 +191,11 @@ class Parser(parser.Parser):
             ),
         }
 
+        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
+            TokenType.SEMI,
+            TokenType.ANTI,
+        }
+
         def _parse_types(
             self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True
         ) -> t.Optional[exp.Expression]:

diff --git a/sqlglot/dialects/mysql.py b/sqlglot/dialects/mysql.py
@@ -560,7 +560,9 @@ class Generator(generator.Generator):
             exp.NullSafeEQ: lambda self, e: self.binary(e, "<=>"),
             exp.NullSafeNEQ: lambda self, e: self.not_sql(self.binary(e, "<=>")),
             exp.Pivot: no_pivot_sql,
-            exp.Select: transforms.preprocess([transforms.eliminate_distinct_on]),
+            exp.Select: transforms.preprocess(
+                [transforms.eliminate_distinct_on, transforms.eliminate_semi_and_anti_joins]
+            ),
             exp.StrPosition: strposition_to_locate_sql,
             exp.StrToDate: _str_to_date_sql,
             exp.StrToTime: _str_to_date_sql,

diff --git a/sqlglot/dialects/postgres.py b/sqlglot/dialects/postgres.py
@@ -381,25 +381,29 @@ class Generator(generator.Generator):
         TRANSFORMS = {
             **generator.Generator.TRANSFORMS,
             exp.AnyValue: any_value_to_max_sql,
+            exp.Array: lambda self, e: f"{self.normalize_func('ARRAY')}({self.sql(e.expressions[0])})"
+            if isinstance(seq_get(e.expressions, 0), exp.Select)
+            else f"{self.normalize_func('ARRAY')}[{self.expressions(e, flat=True)}]",
             exp.ArrayConcat: rename_func("ARRAY_CAT"),
             exp.ArrayContained: lambda self, e: self.binary(e, "<@"),
             exp.ArrayContains: lambda self, e: self.binary(e, "@>"),
             exp.ArrayOverlaps: lambda self, e: self.binary(e, "&&"),
             exp.BitwiseXor: lambda self, e: self.binary(e, "#"),
             exp.ColumnDef: transforms.preprocess([_auto_increment_to_serial, _serial_to_generated]),
+            exp.CurrentDate: no_paren_current_date_sql,
+            exp.CurrentTimestamp: lambda *_: "CURRENT_TIMESTAMP",
+            exp.DateAdd: _date_add_sql("+"),
+            exp.DateDiff: _date_diff_sql,
+            exp.DateStrToDate: datestrtodate_sql,
+            exp.DataType: _datatype_sql,
+            exp.DateSub: _date_add_sql("-"),
             exp.Explode: rename_func("UNNEST"),
+            exp.GroupConcat: _string_agg_sql,
             exp.JSONExtract: arrow_json_extract_sql,
             exp.JSONExtractScalar: arrow_json_extract_scalar_sql,
             exp.JSONBExtract: lambda self, e: self.binary(e, "#>"),
             exp.JSONBExtractScalar: lambda self, e: self.binary(e, "#>>"),
             exp.JSONBContains: lambda self, e: self.binary(e, "?"),
-            exp.Pow: lambda self, e: self.binary(e, "^"),
-            exp.CurrentDate: no_paren_current_date_sql,
-            exp.CurrentTimestamp: lambda *_: "CURRENT_TIMESTAMP",
-            exp.DateAdd: _date_add_sql("+"),
-            exp.DateStrToDate: datestrtodate_sql,
-            exp.DateSub: _date_add_sql("-"),
-            exp.DateDiff: _date_diff_sql,
             exp.LogicalOr: rename_func("BOOL_OR"),
             exp.LogicalAnd: rename_func("BOOL_AND"),
             exp.Max: max_or_greatest,
@@ -413,8 +417,10 @@ class Generator(generator.Generator):
                 [transforms.add_within_group_for_percentiles]
             ),
             exp.Pivot: no_pivot_sql,
+            exp.Pow: lambda self, e: self.binary(e, "^"),
             exp.RegexpLike: lambda self, e: self.binary(e, "~"),
             exp.RegexpILike: lambda self, e: self.binary(e, "~*"),
+            exp.Select: transforms.preprocess([transforms.eliminate_semi_and_anti_joins]),
             exp.StrPosition: str_position_sql,
             exp.StrToTime: lambda self, e: f"TO_TIMESTAMP({self.sql(e, 'this')}, {self.format_time(e)})",
             exp.Substring: _substring_sql,
@@ -427,11 +433,6 @@ class Generator(generator.Generator):
             exp.TryCast: no_trycast_sql,
             exp.TsOrDsToDate: ts_or_ds_to_date_sql("postgres"),
             exp.UnixToTime: lambda self, e: f"TO_TIMESTAMP({self.sql(e, 'this')})",
-            exp.DataType: _datatype_sql,
-            exp.GroupConcat: _string_agg_sql,
-            exp.Array: lambda self, e: f"{self.normalize_func('ARRAY')}({self.sql(e.expressions[0])})"
-            if isinstance(seq_get(e.expressions, 0), exp.Select)
-            else f"{self.normalize_func('ARRAY')}[{self.expressions(e, flat=True)}]",
             exp.Xor: bool_xor_sql,
         }
 

diff --git a/sqlglot/dialects/presto.py b/sqlglot/dialects/presto.py
@@ -333,6 +333,7 @@ class Generator(generator.Generator):
                     transforms.eliminate_qualify,
                     transforms.eliminate_distinct_on,
                     transforms.explode_to_unnest(1),
+                    transforms.eliminate_semi_and_anti_joins,
                 ]
             ),
             exp.SortArray: _no_sort_array,

diff --git a/sqlglot/dialects/redshift.py b/sqlglot/dialects/redshift.py
@@ -138,7 +138,9 @@ class Generator(Postgres.Generator):
             exp.JSONExtract: _json_sql,
             exp.JSONExtractScalar: _json_sql,
             exp.SafeConcat: concat_to_dpipe_sql,
-            exp.Select: transforms.preprocess([transforms.eliminate_distinct_on]),
+            exp.Select: transforms.preprocess(
+                [transforms.eliminate_distinct_on, transforms.eliminate_semi_and_anti_joins]
+            ),
             exp.SortKeyProperty: lambda self, e: f"{'COMPOUND ' if e.args['compound'] else ''}SORTKEY({self.format_args(*e.this)})",
             exp.TsOrDsToDate: ts_or_ds_to_date_sql("redshift"),
         }

diff --git a/sqlglot/dialects/snowflake.py b/sqlglot/dialects/snowflake.py
@@ -281,7 +281,7 @@ class Parser(parser.Parser):
             ),
         }
 
-        TIMESTAMPS = parser.Parser.TIMESTAMPS.copy() - {TokenType.TIME}
+        TIMESTAMPS = parser.Parser.TIMESTAMPS - {TokenType.TIME}
 
         RANGE_PARSERS = {
             **parser.Parser.RANGE_PARSERS,
@@ -413,7 +413,11 @@ class Generator(generator.Generator):
             exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
             exp.RegexpILike: _regexpilike_sql,
             exp.Select: transforms.preprocess(
-                [transforms.eliminate_distinct_on, transforms.explode_to_unnest(0)]
+                [
+                    transforms.eliminate_distinct_on,
+                    transforms.explode_to_unnest(0),
+                    transforms.eliminate_semi_and_anti_joins,
+                ]
             ),
             exp.StarMap: rename_func("OBJECT_CONSTRUCT"),
             exp.StartsWith: rename_func("STARTSWITH"),

diff --git a/sqlglot/dialects/sqlite.py b/sqlglot/dialects/sqlite.py
@@ -125,7 +125,11 @@ class Generator(generator.Generator):
             exp.Pivot: no_pivot_sql,
             exp.SafeConcat: concat_to_dpipe_sql,
             exp.Select: transforms.preprocess(
-                [transforms.eliminate_distinct_on, transforms.eliminate_qualify]
+                [
+                    transforms.eliminate_distinct_on,
+                    transforms.eliminate_qualify,
+                    transforms.eliminate_semi_and_anti_joins,
+                ]
             ),
             exp.TableSample: no_tablesample_sql,
             exp.TimeStrToTime: lambda self, e: self.sql(e, "this"),

diff --git a/sqlglot/dialects/teradata.py b/sqlglot/dialects/teradata.py
@@ -168,7 +168,9 @@ class Generator(generator.Generator):
             **generator.Generator.TRANSFORMS,
             exp.Max: max_or_greatest,
             exp.Min: min_or_least,
-            exp.Select: transforms.preprocess([transforms.eliminate_distinct_on]),
+            exp.Select: transforms.preprocess(
+                [transforms.eliminate_distinct_on, transforms.eliminate_semi_and_anti_joins]
+            ),
             exp.StrToDate: lambda self, e: f"CAST({self.sql(e, 'this')} AS DATE FORMAT {self.format_time(e)})",
             exp.ToChar: lambda self, e: self.function_fallback_sql(e),
             exp.Use: lambda self, e: f"DATABASE {self.sql(e, 'this')}",

diff --git a/sqlglot/dialects/tsql.py b/sqlglot/dialects/tsql.py
@@ -613,7 +613,9 @@ class Generator(generator.Generator):
             exp.MD5: lambda self, e: self.func("HASHBYTES", exp.Literal.string("MD5"), e.this),
             exp.Min: min_or_least,
             exp.NumberToStr: _format_sql,
-            exp.Select: transforms.preprocess([transforms.eliminate_distinct_on]),
+            exp.Select: transforms.preprocess(
+                [transforms.eliminate_distinct_on, transforms.eliminate_semi_and_anti_joins]
+            ),
             exp.SHA: lambda self, e: self.func("HASHBYTES", exp.Literal.string("SHA1"), e.this),
             exp.SHA2: lambda self, e: self.func(
                 "HASHBYTES",

diff --git a/sqlglot/transforms.py b/sqlglot/transforms.py
@@ -347,6 +347,31 @@ def epoch_cast_to_ts(expression: exp.Expression) -> exp.Expression:
     return expression
 
 
+def timestamp_to_cast(expression: exp.Expression) -> exp.Expression:  # Rewrite an exp.Timestamp lacking an `expression` arg as a cast to TIMESTAMP.
+    if isinstance(expression, exp.Timestamp) and not expression.expression:  # only when the node's `expression` arg is unset/falsy
+        return exp.cast(
+            expression.this,  # the operand being cast is the Timestamp node's `this`
+            to=exp.DataType.Type.TIMESTAMP,
+        )
+    return expression  # any other node is handed back as-is
+
+
+def eliminate_semi_and_anti_joins(expression: exp.Expression) -> exp.Expression:  # Rewrite SEMI/ANTI joins of a Select as [NOT] EXISTS predicates in its WHERE.
+    if isinstance(expression, exp.Select):  # nodes of any other type fall through to the final return
+        for join in expression.args.get("joins") or []:  # "joins" may be missing/None, hence the `or []`
+            on = join.args.get("on")
+            if on and join.kind in ("SEMI", "ANTI"):  # joins without an ON condition, or of another kind, are left alone
+                subquery = exp.select("1").from_(join.this).where(on)  # SELECT 1 FROM <joined source> WHERE <on condition>
+                exists = exp.Exists(this=subquery)
+                if join.kind == "ANTI":  # ANTI is rendered as NOT EXISTS, SEMI as plain EXISTS
+                    exists = exists.not_(copy=False)
+
+                join.pop()  # drop the join from the query; NOTE(review): runs while iterating the joins - confirm pop() does not mutate that list in place
+                expression.where(exists, copy=False)  # add the predicate to this select's WHERE; presumably ANDed with any existing condition - confirm
+
+    return expression  # with copy=False above, this is presumably the same node edited in place - TODO confirm
+
+
 def preprocess(
     transforms: t.List[t.Callable[[exp.Expression], exp.Expression]],
 ) -> t.Callable[[Generator, exp.Expression], str]:
@@ -391,12 +416,3 @@ def _to_sql(self, expression: exp.Expression) -> str:
         raise ValueError(f"Unsupported expression type {expression.__class__.__name__}.")
 
     return _to_sql
-
-
-def timestamp_to_cast(expression: exp.Expression) -> exp.Expression:
-    if isinstance(expression, exp.Timestamp) and not expression.expression:
-        return exp.cast(
-            expression.this,
-            to=exp.DataType.Type.TIMESTAMP,
-        )
-    return expression
diff --git a/tests/dialects/test_duckdb.py b/tests/dialects/test_duckdb.py
@@ -6,6 +6,33 @@ class TestDuckDB(Validator):
     dialect = "duckdb"
 
     def test_duckdb(self):
+        for join_type in ("SEMI", "ANTI"):
+            exists = "EXISTS" if join_type == "SEMI" else "NOT EXISTS"
+            self.validate_all(
+                f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                write={
+                    "bigquery": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "clickhouse": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                    "databricks": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                    "doris": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "drill": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "duckdb": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                    "hive": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                    "mysql": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "oracle": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                    "postgres": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "presto": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "redshift": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "snowflake": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "spark": f"SELECT * FROM t1 {join_type} JOIN t2 ON t1.x = t2.x",
+                    "sqlite": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "starrocks": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "teradata": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "trino": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                    "tsql": f"SELECT * FROM t1 WHERE {exists}(SELECT 1 FROM t2 WHERE t1.x = t2.x)",
+                },
+            )
+
         self.validate_all(
             "SELECT UNNEST(ARRAY[1, 2, 3]), UNNEST(ARRAY[4, 5]), UNNEST(ARRAY[6])",
             write={