From d014ff218012fb019e9f625416d4cf35051db107 Mon Sep 17 00:00:00 2001 From: Marko Mikulicic Date: Mon, 7 Feb 2022 14:48:17 +0100 Subject: [PATCH] fix: Case insensitive unquoted identifiers (#1747) --- datafusion/src/execution/context.rs | 167 ++++++++++++++++++++++++++++ datafusion/src/sql/planner.rs | 18 +-- datafusion/src/sql/utils.rs | 9 ++ 3 files changed, 182 insertions(+), 12 deletions(-) diff --git a/datafusion/src/execution/context.rs b/datafusion/src/execution/context.rs index 96e49c800f48..2f8663ecae01 100644 --- a/datafusion/src/execution/context.rs +++ b/datafusion/src/execution/context.rs @@ -3359,6 +3359,173 @@ mod tests { assert_eq!(result[0].schema().metadata(), result[1].schema().metadata()); } + #[tokio::test] + async fn normalized_column_identifiers() { + // create local execution context + let mut ctx = ExecutionContext::new(); + + // register csv file with the execution context + ctx.register_csv( + "case_insensitive_test", + "tests/example.csv", + CsvReadOptions::new(), + ) + .await + .unwrap(); + + let sql = "SELECT A, b FROM case_insensitive_test"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| a | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let sql = "SELECT t.A, b FROM case_insensitive_test AS t"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| a | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + // Aliases + + let sql = "SELECT t.A as x, b FROM case_insensitive_test AS t"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| x | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let sql = "SELECT t.A AS X, b FROM case_insensitive_test AS t"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| x | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let sql = r#"SELECT t.A AS "X", b FROM case_insensitive_test AS t"#; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| X | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + // Order by + + let sql = "SELECT t.A AS x, b FROM case_insensitive_test AS t ORDER BY x"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| x | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let sql = "SELECT t.A AS x, b FROM case_insensitive_test AS t ORDER BY X"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| x | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let sql = r#"SELECT t.A AS "X", b FROM case_insensitive_test AS t ORDER BY "X""#; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| X | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + // Where + + let sql = "SELECT a, b FROM case_insensitive_test where A IS NOT null"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| a | b |", + "+---+---+", + "| 1 | 2 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + // Group by + + let sql = "SELECT a as x, count(*) as c FROM case_insensitive_test GROUP BY X"; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| x | c |", + "+---+---+", + "| 1 | 1 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + + let sql = + r#"SELECT a as "X", count(*) as c FROM case_insensitive_test GROUP BY "X""#; + let result = plan_and_collect(&mut ctx, sql) + .await + .expect("ran plan correctly"); + let expected = vec![ + "+---+---+", + "| X | c |", + "+---+---+", + "| 1 | 1 |", + "+---+---+", + ]; + assert_batches_sorted_eq!(expected, &result); + } + struct MyPhysicalPlanner {} #[async_trait] diff --git a/datafusion/src/sql/planner.rs b/datafusion/src/sql/planner.rs index 462977274ecb..682b92ba661f 100644 --- a/datafusion/src/sql/planner.rs +++ b/datafusion/src/sql/planner.rs @@ -36,7 +36,7 @@ use crate::logical_plan::{ use crate::optimizer::utils::exprlist_to_columns; use crate::prelude::JoinType; use crate::scalar::ScalarValue; -use crate::sql::utils::make_decimal_type; +use crate::sql::utils::{make_decimal_type, normalize_ident}; use crate::{ error::{DataFusionError, Result}, physical_plan::udaf::AggregateUDF, @@ -1191,7 +1191,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { SelectItem::UnnamedExpr(expr) => self.sql_to_rex(expr, schema), SelectItem::ExprWithAlias { expr, alias } => Ok(Alias( Box::new(self.sql_to_rex(expr, schema)?), - alias.value.clone(), + normalize_ident(alias), )), SelectItem::Wildcard => Ok(Expr::Wildcard), SelectItem::QualifiedWildcard(_) => Err(DataFusionError::NotImplemented( @@ -1392,6 +1392,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { SQLExpr::Identifier(ref id) => { if id.value.starts_with('@') { + // TODO: figure out if ScalarVariables should be insensitive. let var_names = vec![id.value.clone()]; Ok(Expr::ScalarVariable(var_names)) } else { @@ -1401,7 +1402,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // identifier. (e.g. it is "foo.bar" not foo.bar) Ok(Expr::Column(Column { relation: None, - name: id.value.clone(), + name: normalize_ident(id), })) } } @@ -1418,8 +1419,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } SQLExpr::CompoundIdentifier(ids) => { - let mut var_names: Vec<_> = - ids.iter().map(|id| id.value.clone()).collect(); + let mut var_names: Vec<_> = ids.iter().map(normalize_ident).collect(); if &var_names[0][0..1] == "@" { Ok(Expr::ScalarVariable(var_names)) @@ -1639,13 +1639,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // (e.g. "foo.bar") for function names yet function.name.to_string() } else { - // if there is a quote style, then don't normalize - // the name, otherwise normalize to lowercase - let ident = &function.name.0[0]; - match ident.quote_style { - Some(_) => ident.value.clone(), - None => ident.value.to_ascii_lowercase(), - } + normalize_ident(&function.name.0[0]) }; // first, scalar built-in diff --git a/datafusion/src/sql/utils.rs b/datafusion/src/sql/utils.rs index 0ede5ad8559e..d0cef0f3d376 100644 --- a/datafusion/src/sql/utils.rs +++ b/datafusion/src/sql/utils.rs @@ -18,6 +18,7 @@ //! SQL Utility Functions use arrow::datatypes::DataType; +use sqlparser::ast::Ident; use crate::logical_plan::{Expr, LogicalPlan}; use crate::scalar::{ScalarValue, MAX_PRECISION_FOR_DECIMAL128}; @@ -532,6 +533,14 @@ pub(crate) fn make_decimal_type( } } +// Normalize an identifer to a lowercase string unless the identifier is quoted. +pub(crate) fn normalize_ident(id: &Ident) -> String { + match id.quote_style { + Some(_) => id.value.clone(), + None => id.value.to_ascii_lowercase(), + } +} + #[cfg(test)] mod tests { use super::*;