From 30d9bf52680e2475396b47fe406f63045c949855 Mon Sep 17 00:00:00 2001 From: Kunal Kundu Date: Tue, 19 Mar 2024 15:30:48 +0530 Subject: [PATCH 1/6] improve null handling for to_char --- datafusion/functions/src/datetime/to_char.rs | 21 ++++++++++++++++--- .../sqllogictest/test_files/timestamps.slt | 14 ++++++++++--- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 90b3a1d35353..13bae7710033 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -171,7 +171,11 @@ fn _to_char_scalar( // of the implementation in arrow-rs we need to convert it to an array let data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let array = expression.into_array(1)?; + let array_from_expr = expression.into_array(1)?; + let array = match format { + Some(_) => array_from_expr, + None => ColumnarValue::create_null_array(array_from_expr.len()).into_array(1)?, + }; let format_options = match _build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, @@ -215,8 +219,19 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { }; // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; - let result = formatter.value(idx).try_to_string(); + let result = match format { + Some(_) => { + let formatter = + ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; + formatter.value(idx).try_to_string() + } + None => { + let null_array = ColumnarValue::create_null_array(1).into_array(1)?; + let formatter = + ArrayFormatter::try_new(null_array.as_ref(), &format_options)?; + formatter.value(0).try_to_string() + } + }; match result { Ok(value) => results.push(value), Err(e) => return exec_err!("{}", e), diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index f718bbf14cbc..3af23e632fb2 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -2661,7 +2661,7 @@ PT123456S query T select to_char(arrow_cast(123456, 'Duration(Second)'), null); ---- -PT123456S +(empty) query error DataFusion error: Execution error: Cast error: Format error SELECT to_char(timestamps, '%X%K') from formats; @@ -2672,14 +2672,22 @@ SELECT to_char('2000-02-03'::date, '%X%K'); query T SELECT to_char(timestamps, null) from formats; ---- -2024-01-01T06:00:00Z -2025-01-01T23:59:58Z +(empty) +(empty) query T SELECT to_char(null, '%d-%m-%Y'); ---- (empty) +query T +SELECT to_char(column1, column2) +FROM +(VALUES ('2024-01-01 06:00:00'::timestamp, null), ('2025-01-01 23:59:58'::timestamp, '%d:%m:%Y %H-%M-%S')); +---- +(empty) +01:01:2025 23-59-58 + statement ok drop table formats; From b2789c206ee4976a2ed9fe6e48a9dcf6f88475e7 Mon Sep 17 00:00:00 2001 From: Kunal Kundu Date: Wed, 20 Mar 2024 23:12:01 +0530 Subject: [PATCH 2/6] early return from to_char for null format --- datafusion/functions/src/datetime/to_char.rs | 38 ++++++++++---------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 13bae7710033..7f7423b41389 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -171,11 +171,20 @@ fn _to_char_scalar( // of the implementation in arrow-rs we need to convert it to an array let data_type = &expression.data_type(); let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_)); - let array_from_expr = expression.into_array(1)?; - let array = match format { - Some(_) => array_from_expr, - None => ColumnarValue::create_null_array(array_from_expr.len()).into_array(1)?, - }; + let array = expression.into_array(1)?; + + if format.is_none() { + if is_scalar_expression { + return Ok(ColumnarValue::Scalar(ScalarValue::Utf8( + Some(String::new()), + ))); + } else { + return Ok(ColumnarValue::Array(Arc::new(StringArray::from( + vec![String::new(); array.len()], + )))); + } + } + let format_options = match _build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, @@ -213,25 +222,18 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { } else { Some(format_array.value(idx)) }; + if format.is_none() { + results.push(String::new()); + continue; + } let format_options = match _build_format_options(data_type, format) { Ok(value) => value, Err(value) => return value, }; // this isn't ideal but this can't use ValueFormatter as it isn't independent // from ArrayFormatter - let result = match format { - Some(_) => { - let formatter = - ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; - formatter.value(idx).try_to_string() - } - None => { - let null_array = ColumnarValue::create_null_array(1).into_array(1)?; - let formatter = - ArrayFormatter::try_new(null_array.as_ref(), &format_options)?; - formatter.value(0).try_to_string() - } - }; + let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; + let result = formatter.value(idx).try_to_string(); match result { Ok(value) => results.push(value), Err(e) => return exec_err!("{}", e), From e505d6cda4097a06654e17e51c456f19d053be61 Mon Sep 17 00:00:00 2001 From: Kunal Kundu Date: Thu, 21 Mar 2024 00:05:49 +0530 Subject: [PATCH 3/6] remove invalid comment, update example --- datafusion-examples/examples/to_char.rs | 19 +++++++++++++++++++ datafusion/functions/src/datetime/to_char.rs | 1 - 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/datafusion-examples/examples/to_char.rs b/datafusion-examples/examples/to_char.rs index e99f69fbcd55..1c55ff4bf87b 100644 --- a/datafusion-examples/examples/to_char.rs +++ b/datafusion-examples/examples/to_char.rs @@ -193,5 +193,24 @@ async fn main() -> Result<()> { &result ); + // output format is null + + let result = ctx + .sql("SELECT to_char(arrow_cast(123456, 'Duration(Second)'), null)") + .await? + .collect() + .await?; + + assert_batches_eq!( + &[ + "+-----------------------------+", + "| to_char(Int64(123456),NULL) |", + "+-----------------------------+", + "| |", + "+-----------------------------+", + ], + &result + ); + Ok(()) } diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 7f7423b41389..9c03c54c13be 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -106,7 +106,6 @@ impl ScalarUDFImpl for ToCharFunc { } match &args[1] { - // null format, use default formats ColumnarValue::Scalar(ScalarValue::Utf8(None)) | ColumnarValue::Scalar(ScalarValue::Null) => { _to_char_scalar(args[0].clone(), None) From a3ad0309fa6836a6689192dbe9d86ccd098aa10f Mon Sep 17 00:00:00 2001 From: Kunal Kundu Date: Thu, 21 Mar 2024 00:31:25 +0530 Subject: [PATCH 4/6] rename column for consistency across platforms for tests --- datafusion-examples/examples/to_char.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion-examples/examples/to_char.rs b/datafusion-examples/examples/to_char.rs index 1c55ff4bf87b..953eecd79a6d 100644 --- a/datafusion-examples/examples/to_char.rs +++ b/datafusion-examples/examples/to_char.rs @@ -196,18 +196,18 @@ async fn main() -> Result<()> { // output format is null let result = ctx - .sql("SELECT to_char(arrow_cast(123456, 'Duration(Second)'), null)") + .sql("SELECT to_char(arrow_cast(123456, 'Duration(Second)'), null) as result") .await? .collect() .await?; assert_batches_eq!( &[ - "+-----------------------------+", - "| to_char(Int64(123456),NULL) |", - "+-----------------------------+", - "| |", - "+-----------------------------+", + "+--------+", + "| result |", + "+--------+", + "| |", + "+--------+", ], &result ); From 5156731779a92f154fcdbaa1c6e066a3cec96467 Mon Sep 17 00:00:00 2001 From: Kunal Kundu Date: Sat, 23 Mar 2024 16:46:25 +0530 Subject: [PATCH 5/6] return None instead of empty string from to_char --- datafusion/functions/src/datetime/to_char.rs | 24 +++++++++---------- .../sqllogictest/test_files/timestamps.slt | 8 +++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 9c03c54c13be..2bf2e4688ebd 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -174,13 +174,10 @@ fn _to_char_scalar( if format.is_none() { if is_scalar_expression { - return Ok(ColumnarValue::Scalar(ScalarValue::Utf8( - Some(String::new()), - ))); + return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); } else { - return Ok(ColumnarValue::Array(Arc::new(StringArray::from( - vec![String::new(); array.len()], - )))); + let result: Vec> = vec![None; array.len()]; + return Ok(ColumnarValue::Array(Arc::new(StringArray::from(result)))); } } @@ -211,7 +208,7 @@ fn _to_char_scalar( fn _to_char_array(args: &[ColumnarValue]) -> Result { let arrays = ColumnarValue::values_to_arrays(args)?; - let mut results: Vec = vec![]; + let mut results: Vec> = vec![]; let format_array = arrays[1].as_string::(); let data_type = arrays[0].data_type(); @@ -222,7 +219,7 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { Some(format_array.value(idx)) }; if format.is_none() { - results.push(String::new()); + results.push(None); continue; } let format_options = match _build_format_options(data_type, format) { @@ -234,7 +231,7 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { let formatter = ArrayFormatter::try_new(arrays[0].as_ref(), &format_options)?; let result = formatter.value(idx).try_to_string(); match result { - Ok(value) => results.push(value), + Ok(value) => results.push(Some(value)), Err(e) => return exec_err!("{}", e), } } @@ -243,9 +240,12 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result { ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(StringArray::from( results, )) as ArrayRef)), - ColumnarValue::Scalar(_) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( - results.first().unwrap().to_string(), - )))), + ColumnarValue::Scalar(_) => match results.first().unwrap() { + Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some( + value.to_string(), + )))), + None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))), + }, } } diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 3af23e632fb2..f0e04b522a78 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -2661,7 +2661,7 @@ PT123456S query T select to_char(arrow_cast(123456, 'Duration(Second)'), null); ---- -(empty) +NULL query error DataFusion error: Execution error: Cast error: Format error SELECT to_char(timestamps, '%X%K') from formats; @@ -2672,8 +2672,8 @@ SELECT to_char('2000-02-03'::date, '%X%K'); query T SELECT to_char(timestamps, null) from formats; ---- -(empty) -(empty) +NULL +NULL query T SELECT to_char(null, '%d-%m-%Y'); @@ -2685,7 +2685,7 @@ SELECT to_char(column1, column2) FROM (VALUES ('2024-01-01 06:00:00'::timestamp, null), ('2025-01-01 23:59:58'::timestamp, '%d:%m:%Y %H-%M-%S')); ---- -(empty) +NULL 01:01:2025 23-59-58 statement ok From 7c3d2582f319fb11e039e8f72b6bcc4d12d36ac4 Mon Sep 17 00:00:00 2001 From: Kunal Kundu Date: Sun, 24 Mar 2024 02:00:27 +0530 Subject: [PATCH 6/6] use arrow:new_null_array for fast init --- datafusion/functions/src/datetime/to_char.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index 2bf2e4688ebd..4b2552bcbfa6 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use arrow::datatypes::DataType; use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions}; use arrow_array::cast::AsArray; -use arrow_array::{Array, ArrayRef, StringArray}; +use arrow_array::{new_null_array, Array, ArrayRef, StringArray}; use arrow_schema::DataType::{Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8}; use arrow_schema::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; @@ -176,8 +176,10 @@ fn _to_char_scalar( if is_scalar_expression { return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); } else { - let result: Vec> = vec![None; array.len()]; - return Ok(ColumnarValue::Array(Arc::new(StringArray::from(result)))); + return Ok(ColumnarValue::Array(new_null_array( + &DataType::Utf8, + array.len(), + ))); } }