Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve null handling for to_char #9689

Merged
merged 7 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions datafusion-examples/examples/to_char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,5 +193,24 @@ async fn main() -> Result<()> {
&result
);

// output format is null

let result = ctx
.sql("SELECT to_char(arrow_cast(123456, 'Duration(Second)'), null) as result")
.await?
.collect()
.await?;

assert_batches_eq!(
&[
"+--------+",
"| result |",
"+--------+",
"| |",
"+--------+",
],
&result
);

Ok(())
}
18 changes: 17 additions & 1 deletion datafusion/functions/src/datetime/to_char.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ impl ScalarUDFImpl for ToCharFunc {
}

match &args[1] {
// null format, use default formats
ColumnarValue::Scalar(ScalarValue::Utf8(None))
| ColumnarValue::Scalar(ScalarValue::Null) => {
_to_char_scalar(args[0].clone(), None)
Expand Down Expand Up @@ -172,6 +171,19 @@ fn _to_char_scalar(
let data_type = &expression.data_type();
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
let array = expression.into_array(1)?;

if format.is_none() {
if is_scalar_expression {
return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
Some(String::new()),
Copy link
Contributor Author

@tinfoil-knight tinfoil-knight Mar 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using None instead of Some(String::new()) was causing NULL to appear instead of empty in SQL Logic Tests. (Output was same for CLI though.)

[SQL] select to_char(arrow_cast(123456, 'Duration(Second)'), null);
[Diff] (-expected|+actual)
-   (empty)
+   NULL
at test_files/timestamps.slt:2661

I don't think this is what we want. (empty) is the expected value.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought None is the correct value (as it will semantically be aNULL, which is the correct results)

The sqllogictests have special formatting for NULL values (to distinguish them from empty strings)

https://github.com/apache/arrow-datafusion/blob/1d8a41bc8e08b56e90d6f8e6ef20e39a126987e4/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs#L198-L200

However, when I double checked, the behavior of to_date in spark seems to be different yet (passing in a null format seems to simply ignore the format string and instead parses with the default -- it doesn't return null)

>>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])
>>> df.select(functions.to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
[Row(date=datetime.date(1997, 2, 28))]
>>> df.select(functions.to_date(df.t, None).alias('date')).collect()
[Row(date=datetime.date(1997, 2, 28))]
>>> df = spark.createDataFrame([('1997-02-2ddddd',)], ['t'])
>>> df.select(functions.to_date(df.t, None).alias('date')).collect()
[Row(date=None)]
>>> df.select(functions.to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()
[Row(date=None)]

Any hints @Omega359 ?

)));
} else {
return Ok(ColumnarValue::Array(Arc::new(StringArray::from(
vec![String::new(); array.len()],

This comment was marked as outdated.

))));
}
}

let format_options = match _build_format_options(data_type, format) {
Ok(value) => value,
Err(value) => return value,
Expand Down Expand Up @@ -209,6 +221,10 @@ fn _to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
} else {
Some(format_array.value(idx))
};
if format.is_none() {
results.push(String::new());
continue;
}
let format_options = match _build_format_options(data_type, format) {
Ok(value) => value,
Err(value) => return value,
Expand Down
14 changes: 11 additions & 3 deletions datafusion/sqllogictest/test_files/timestamps.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2661,7 +2661,7 @@ PT123456S
query T
select to_char(arrow_cast(123456, 'Duration(Second)'), null);
----
PT123456S
(empty)

query error DataFusion error: Execution error: Cast error: Format error
SELECT to_char(timestamps, '%X%K') from formats;
Expand All @@ -2672,14 +2672,22 @@ SELECT to_char('2000-02-03'::date, '%X%K');
query T
SELECT to_char(timestamps, null) from formats;
----
2024-01-01T06:00:00Z
2025-01-01T23:59:58Z
(empty)
(empty)

query T
SELECT to_char(null, '%d-%m-%Y');
----
(empty)

query T
SELECT to_char(column1, column2)
FROM
(VALUES ('2024-01-01 06:00:00'::timestamp, null), ('2025-01-01 23:59:58'::timestamp, '%d:%m:%Y %H-%M-%S'));
----
(empty)
01:01:2025 23-59-58

statement ok
drop table formats;

Expand Down
Loading