From 2bce56873ee1aa26c28ed2110234655d2923a33c Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 12 Feb 2025 20:26:27 +0900 Subject: [PATCH] Support converting large dates (i.e. +10999-12-31) from string to Date32 (#7074) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support converting large dates (i.e. +10999-12-31) from string to Date32 * Fix lint * Update arrow-cast/src/parse.rs Co-authored-by: Andrew Lamb * fix: issue introduced in #6833 - less than equal check for scale in decimal conversion (#7070) * fix <= check for scale in decimal conversion * Update arrow-cast/src/cast/mod.rs name change Co-authored-by: Arttu * remove incorrect comment --------- Co-authored-by: Arttu * minor: re-export `OffsetBufferBuilder` in `arrow` crate (#7077) * Add another decimal cast edge test case (#7078) * Add another decimal cast edge test case Before 1019f5b27d3596077bcdd7e10b67e2c6d4cfbf02 this test would fail, as the cast produced 1. 0 is an edge case worth explicitly testing for. 
* typo/fmt Co-authored-by: Felipe Oliveira Carvalho --------- Co-authored-by: Felipe Oliveira Carvalho * Support both 0x01 and 0x02 as type for list of booleans in thrift metadata (#7052) * Support both 0x01 and 0x02 as type for list of booleans * Also support 0 for false inside boolean collections * Use hex notation in tests * Fix LocalFileSystem with range request that ends beyond end of file (#6751) * Fix LocalFileSystem with range request that ends beyond end of file * fix windows * add comment * Seek error * fix seek check * remove windows flag * Get file length from file metadata * Introduce `UnsafeFlag` to manage disabling `ArrayData` validation (#7027) * Introduce UnsafeFlag to manage disabling validation * fix docs * Refactor arrow-ipc: Rename `ArrayReader` to `RecordBatchDecoder` (#7028) * Rename `ArrayReader` to `RecordBatchDecoder` * Remove alias for `self` * Minor: Update release schedule (#7086) * Minor: Update release schedule * realism * Refactor some decimal-related code and tests (#7062) * Refactor some decimal-related code and tests in preparation for adding Decimal32 and Decimal64 support * Fixed symbol * Apply PR feedback * Fixed format problem * Fixed logical merge conflicts * PR feedback * Refactor arrow-ipc: Move `create_*_array` methods into `RecordBatchDecoder` (#7029) * Move `create_primitive_array` into RecordBatchReader * Move `create_list_array` into RecordBatchReader * Move `create_dictionary_array` into RecordBatchReader * Print Parquet BasicTypeInfo id when present (#7094) * Print Parquet BasicTypeInfo id when present * Improve print_schema documentation * tiny cleanup * Add a custom implementation `LocalFileSystem::list_with_offset` (#7019) * Initial change from Daniel. * Upgrade unit test to be more generic. * Add comments on why we have filter * Cleanup unit tests. * Update object_store/src/local.rs Co-authored-by: Adam Reeve * Add changes suggested by Adam. * Cleanup match error. 
* Apply formatting changes suggested by cargo +stable fmt --all. * Apply cosmetic changes suggested by clippy. * Upgrade test_path_with_offset to create temporary directory + files for testing rather than pointing to existing dir. --------- Co-authored-by: Adam Reeve * fix: first none/empty list in `ListArray` panics in `cast_with_options` (#7065) * fix: first none in `ListArray` panics in `cast_with_options` * simplify * fix * Update arrow-cast/src/cast/list.rs Co-authored-by: Jeffrey Vo --------- Co-authored-by: Jeffrey Vo * Benchmarks for Arrow IPC writer (#7090) * Add benchmarks for Arrow IPC writer * Add benchmarks for Arrow IPC writer * reuse target buffer * rename, etc * Add compression type * update --------- Co-authored-by: Andy Grove * Minor: Clarify documentation on `NullBufferBuilder::allocated_size` (#7089) * Minor: Clarify documentation on NullBufferBuilder::allocated_size * add note about why allocations are 64 bytes * Add more tests for edge cases * Add negative test case for incorrectly formatted large dates --------- Co-authored-by: Andrew Lamb Co-authored-by: Himadri Pal Co-authored-by: Arttu Co-authored-by: Piotr Findeisen Co-authored-by: Felipe Oliveira Carvalho Co-authored-by: Jörn Horstmann Co-authored-by: Kyle Barron Co-authored-by: Curt Hagenlocher Co-authored-by: Devin Smith Co-authored-by: Corwin Joy Co-authored-by: Adam Reeve Co-authored-by: irenjj Co-authored-by: Jeffrey Vo Co-authored-by: Andy Grove --- arrow-cast/src/cast/mod.rs | 42 ++++++++++++++++++++++++++++++++++++++ arrow-cast/src/parse.rs | 26 +++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 4bb4fb3e79b..f0e7de056ea 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -4229,6 +4229,48 @@ mod tests { } } + #[test] + fn test_cast_string_with_large_date_to_date32() { + let array = Arc::new(StringArray::from(vec![ + Some("+10999-12-31"), + Some("-0010-02-28"), + 
Some("0010-02-28"), + Some("0000-01-01"), + Some("-0000-01-01"), + Some("-0001-01-01"), + ])) as ArrayRef; + let to_type = DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let b = cast_with_options(&array, &to_type, &options).unwrap(); + let c = b.as_primitive::<Date32Type>(); + assert_eq!(3298139, c.value(0)); // 10999-12-31 + assert_eq!(-723122, c.value(1)); // -0010-02-28 + assert_eq!(-715817, c.value(2)); // 0010-02-28 + assert_eq!(c.value(3), c.value(4)); // Expect 0000-01-01 and -0000-01-01 to be parsed the same + assert_eq!(-719528, c.value(3)); // 0000-01-01 + assert_eq!(-719528, c.value(4)); // -0000-01-01 + assert_eq!(-719893, c.value(5)); // -0001-01-01 + } + + #[test] + fn test_cast_invalid_string_with_large_date_to_date32() { + // Large dates need to be prefixed with a + or - sign, otherwise they are not parsed correctly + let array = Arc::new(StringArray::from(vec![Some("10999-12-31")])) as ArrayRef; + let to_type = DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let err = cast_with_options(&array, &to_type, &options).unwrap_err(); + assert_eq!( + err.to_string(), + "Cast error: Cannot cast string '10999-12-31' to value of Date32 type" + ); + } + #[test] fn test_cast_string_format_yyyymmdd_to_date32() { let a0 = Arc::new(StringViewArray::from(vec![ diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 4e93e9787cc..55834ad92a0 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -595,6 +595,32 @@ const EPOCH_DAYS_FROM_CE: i32 = 719_163; const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and 2262-04-11T23:47:16.854775804"; fn parse_date(string: &str) -> Option<NaiveDate> { + // If the date has an extended (signed) year such as "+10999-12-31" or "-0012-05-06" + // + // According to [ISO 8601], years have: + // Four digits or more for 
the year. Years in the range 0000 to 9999 will be pre-padded by + // zero to ensure four digits. Years outside that range will have a prefixed positive or negative symbol. + // + // [ISO 8601]: https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE + if string.starts_with('+') || string.starts_with('-') { + // Skip the sign and look for the hyphen that terminates the year digits. + // According to ISO 8601 the unsigned part must be at least 4 digits. + let rest = &string[1..]; + let hyphen = rest.find('-')?; + if hyphen < 4 { + return None; + } + // The year substring is the sign and the digits (but not the separator) + // e.g. for "+10999-12-31", hyphen is 5 and string[..6] is "+10999" + let year: i32 = string[..hyphen + 1].parse().ok()?; + // The remainder should begin with a '-' which we strip off, leaving the month-day part. + let remainder = string[hyphen + 1..].strip_prefix('-')?; + let mut parts = remainder.splitn(2, '-'); + let month: u32 = parts.next()?.parse().ok()?; + let day: u32 = parts.next()?.parse().ok()?; + return NaiveDate::from_ymd_opt(year, month, day); + } + if string.len() > 10 { // Try to parse as datetime and return just the date part return string_to_datetime(&Utc, string)