diff --git a/Cargo.toml b/Cargo.toml index 9c696cba9de..79c21b5a785 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ bench = false [dependencies] num-traits = "0.2" chrono = { version = "0.4", default_features = false, features = ["std"] } -chrono-tz = { version = "0.4", optional = true } +chrono-tz = { version = "0.5", optional = true } # To efficiently cast numbers to strings lexical-core = { version = "0.7", optional = true } # We need to Hash values before sending them to an hasher. This diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 9c26cf6ae67..27c1977e410 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -124,14 +124,16 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, Date32) => true, (Utf8, Date64) => true, - (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true, + (Utf8, Timestamp(TimeUnit::Nanosecond, _)) => true, (Utf8, LargeUtf8) => true, (Utf8, _) => is_numeric(to_type), (LargeUtf8, Date32) => true, (LargeUtf8, Date64) => true, - (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true, + (LargeUtf8, Timestamp(TimeUnit::Nanosecond, _)) => true, (LargeUtf8, Utf8) => true, (LargeUtf8, _) => is_numeric(to_type), + (Timestamp(_, _), Utf8) => true, + (Timestamp(_, _), LargeUtf8) => true, (_, Utf8) => is_numeric(from_type) || from_type == &Binary, (_, LargeUtf8) => is_numeric(from_type) || from_type == &Binary, @@ -509,10 +511,10 @@ fn cast_with_options( Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()) .map(|x| Box::new(x) as Box), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), Timestamp(TimeUnit::Nanosecond, Some(tz)) => { utf8_to_timestamp_ns_dyn::(array, tz.clone()) } - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -541,6 
+543,21 @@ fn cast_with_options( let array = Utf8Array::::from_trusted_len_iter(iter); Ok(Box::new(array)) } + Timestamp(from_unit, Some(tz)) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(timestamp_to_utf8::( + from, + from_unit.clone(), + tz, + )?)) + } + Timestamp(from_unit, None) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(naive_timestamp_to_utf8::( + from, + from_unit.clone(), + ))) + } _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -569,6 +586,21 @@ fn cast_with_options( let array = Utf8Array::::from_trusted_len_iter(iter); Ok(Box::new(array)) } + Timestamp(from_unit, Some(tz)) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(timestamp_to_utf8::( + from, + from_unit.clone(), + tz, + )?)) + } + Timestamp(from_unit, None) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(naive_timestamp_to_utf8::( + from, + from_unit.clone(), + ))) + } _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -797,8 +829,8 @@ fn cast_with_options( } (Timestamp(_, _), Int64) => primitive_to_same_primitive_dyn::(array, to_type), (Int64, Timestamp(_, _)) => primitive_to_same_primitive_dyn::(array, to_type), - (Timestamp(from_unit, tz1), Timestamp(to_unit, tz2)) if tz1 == tz2 => { - primitive_dyn!(array, timestamp_to_timestamp, from_unit, to_unit, tz2) + (Timestamp(from_unit, _), Timestamp(to_unit, tz)) => { + primitive_dyn!(array, timestamp_to_timestamp, from_unit, to_unit, tz) } (Timestamp(from_unit, _), Date32) => primitive_dyn!(array, timestamp_to_date32, from_unit), (Timestamp(from_unit, _), Date64) => primitive_dyn!(array, timestamp_to_date64, from_unit), diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index 78171193e91..997779b536e 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -5,6 
+5,7 @@ use crate::{ bitmap::Bitmap, compute::arity::unary, datatypes::{DataType, TimeUnit}, + error::ArrowError, temporal_conversions::*, types::NativeType, }; @@ -278,3 +279,144 @@ pub fn timestamp_to_timestamp( unary(from, |x| (x * (to_size / from_size)), to_type) } } + +fn timestamp_to_utf8_impl( + from: &PrimitiveArray, + time_unit: TimeUnit, + timezone: T, +) -> Utf8Array +where + T::Offset: std::fmt::Display, +{ + match time_unit { + TimeUnit::Nanosecond => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_ns_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Microsecond => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_us_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Millisecond => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_ms_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Second => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_s_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + } +} + +#[cfg(feature = "chrono-tz")] +fn chrono_tz_timestamp_to_utf8( + from: &PrimitiveArray, + time_unit: TimeUnit, + timezone_str: &str, +) -> Result> { + let timezone = parse_offset_tz(timezone_str); + if let Some(timezone) = timezone { + Ok(timestamp_to_utf8_impl::( + from, time_unit, timezone, + )) + } else { + 
Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed", + timezone_str + ))) + } +} + +#[cfg(not(feature = "chrono-tz"))] +fn chrono_tz_timestamp_to_utf8( + _: &PrimitiveArray, + _: TimeUnit, + timezone_str: &str, +) -> Result> { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed (feature chrono-tz is not active)", + timezone_str + ))) +} + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the timestamp in the rfc3339 format. +pub fn timestamp_to_utf8( + from: &PrimitiveArray, + time_unit: TimeUnit, + timezone_str: &str, +) -> Result> { + let timezone = parse_offset(timezone_str); + + if let Ok(timezone) = timezone { + Ok(timestamp_to_utf8_impl::( + from, time_unit, timezone, + )) + } else { + chrono_tz_timestamp_to_utf8(from, time_unit, timezone_str) + } +} + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the timestamp in the rfc3339 format. +pub fn naive_timestamp_to_utf8( + from: &PrimitiveArray, + time_unit: TimeUnit, +) -> Utf8Array { + match time_unit { + TimeUnit::Nanosecond => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_ns_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Microsecond => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_us_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Millisecond => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_ms_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Second => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_s_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + } +} diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index c631a212643..6b1a5062fa4 100644 --- a/src/temporal_conversions.rs +++ 
b/src/temporal_conversions.rs @@ -8,7 +8,7 @@ use chrono::{ use crate::datatypes::{DataType, TimeUnit}; use crate::error::Result; use crate::{ - array::{Int64Array, Offset, Utf8Array}, + array::{Offset, PrimitiveArray, Utf8Array}, error::ArrowError, }; @@ -159,34 +159,38 @@ pub fn timeunit_scale(a: &TimeUnit, b: &TimeUnit) -> f64 { } } -pub(crate) fn parse_offset(offset: &str) -> Result { +/// Parses an offset of the form `"+WX:YZ"` or `"UTC"` into [`FixedOffset`]. +/// # Errors +/// If the offset is not in any of the allowed forms. +pub fn parse_offset(offset: &str) -> Result { if offset == "UTC" { return Ok(FixedOffset::east(0)); } + let error = "timezone offset must be of the form [-]00:00"; + let mut a = offset.split(':'); - let first = a.next().map(Ok).unwrap_or_else(|| { - Err(ArrowError::InvalidArgumentError( - "timezone offset must be of the form [-]00:00".to_string(), - )) - })?; - let last = a.next().map(Ok).unwrap_or_else(|| { - Err(ArrowError::InvalidArgumentError( - "timezone offset must be of the form [-]00:00".to_string(), - )) - })?; - let hours: i32 = first.parse().map_err(|_| { - ArrowError::InvalidArgumentError("timezone offset must be of the form [-]00:00".to_string()) - })?; - let minutes: i32 = last.parse().map_err(|_| { - ArrowError::InvalidArgumentError("timezone offset must be of the form [-]00:00".to_string()) - })?; + let first = a + .next() + .map(Ok) + .unwrap_or_else(|| Err(ArrowError::InvalidArgumentError(error.to_string())))?; + let last = a + .next() + .map(Ok) + .unwrap_or_else(|| Err(ArrowError::InvalidArgumentError(error.to_string())))?; + let hours: i32 = first + .parse() + .map_err(|_| ArrowError::InvalidArgumentError(error.to_string()))?; + let minutes: i32 = last + .parse() + .map_err(|_| ArrowError::InvalidArgumentError(error.to_string()))?; Ok(FixedOffset::east(hours * 60 * 60 + minutes * 60)) } -// not public to not expose TimeZone +/// Parses `value` to `Option` consistent with the Arrow's definition of timestamp with 
timezone. +/// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`). #[inline] -pub(crate) fn utf8_to_timestamp_ns_scalar( +pub fn utf8_to_timestamp_ns_scalar( value: &str, fmt: &str, tz: &T, @@ -196,7 +200,9 @@ pub(crate) fn utf8_to_timestamp_ns_scalar( let r = parse(&mut parsed, value, fmt).ok(); if r.is_some() { parsed - .to_datetime_with_timezone(tz) + .to_datetime() + .map(|x| x.naive_utc()) + .map(|x| tz.from_utc_datetime(&x)) .map(|x| x.timestamp_nanos()) .ok() } else { @@ -204,6 +210,7 @@ pub(crate) fn utf8_to_timestamp_ns_scalar( } } +/// Parses `value` to `Option` consistent with the Arrow's definition of timestamp without timezone. #[inline] pub fn utf8_to_naive_timestamp_ns_scalar(value: &str, fmt: &str) -> Option { let fmt = StrftimeItems::new(fmt); @@ -220,23 +227,28 @@ fn utf8_to_timestamp_ns_impl( fmt: &str, timezone: String, tz: T, -) -> Int64Array { +) -> PrimitiveArray { let iter = array .iter() .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz))); - Int64Array::from_trusted_len_iter(iter) + PrimitiveArray::from_trusted_len_iter(iter) .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone))) } +#[cfg(feature = "chrono-tz")] +pub(crate) fn parse_offset_tz(tz: &str) -> Option { + tz.parse::().ok() +} + #[cfg(feature = "chrono-tz")] fn chrono_tz_utf_to_timestamp_ns( array: &Utf8Array, fmt: &str, timezone: String, -) -> Result { - let tz = timezone.as_str().parse::(); - if let Ok(tz) = tz { +) -> Result> { + let tz = parse_offset_tz(&timezone); + if let Some(tz) = tz { Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) } else { Err(ArrowError::InvalidArgumentError(format!( @@ -251,23 +263,26 @@ fn chrono_tz_utf_to_timestamp_ns( _: &Utf8Array, _: &str, timezone: String, -) -> Result { +) -> Result> { Err(ArrowError::InvalidArgumentError(format!( "timezone \"{}\" cannot be parsed (feature chrono-tz is not active)", timezone ))) } -/// Parses a [`Utf8Array`] to a time-aware timestamp, i.e. 
[`Int64Array`] with type `Timestamp(Nanosecond, Some(timezone))`. -/// When the value represents a string with another timezone, a conversion is applied. -/// Null elements remain null; non-parsable elements are set to null. +/// Parses a [`Utf8Array`] to a timezone-aware timestamp, i.e. [`PrimitiveArray`] with type `Timestamp(Nanosecond, Some(timezone))`. +/// # Implementation +/// * parsed values with timezone other than `timezone` are converted to `timezone`. +/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp_ns`] to parse naive timestamps. +/// * Null elements remain null; non-parsable elements are null. +/// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. /// # Error -/// This function errors iff `timezone` is not parsiable to an offset. +/// This function errors iff `timezone` is not parsable to an offset. pub fn utf8_to_timestamp_ns( array: &Utf8Array, fmt: &str, timezone: String, -) -> Result { +) -> Result> { let tz = parse_offset(timezone.as_str()); if let Ok(tz) = tz { @@ -277,13 +292,17 @@ pub fn utf8_to_timestamp_ns( } } -/// Parses a [`Utf8Array`] to naive timestamp, i.e. [`Int64Array`] with type `Timestamp(Nanosecond, None)`. -/// Timezones are ignored. -/// Null elements remain null; non-parsable elements are set to null. 
-pub fn utf8_to_naive_timestamp_ns(array: &Utf8Array, fmt: &str) -> Int64Array { +pub fn utf8_to_naive_timestamp_ns( + array: &Utf8Array, + fmt: &str, +) -> PrimitiveArray { let iter = array .iter() .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); - Int64Array::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None)) + PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None)) } diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 8836fc098a4..8e0bb96166c 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -227,10 +227,9 @@ fn bool_to_binary() { } #[test] -#[should_panic(expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported")] fn int32_to_timestamp() { let array = Int32Array::from(&[Some(2), Some(10), None]); - cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + assert!(cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).is_err()); } #[test] @@ -251,6 +250,7 @@ fn consistency() { Float64, Timestamp(TimeUnit::Second, None), Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Millisecond, Some("+01:00".to_string())), Timestamp(TimeUnit::Microsecond, None), Timestamp(TimeUnit::Nanosecond, None), Time64(TimeUnit::Microsecond), @@ -471,6 +471,56 @@ fn list_to_list() { assert_eq!(expected, result.as_ref()); } +#[test] +fn timestamp_with_tz_to_utf8() { + let tz = "-02:00".to_string(); + let expected = + Utf8Array::::from_slice(&["1996-12-19T16:39:57-02:00", "1996-12-19T17:39:57-02:00"]); + let array = Int64Array::from_slice(&[851020797000000000, 851024397000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + +#[test] +fn utf8_to_timestamp_with_tz() { + let tz = "-02:00".to_string(); + let array = + Utf8Array::::from_slice(&["1996-12-19T16:39:57-02:00", 
"1996-12-19T17:39:57-02:00"]); + // the timezone is used to map the time to UTC. + let expected = Int64Array::from_slice(&[851020797000000000, 851024397000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + +#[test] +fn utf8_to_naive_timestamp() { + let array = + Utf8Array::::from_slice(&["1996-12-19T16:39:57-02:00", "1996-12-19T17:39:57-02:00"]); + // the timezone is disregarded from the string and we assume UTC + let expected = Int64Array::from_slice(&[851013597000000000, 851017197000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + +#[test] +fn naive_timestamp_to_utf8() { + let array = Int64Array::from_slice(&[851013597000000000, 851017197000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); + + let expected = Utf8Array::::from_slice(&["1996-12-19 16:39:57", "1996-12-19 17:39:57"]); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + println!("{}", result); + println!("{}", expected); + assert_eq!(expected, result.as_ref()); +} + /* #[test] fn dict_to_dict_bad_index_value_primitive() { diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs index 4f313b404c4..94e933728dc 100644 --- a/tests/it/temporal_conversions.rs +++ b/tests/it/temporal_conversions.rs @@ -8,16 +8,29 @@ fn naive() { let array = Utf8Array::::from_slice(&[ "1996-12-19T16:39:57-02:00", "1996-12-19T13:39:57-03:00", - "1996-12-19 13:39:57-03:00", + "1996-12-19 13:39:57-03:00", // missing T ]); let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); assert_eq!(format!("{}", r), expected); - let fmt = "%Y-%m-%dT%H:%M:%S"; + let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info let array = Utf8Array::::from_slice(&[ "1996-12-19T16:39:57-02:00", 
"1996-12-19T13:39:57-03:00", - "1996-12-19 13:39:57-03:00", + "1996-12-19 13:39:57-03:00", // missing T + ]); + let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + assert_eq!(format!("{}", r), expected); +} + +#[test] +fn naive_no_tz() { + let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, ]"; + let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57", + "1996-12-19T13:39:57", + "1996-12-19 13:39:57", // missing T ]); let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); assert_eq!(format!("{}", r), expected); @@ -27,13 +40,27 @@ fn naive() { fn tz_aware() { let tz = "-02:00".to_string(); let expected = - "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 13:39:57 -02:00, ]"; + "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 17:39:57 -02:00, ]"; let fmt = "%Y-%m-%dT%H:%M:%S%.f%:z"; let array = Utf8Array::::from_slice(&[ "1996-12-19T16:39:57.0-02:00", - "1996-12-19T13:39:57.0-02:00", + "1996-12-19T16:39:57.0-03:00", // same time at a different TZ "1996-12-19 13:39:57.0-03:00", ]); let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); assert_eq!(format!("{}", r), expected); } + +#[test] +fn tz_aware_no_timezone() { + let tz = "-02:00".to_string(); + let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[, , ]"; + let fmt = "%Y-%m-%dT%H:%M:%S%.f"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57.0", + "1996-12-19T17:39:57.0", + "1996-12-19 13:39:57.0", + ]); + let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + assert_eq!(format!("{}", r), expected); +}