diff --git a/Cargo.toml b/Cargo.toml index d9c7d9c763c..94f5224613c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,8 @@ bench = false [dependencies] num-traits = "0.2" -chrono = "^0.4" +chrono = { version = "0.4", default_features = false, features = ["std"] } +chrono-tz = { version = "0.5", optional = true } # To efficiently cast numbers to strings lexical-core = { version = "0.8", optional = true } # We need to Hash values before sending them to an hasher. This @@ -85,6 +86,8 @@ full = [ "merge_sort", "ahash", "compute", + # parses timezones used in timestamp conversions + "chrono-tz", ] merge_sort = ["itertools"] io_csv = ["csv", "lazy_static", "regex", "lexical-core", "streaming-iterator"] diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 73bf56fb35e..06666f1f1d0 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -26,14 +26,12 @@ mod binary_to; mod boolean_to; mod dictionary_to; mod primitive_to; -mod timestamps; mod utf8_to; pub use binary_to::*; pub use boolean_to::*; pub use dictionary_to::*; pub use primitive_to::*; -pub use timestamps::*; pub use utf8_to::*; /// options defining how Cast kernels behave @@ -126,14 +124,16 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, Date32) => true, (Utf8, Date64) => true, - (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true, + (Utf8, Timestamp(TimeUnit::Nanosecond, _)) => true, (Utf8, LargeUtf8) => true, (Utf8, _) => is_numeric(to_type), (LargeUtf8, Date32) => true, (LargeUtf8, Date64) => true, - (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true, + (LargeUtf8, Timestamp(TimeUnit::Nanosecond, _)) => true, (LargeUtf8, Utf8) => true, (LargeUtf8, _) => is_numeric(to_type), + (Timestamp(_, _), Utf8) => true, + (Timestamp(_, _), LargeUtf8) => true, (_, Utf8) => is_numeric(from_type) || from_type == &Binary, (_, LargeUtf8) => is_numeric(from_type) || from_type == &Binary, @@ -487,7 +487,10 @@ fn cast_with_options( LargeUtf8 => 
Ok(Box::new(utf8_to_large_utf8( array.as_any().downcast_ref().unwrap(), ))), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, Some(tz)) => { + utf8_to_timestamp_ns_dyn::(array, tz.clone()) + } _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -508,7 +511,10 @@ fn cast_with_options( Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()) .map(|x| Box::new(x) as Box), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, Some(tz)) => { + utf8_to_timestamp_ns_dyn::(array, tz.clone()) + } _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -537,6 +543,14 @@ fn cast_with_options( let array = Utf8Array::::from_trusted_len_iter(iter); Ok(Box::new(array)) } + Timestamp(from_unit, Some(tz)) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(timestamp_to_utf8::(from, *from_unit, tz)?)) + } + Timestamp(from_unit, None) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(naive_timestamp_to_utf8::(from, *from_unit))) + } _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -565,6 +579,14 @@ fn cast_with_options( let array = Utf8Array::::from_trusted_len_iter(iter); Ok(Box::new(array)) } + Timestamp(from_unit, Some(tz)) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(timestamp_to_utf8::(from, *from_unit, tz)?)) + } + Timestamp(from_unit, None) => { + let from = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(naive_timestamp_to_utf8::(from, *from_unit))) + } _ => 
Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -793,8 +815,8 @@ fn cast_with_options( } (Timestamp(_, _), Int64) => primitive_to_same_primitive_dyn::(array, to_type), (Int64, Timestamp(_, _)) => primitive_to_same_primitive_dyn::(array, to_type), - (Timestamp(from_unit, tz1), Timestamp(to_unit, tz2)) if tz1 == tz2 => { - primitive_dyn!(array, timestamp_to_timestamp, *from_unit, *to_unit, tz2) + (Timestamp(from_unit, _), Timestamp(to_unit, tz)) => { + primitive_dyn!(array, timestamp_to_timestamp, *from_unit, *to_unit, tz) } (Timestamp(from_unit, _), Date32) => primitive_dyn!(array, timestamp_to_date32, *from_unit), (Timestamp(from_unit, _), Date64) => primitive_dyn!(array, timestamp_to_date64, *from_unit), diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index f592e82ee60..c5ba3e5b080 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -5,6 +5,7 @@ use crate::{ bitmap::Bitmap, compute::arity::unary, datatypes::{DataType, TimeUnit}, + error::ArrowError, temporal_conversions::*, types::NativeType, }; @@ -268,3 +269,144 @@ pub fn timestamp_to_timestamp( unary(from, |x| (x * (to_size / from_size)), to_type) } } + +fn timestamp_to_utf8_impl( + from: &PrimitiveArray, + time_unit: TimeUnit, + timezone: T, +) -> Utf8Array +where + T::Offset: std::fmt::Display, +{ + match time_unit { + TimeUnit::Nanosecond => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_ns_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Microsecond => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_us_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + 
Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Millisecond => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_ms_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Second => { + let iter = from.iter().map(|x| { + x.map(|x| { + let datetime = timestamp_s_to_datetime(*x); + let offset = timezone.offset_from_utc_datetime(&datetime); + chrono::DateTime::::from_utc(datetime, offset).to_rfc3339() + }) + }); + Utf8Array::from_trusted_len_iter(iter) + } + } +} + +#[cfg(feature = "chrono-tz")] +fn chrono_tz_timestamp_to_utf8( + from: &PrimitiveArray, + time_unit: TimeUnit, + timezone_str: &str, +) -> Result> { + let timezone = parse_offset_tz(timezone_str); + if let Some(timezone) = timezone { + Ok(timestamp_to_utf8_impl::( + from, time_unit, timezone, + )) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed", + timezone_str + ))) + } +} + +#[cfg(not(feature = "chrono-tz"))] +fn chrono_tz_timestamp_to_utf8( + _: &PrimitiveArray, + _: TimeUnit, + timezone_str: &str, +) -> Result> { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed (feature chrono-tz is not active)", + timezone_str + ))) +} + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the timestamp in the rfc3339 format. +pub fn timestamp_to_utf8( + from: &PrimitiveArray, + time_unit: TimeUnit, + timezone_str: &str, +) -> Result> { + let timezone = parse_offset(timezone_str); + + if let Ok(timezone) = timezone { + Ok(timestamp_to_utf8_impl::( + from, time_unit, timezone, + )) + } else { + chrono_tz_timestamp_to_utf8(from, time_unit, timezone_str) + } +} + +/// Returns a [`Utf8Array`] where every element is the utf8 representation of the timestamp in the rfc3339 format. 
+pub fn naive_timestamp_to_utf8( + from: &PrimitiveArray, + time_unit: TimeUnit, +) -> Utf8Array { + match time_unit { + TimeUnit::Nanosecond => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_ns_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Microsecond => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_us_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Millisecond => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_ms_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + TimeUnit::Second => { + let iter = from.iter().map(|x| { + x.copied() + .map(timestamp_s_to_datetime) + .map(|x| x.to_string()) + }); + Utf8Array::from_trusted_len_iter(iter) + } + } +} diff --git a/src/compute/cast/timestamps.rs b/src/compute/cast/timestamps.rs deleted file mode 100644 index 354dd652a80..00000000000 --- a/src/compute/cast/timestamps.rs +++ /dev/null @@ -1,302 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::error::{ArrowError, Result}; -use chrono::{prelude::*, LocalResult}; - -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// In addition to RFC3339 / ISO8601 standard timestamps, it also -/// accepts strings that use a space ` ` to separate the date and time -/// as well as strings that have no explicit timezone offset. -/// -/// Examples of accepted inputs: -/// * `1997-01-31T09:26:56.123Z` # RCF3339 -/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 -/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T -/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified -/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset -/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds -// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. 
-/// -/// This function intertprets strings without an explicit time zone as -/// timestamps with offsets of the local time on the machine -/// -/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as -/// it has an explicit timezone specifier (“Z” for Zulu/UTC) -/// -/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in -/// the timezone of the machine. For example, if -/// the system timezone is set to Americas/New_York (UTC-5) the -/// timestamp will be interpreted as though it were -/// `1997-01-31T09:26:56.123-05:00` -pub fn utf8_to_timestamp_ns_scalar(s: &str) -> Result { - // Fast path: RFC3339 timestamp (with a T) - // Example: 2020-09-08T13:42:29.190855Z - if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.timestamp_nanos()); - } - - // Implement quasi-RFC3339 support by trying to parse the - // timestamp with various other format specifiers to to support - // separating the date and time with a space ' ' rather than 'T' to be - // (more) compatible with Apache Spark SQL - - // timezone offset, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855-05:00 - if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return Ok(ts.timestamp_nanos()); - } - - // with an explicit Z, using ' ' as a separator - // Example: 2020-09-08 13:42:29Z - if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.timestamp_nanos()); - } - - // Support timestamps without an explicit timezone offset, again - // to be compatible with what Apache Spark SQL does. 
- - // without a timezone specifier as a local time, using T as a separator - // Example: 2020-09-08T13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using T as a - // separator, no fractional seconds - // Example: 2020-09-08T13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a - // separator, no fractional seconds - // Example: 2020-09-08 13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // Note we don't pass along the error message from the underlying - // chrono parsing because we tried several different format - // strings and we don't know which the user was trying to - // match. Ths any of the specific error messages is likely to be - // be more confusing than helpful - Err(ArrowError::Other(format!( - "Error parsing '{}' as timestamp", - s - ))) -} - -/// Converts the naive datetime (which has no specific timezone) to a -/// nanosecond epoch timestamp relative to UTC. 
-fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result { - let l = Local {}; - - match l.from_local_datetime(&datetime) { - LocalResult::None => Err(ArrowError::Other(format!( - "Error parsing '{}' as timestamp: local time representation is invalid", - s - ))), - LocalResult::Single(local_datetime) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - // Ambiguous times can happen if the timestamp is exactly when - // a daylight savings time transition occurs, for example, and - // so the datetime could validly be said to be in two - // potential offsets. However, since we are about to convert - // to UTC anyways, we can pick one arbitrarily - LocalResult::Ambiguous(local_datetime, _) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn utf8_to_timestamp_timezone() -> Result<()> { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00")? - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z")? - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z")? - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00")? - ); - Ok(()) - } - - #[test] - fn utf8_to_timestamp_timezone_space() -> Result<()> { - // Ensure space rather than T between time and date is accepted - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00")? - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z")? - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z")? - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00")? 
- ); - Ok(()) - } - - /// Interprets a naive_datetime (with no explicit timzone offset) - /// using the local timezone and returns the timestamp in UTC (0 - /// offset) - fn naive_datetime_to_timestamp(naive_datetime: &NaiveDateTime) -> i64 { - // Note: Use chrono APIs that are different than - // naive_datetime_to_timestamp to compute the utc offset to - // try and double check the logic - let utc_offset_secs = match Local.offset_from_local_datetime(naive_datetime) { - LocalResult::Single(local_offset) => local_offset.fix().local_minus_utc() as i64, - _ => panic!("Unexpected failure converting to local datetime"), - }; - let utc_offset_nanos = utc_offset_secs * 1_000_000_000; - naive_datetime.timestamp_nanos() - utc_offset_nanos - } - - #[test] - #[cfg_attr(miri, ignore)] // miri does not support one of the instructions - fn utf8_to_timestamp_no_timezone() -> Result<()> { - // This test is designed to succeed in regardless of the local - // timezone the test machine is running. Thus it is still - // somewhat suceptable to bugs in the use of chrono - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms_nano(13, 42, 29, 190855), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), - parse_timestamp("2020-09-08T13:42:29.190855")? - ); - - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), - parse_timestamp("2020-09-08 13:42:29.190855")? - ); - - // Also ensure that parsing timestamps with no fractional - // second part works as well - let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms(13, 42, 29), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), - parse_timestamp("2020-09-08T13:42:29")? - ); - - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), - parse_timestamp("2020-09-08 13:42:29")? 
- ); - - Ok(()) - } - - #[test] - #[cfg_attr(miri, ignore)] // miri does not support one of the instructions - fn utf8_to_timestamp_invalid() { - // Test parsing invalid formats - - // It would be nice to make these messages better - expect_timestamp_parse_error("", "Error parsing '' as timestamp"); - expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp"); - expect_timestamp_parse_error( - "Wed, 18 Feb 2015 23:16:09 GMT", - "Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp", - ); - } - - // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { - let result = utf8_to_timestamp_ns_scalar(s); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{}': {:?}", s, e); - } - result - } - - fn expect_timestamp_parse_error(s: &str, expected_err: &str) { - match utf8_to_timestamp_ns_scalar(s) { - Ok(v) => panic!( - "Expected error '{}' while parsing '{}', but parsed {} instead", - expected_err, s, v - ), - Err(e) => { - assert!( - e.to_string().contains(expected_err), - "Can not find expected error '{}' while parsing '{}'. 
Actual error '{}'", - expected_err, - s, - e - ); - } - } - } -} diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index e267199a015..d74ec2b6e1b 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -2,18 +2,16 @@ use std::convert::TryFrom; use chrono::Datelike; -use crate::{ - array::*, - buffer::Buffer, - datatypes::{DataType, TimeUnit}, - types::NativeType, -}; +use crate::{array::*, buffer::Buffer, datatypes::DataType, types::NativeType}; use crate::{ error::{ArrowError, Result}, - temporal_conversions::EPOCH_DAYS_FROM_CE, + temporal_conversions::{ + utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, + utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE, + }, }; -use super::utf8_to_timestamp_ns_scalar; +const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; /// Casts a [`Utf8Array`] to a [`PrimitiveArray`], making any uncastable value a Null. pub fn utf8_to_primitive(from: &Utf8Array, to: &DataType) -> PrimitiveArray @@ -92,18 +90,34 @@ pub fn utf8_to_dictionary( Ok(array.into()) } -pub(super) fn utf8_to_timestamp_ns_dyn(from: &dyn Array) -> Result> { +pub(super) fn utf8_to_naive_timestamp_ns_dyn( + from: &dyn Array, +) -> Result> { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_timestamp_ns::(from))) + Ok(Box::new(utf8_to_naive_timestamp_ns::(from))) } -/// The array version of [`utf8_to_timestamp_ns_scalar`]. 
-pub fn utf8_to_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { - let iter = from - .iter() - .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x).ok())); - PrimitiveArray::::from_trusted_len_iter(iter) - .to(DataType::Timestamp(TimeUnit::Nanosecond, None)) +/// [`crate::temporal_conversions::utf8_to_naive_timestamp_ns`] applied for RFC3339 formatting +pub fn utf8_to_naive_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { + utf8_to_naive_timestamp_ns_(from, RFC3339) +} + +pub(super) fn utf8_to_timestamp_ns_dyn( + from: &dyn Array, + timezone: String, +) -> Result> { + let from = from.as_any().downcast_ref().unwrap(); + utf8_to_timestamp_ns::(from, timezone) + .map(Box::new) + .map(|x| x as Box) +} + +/// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied for RFC3339 formatting +pub fn utf8_to_timestamp_ns( + from: &Utf8Array, + timezone: String, +) -> Result> { + utf8_to_timestamp_ns_(from, RFC3339, timezone) } pub fn utf8_to_large_utf8(from: &Utf8Array) -> Utf8Array { diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index e9c3cf76038..5d32df3bc3d 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -1,10 +1,15 @@ //! Conversion methods for dates and times. -use chrono::{FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; +use chrono::{ + format::{parse, Parsed, StrftimeItems}, + FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, +}; +use crate::datatypes::{DataType, TimeUnit}; +use crate::error::Result; use crate::{ - datatypes::TimeUnit, - error::{ArrowError, Result}, + array::{Offset, PrimitiveArray, Utf8Array}, + error::ArrowError, }; /// Number of seconds in a day @@ -154,27 +159,150 @@ pub fn timeunit_scale(a: TimeUnit, b: TimeUnit) -> f64 { } } -pub(crate) fn parse_offset(offset: &str) -> Result { +/// Parses an offset of the form `"+WX:YZ"` or `"UTC"` into [`FixedOffset`]. +/// # Errors +/// If the offset is not in any of the allowed forms. 
+pub fn parse_offset(offset: &str) -> Result { if offset == "UTC" { return Ok(FixedOffset::east(0)); } + let error = "timezone offset must be of the form [-]00:00"; + let mut a = offset.split(':'); - let first = a.next().map(Ok).unwrap_or_else(|| { - Err(ArrowError::InvalidArgumentError( - "timezone offset must be of the form [-]00:00".to_string(), - )) - })?; - let last = a.next().map(Ok).unwrap_or_else(|| { - Err(ArrowError::InvalidArgumentError( - "timezone offset must be of the form [-]00:00".to_string(), - )) - })?; - let hours: i32 = first.parse().map_err(|_| { - ArrowError::InvalidArgumentError("timezone offset must be of the form [-]00:00".to_string()) - })?; - let minutes: i32 = last.parse().map_err(|_| { - ArrowError::InvalidArgumentError("timezone offset must be of the form [-]00:00".to_string()) - })?; + let first = a + .next() + .map(Ok) + .unwrap_or_else(|| Err(ArrowError::InvalidArgumentError(error.to_string())))?; + let last = a + .next() + .map(Ok) + .unwrap_or_else(|| Err(ArrowError::InvalidArgumentError(error.to_string())))?; + let hours: i32 = first + .parse() + .map_err(|_| ArrowError::InvalidArgumentError(error.to_string()))?; + let minutes: i32 = last + .parse() + .map_err(|_| ArrowError::InvalidArgumentError(error.to_string()))?; Ok(FixedOffset::east(hours * 60 * 60 + minutes * 60)) } + +/// Parses `value` to `Option` consistent with the Arrow's definition of timestamp with timezone. +/// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`). 
+#[inline] +pub fn utf8_to_timestamp_ns_scalar( + value: &str, + fmt: &str, + tz: &T, +) -> Option { + let mut parsed = Parsed::new(); + let fmt = StrftimeItems::new(fmt); + let r = parse(&mut parsed, value, fmt).ok(); + if r.is_some() { + parsed + .to_datetime() + .map(|x| x.naive_utc()) + .map(|x| tz.from_utc_datetime(&x)) + .map(|x| x.timestamp_nanos()) + .ok() + } else { + None + } +} + +/// Parses `value` to `Option` consistent with the Arrow's definition of timestamp without timezone. +#[inline] +pub fn utf8_to_naive_timestamp_ns_scalar(value: &str, fmt: &str) -> Option { + let fmt = StrftimeItems::new(fmt); + let mut parsed = Parsed::new(); + parse(&mut parsed, value, fmt.clone()).ok(); + parsed + .to_naive_datetime_with_offset(0) + .map(|x| x.timestamp_nanos()) + .ok() +} + +fn utf8_to_timestamp_ns_impl( + array: &Utf8Array, + fmt: &str, + timezone: String, + tz: T, +) -> PrimitiveArray { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz))); + + PrimitiveArray::from_trusted_len_iter(iter) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone))) +} + +#[cfg(feature = "chrono-tz")] +pub(crate) fn parse_offset_tz(tz: &str) -> Option { + tz.parse::().ok() +} + +#[cfg(feature = "chrono-tz")] +fn chrono_tz_utf_to_timestamp_ns( + array: &Utf8Array, + fmt: &str, + timezone: String, +) -> Result> { + let tz = parse_offset_tz(&timezone); + if let Some(tz) = tz { + Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed", + timezone + ))) + } +} + +#[cfg(not(feature = "chrono-tz"))] +fn chrono_tz_utf_to_timestamp_ns( + _: &Utf8Array, + _: &str, + timezone: String, +) -> Result> { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed (feature chrono-tz is not active)", + timezone + ))) +} + +/// Parses a [`Utf8Array`] to a timezone-aware timestamp, i.e. 
[`PrimitiveArray`] with type `Timestamp(Nanosecond, Some(timezone))`. +/// # Implementation +/// * parsed values with timezone other than `timezone` are converted to `timezone`. +/// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp_ns`] to parse naive timestamps. +/// * Null elements remain null; non-parsable elements are null. +/// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`. +/// # Error +/// This function errors iff `timezone` is not parsable to an offset. +pub fn utf8_to_timestamp_ns( + array: &Utf8Array, + fmt: &str, + timezone: String, +) -> Result> { + let tz = parse_offset(timezone.as_str()); + + if let Ok(tz) = tz { + Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + } else { + chrono_tz_utf_to_timestamp_ns(array, fmt, timezone) + } +} + +/// Parses a [`Utf8Array`] to naive timestamp, i.e. +/// [`PrimitiveArray`] with type `Timestamp(Nanosecond, None)`. +/// Timezones are ignored. +/// Null elements remain null; non-parsable elements are set to null. 
+pub fn utf8_to_naive_timestamp_ns( + array: &Utf8Array, + fmt: &str, +) -> PrimitiveArray { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); + + PrimitiveArray::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None)) +} diff --git a/tests/it/compute/cast.rs b/tests/it/compute/cast.rs index 8836fc098a4..70cba0455fa 100644 --- a/tests/it/compute/cast.rs +++ b/tests/it/compute/cast.rs @@ -227,10 +227,9 @@ fn bool_to_binary() { } #[test] -#[should_panic(expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported")] fn int32_to_timestamp() { let array = Int32Array::from(&[Some(2), Some(10), None]); - cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap(); + assert!(cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).is_err()); } #[test] @@ -251,6 +250,7 @@ fn consistency() { Float64, Timestamp(TimeUnit::Second, None), Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Millisecond, Some("+01:00".to_string())), Timestamp(TimeUnit::Microsecond, None), Timestamp(TimeUnit::Nanosecond, None), Time64(TimeUnit::Microsecond), @@ -471,6 +471,54 @@ fn list_to_list() { assert_eq!(expected, result.as_ref()); } +#[test] +fn timestamp_with_tz_to_utf8() { + let tz = "-02:00".to_string(); + let expected = + Utf8Array::::from_slice(&["1996-12-19T16:39:57-02:00", "1996-12-19T17:39:57-02:00"]); + let array = Int64Array::from_slice(&[851020797000000000, 851024397000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + +#[test] +fn utf8_to_timestamp_with_tz() { + let tz = "-02:00".to_string(); + let array = + Utf8Array::::from_slice(&["1996-12-19T16:39:57-02:00", "1996-12-19T17:39:57-02:00"]); + // the timezone is used to map the time to UTC. 
+ let expected = Int64Array::from_slice(&[851020797000000000, 851024397000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(tz))); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + +#[test] +fn utf8_to_naive_timestamp() { + let array = + Utf8Array::::from_slice(&["1996-12-19T16:39:57-02:00", "1996-12-19T17:39:57-02:00"]); + // the timezone is disregarded from the string and we assume UTC + let expected = Int64Array::from_slice(&[851013597000000000, 851017197000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + +#[test] +fn naive_timestamp_to_utf8() { + let array = Int64Array::from_slice(&[851013597000000000, 851017197000000000]) + .to(DataType::Timestamp(TimeUnit::Nanosecond, None)); + + let expected = Utf8Array::::from_slice(&["1996-12-19 16:39:57", "1996-12-19 17:39:57"]); + + let result = cast(&array, expected.data_type()).expect("cast failed"); + assert_eq!(expected, result.as_ref()); +} + /* #[test] fn dict_to_dict_bad_index_value_primitive() { diff --git a/tests/it/main.rs b/tests/it/main.rs index ad9a9b97353..9f9208e7f23 100644 --- a/tests/it/main.rs +++ b/tests/it/main.rs @@ -4,6 +4,7 @@ mod bitmap; mod buffer; mod ffi; mod scalar; +mod temporal_conversions; mod io; mod test_util; diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs new file mode 100644 index 00000000000..94e933728dc --- /dev/null +++ b/tests/it/temporal_conversions.rs @@ -0,0 +1,66 @@ +use arrow2::array::*; +use arrow2::temporal_conversions; + +#[test] +fn naive() { + let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, ]"; + let fmt = "%Y-%m-%dT%H:%M:%S:z"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57-02:00", + "1996-12-19T13:39:57-03:00", + "1996-12-19 13:39:57-03:00", // missing T + ]); + let r 
= temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + assert_eq!(format!("{}", r), expected); + + let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57-02:00", + "1996-12-19T13:39:57-03:00", + "1996-12-19 13:39:57-03:00", // missing T + ]); + let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + assert_eq!(format!("{}", r), expected); +} + +#[test] +fn naive_no_tz() { + let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, ]"; + let fmt = "%Y-%m-%dT%H:%M:%S"; // no tz info + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57", + "1996-12-19T13:39:57", + "1996-12-19 13:39:57", // missing T + ]); + let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + assert_eq!(format!("{}", r), expected); +} + +#[test] +fn tz_aware() { + let tz = "-02:00".to_string(); + let expected = + "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 17:39:57 -02:00, ]"; + let fmt = "%Y-%m-%dT%H:%M:%S%.f%:z"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57.0-02:00", + "1996-12-19T16:39:57.0-03:00", // same time at a different TZ + "1996-12-19 13:39:57.0-03:00", + ]); + let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + assert_eq!(format!("{}", r), expected); +} + +#[test] +fn tz_aware_no_timezone() { + let tz = "-02:00".to_string(); + let expected = "Timestamp(Nanosecond, Some(\"-02:00\"))[, , ]"; + let fmt = "%Y-%m-%dT%H:%M:%S%.f"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57.0", + "1996-12-19T17:39:57.0", + "1996-12-19 13:39:57.0", + ]); + let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + assert_eq!(format!("{}", r), expected); +}