diff --git a/Cargo.toml b/Cargo.toml index 94ca01441ee..242dccf5e2f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,8 @@ bench = false [dependencies] num-traits = "0.2" -chrono = "^0.4" +chrono = { version = "0.4", default-features = false, features = ["std"] } +chrono-tz = { version = "0.4", optional = true } # To efficiently cast numbers to strings lexical-core = { version = "0.8", optional = true } # We need to Hash values before sending them to an hasher. This @@ -84,6 +85,8 @@ full = [ "merge_sort", "ahash", "compute", + # parses timezones used in timestamp conversions + "chrono-tz", ] merge_sort = ["itertools"] io_csv = ["csv", "lazy_static", "regex", "lexical-core"] diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 73bf56fb35e..3b0541996c5 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -26,14 +26,12 @@ mod binary_to; mod boolean_to; mod dictionary_to; mod primitive_to; -mod timestamps; mod utf8_to; pub use binary_to::*; pub use boolean_to::*; pub use dictionary_to::*; pub use primitive_to::*; -pub use timestamps::*; pub use utf8_to::*; /// options defining how Cast kernels behave @@ -487,7 +485,10 @@ fn cast_with_options( LargeUtf8 => Ok(Box::new(utf8_to_large_utf8( array.as_any().downcast_ref().unwrap(), ))), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, Some(tz)) => { + utf8_to_timestamp_ns_dyn::(array, tz.clone()) + } _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, @@ -508,7 +509,10 @@ fn cast_with_options( Date64 => utf8_to_date64_dyn::(array), Utf8 => utf8_large_to_utf8(array.as_any().downcast_ref().unwrap()) .map(|x| Box::new(x) as Box), - Timestamp(TimeUnit::Nanosecond, None) => utf8_to_timestamp_ns_dyn::(array), + Timestamp(TimeUnit::Nanosecond, Some(tz)) => { + 
utf8_to_timestamp_ns_dyn::(array, tz.clone()) + } + Timestamp(TimeUnit::Nanosecond, None) => utf8_to_naive_timestamp_ns_dyn::(array), _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, diff --git a/src/compute/cast/timestamps.rs b/src/compute/cast/timestamps.rs deleted file mode 100644 index 354dd652a80..00000000000 --- a/src/compute/cast/timestamps.rs +++ /dev/null @@ -1,302 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::error::{ArrowError, Result}; -use chrono::{prelude::*, LocalResult}; - -/// Accepts a string in RFC3339 / ISO8601 standard format and some -/// variants and converts it to a nanosecond precision timestamp. -/// -/// Implements the `to_timestamp` function to convert a string to a -/// timestamp, following the model of spark SQL’s to_`timestamp`. -/// -/// In addition to RFC3339 / ISO8601 standard timestamps, it also -/// accepts strings that use a space ` ` to separate the date and time -/// as well as strings that have no explicit timezone offset. 
-/// -/// Examples of accepted inputs: -/// * `1997-01-31T09:26:56.123Z` # RCF3339 -/// * `1997-01-31T09:26:56.123-05:00` # RCF3339 -/// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a space rather than T -/// * `1997-01-31T09:26:56.123` # close to RCF3339 but no timezone offset specified -/// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a space and no timezone offset -/// * `1997-01-31 09:26:56` # close to RCF3339, no fractional seconds -// -/// Internally, this function uses the `chrono` library for the -/// datetime parsing -/// -/// We hope to extend this function in the future with a second -/// parameter to specifying the format string. -/// -/// ## Timestamp Precision -/// -/// Function uses the maximum precision timestamps supported by -/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This -/// means the range of dates that timestamps can represent is ~1677 AD -/// to 2262 AM -/// -/// -/// ## Timezone / Offset Handling -/// -/// Numerical values of timestamps are stored compared to offset UTC. -/// -/// This function intertprets strings without an explicit time zone as -/// timestamps with offsets of the local time on the machine -/// -/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as -/// it has an explicit timezone specifier (“Z” for Zulu/UTC) -/// -/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in -/// the timezone of the machine. 
For example, if -/// the system timezone is set to Americas/New_York (UTC-5) the -/// timestamp will be interpreted as though it were -/// `1997-01-31T09:26:56.123-05:00` -pub fn utf8_to_timestamp_ns_scalar(s: &str) -> Result { - // Fast path: RFC3339 timestamp (with a T) - // Example: 2020-09-08T13:42:29.190855Z - if let Ok(ts) = DateTime::parse_from_rfc3339(s) { - return Ok(ts.timestamp_nanos()); - } - - // Implement quasi-RFC3339 support by trying to parse the - // timestamp with various other format specifiers to to support - // separating the date and time with a space ' ' rather than 'T' to be - // (more) compatible with Apache Spark SQL - - // timezone offset, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855-05:00 - if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") { - return Ok(ts.timestamp_nanos()); - } - - // with an explicit Z, using ' ' as a separator - // Example: 2020-09-08 13:42:29Z - if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") { - return Ok(ts.timestamp_nanos()); - } - - // Support timestamps without an explicit timezone offset, again - // to be compatible with what Apache Spark SQL does. 
- - // without a timezone specifier as a local time, using T as a separator - // Example: 2020-09-08T13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using T as a - // separator, no fractional seconds - // Example: 2020-09-08T13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a separator - // Example: 2020-09-08 13:42:29.190855 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S.%f") { - return naive_datetime_to_timestamp(s, ts); - } - - // without a timezone specifier as a local time, using ' ' as a - // separator, no fractional seconds - // Example: 2020-09-08 13:42:29 - if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { - return naive_datetime_to_timestamp(s, ts); - } - - // Note we don't pass along the error message from the underlying - // chrono parsing because we tried several different format - // strings and we don't know which the user was trying to - // match. Ths any of the specific error messages is likely to be - // be more confusing than helpful - Err(ArrowError::Other(format!( - "Error parsing '{}' as timestamp", - s - ))) -} - -/// Converts the naive datetime (which has no specific timezone) to a -/// nanosecond epoch timestamp relative to UTC. 
-fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result { - let l = Local {}; - - match l.from_local_datetime(&datetime) { - LocalResult::None => Err(ArrowError::Other(format!( - "Error parsing '{}' as timestamp: local time representation is invalid", - s - ))), - LocalResult::Single(local_datetime) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - // Ambiguous times can happen if the timestamp is exactly when - // a daylight savings time transition occurs, for example, and - // so the datetime could validly be said to be in two - // potential offsets. However, since we are about to convert - // to UTC anyways, we can pick one arbitrarily - LocalResult::Ambiguous(local_datetime, _) => { - Ok(local_datetime.with_timezone(&Utc).timestamp_nanos()) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn utf8_to_timestamp_timezone() -> Result<()> { - // Explicit timezone - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855+00:00")? - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08T13:42:29.190855Z")? - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08T13:42:29Z")? - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08T13:42:29.190855-05:00")? - ); - Ok(()) - } - - #[test] - fn utf8_to_timestamp_timezone_space() -> Result<()> { - // Ensure space rather than T between time and date is accepted - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855+00:00")? - ); - assert_eq!( - 1599572549190855000, - parse_timestamp("2020-09-08 13:42:29.190855Z")? - ); - assert_eq!( - 1599572549000000000, - parse_timestamp("2020-09-08 13:42:29Z")? - ); // no fractional part - assert_eq!( - 1599590549190855000, - parse_timestamp("2020-09-08 13:42:29.190855-05:00")? 
- ); - Ok(()) - } - - /// Interprets a naive_datetime (with no explicit timzone offset) - /// using the local timezone and returns the timestamp in UTC (0 - /// offset) - fn naive_datetime_to_timestamp(naive_datetime: &NaiveDateTime) -> i64 { - // Note: Use chrono APIs that are different than - // naive_datetime_to_timestamp to compute the utc offset to - // try and double check the logic - let utc_offset_secs = match Local.offset_from_local_datetime(naive_datetime) { - LocalResult::Single(local_offset) => local_offset.fix().local_minus_utc() as i64, - _ => panic!("Unexpected failure converting to local datetime"), - }; - let utc_offset_nanos = utc_offset_secs * 1_000_000_000; - naive_datetime.timestamp_nanos() - utc_offset_nanos - } - - #[test] - #[cfg_attr(miri, ignore)] // miri does not support one of the instructions - fn utf8_to_timestamp_no_timezone() -> Result<()> { - // This test is designed to succeed in regardless of the local - // timezone the test machine is running. Thus it is still - // somewhat suceptable to bugs in the use of chrono - let naive_datetime = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms_nano(13, 42, 29, 190855), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), - parse_timestamp("2020-09-08T13:42:29.190855")? - ); - - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime), - parse_timestamp("2020-09-08 13:42:29.190855")? - ); - - // Also ensure that parsing timestamps with no fractional - // second part works as well - let naive_datetime_whole_secs = NaiveDateTime::new( - NaiveDate::from_ymd(2020, 9, 8), - NaiveTime::from_hms(13, 42, 29), - ); - - // Ensure both T and ' ' variants work - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), - parse_timestamp("2020-09-08T13:42:29")? - ); - - assert_eq!( - naive_datetime_to_timestamp(&naive_datetime_whole_secs), - parse_timestamp("2020-09-08 13:42:29")? 
- ); - - Ok(()) - } - - #[test] - #[cfg_attr(miri, ignore)] // miri does not support one of the instructions - fn utf8_to_timestamp_invalid() { - // Test parsing invalid formats - - // It would be nice to make these messages better - expect_timestamp_parse_error("", "Error parsing '' as timestamp"); - expect_timestamp_parse_error("SS", "Error parsing 'SS' as timestamp"); - expect_timestamp_parse_error( - "Wed, 18 Feb 2015 23:16:09 GMT", - "Error parsing 'Wed, 18 Feb 2015 23:16:09 GMT' as timestamp", - ); - } - - // Parse a timestamp to timestamp int with a useful human readable error message - fn parse_timestamp(s: &str) -> Result { - let result = utf8_to_timestamp_ns_scalar(s); - if let Err(e) = &result { - eprintln!("Error parsing timestamp '{}': {:?}", s, e); - } - result - } - - fn expect_timestamp_parse_error(s: &str, expected_err: &str) { - match utf8_to_timestamp_ns_scalar(s) { - Ok(v) => panic!( - "Expected error '{}' while parsing '{}', but parsed {} instead", - expected_err, s, v - ), - Err(e) => { - assert!( - e.to_string().contains(expected_err), - "Can not find expected error '{}' while parsing '{}'. 
Actual error '{}'", - expected_err, - s, - e - ); - } - } - } -} diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index e267199a015..d74ec2b6e1b 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -2,18 +2,16 @@ use std::convert::TryFrom; use chrono::Datelike; -use crate::{ - array::*, - buffer::Buffer, - datatypes::{DataType, TimeUnit}, - types::NativeType, -}; +use crate::{array::*, buffer::Buffer, datatypes::DataType, types::NativeType}; use crate::{ error::{ArrowError, Result}, - temporal_conversions::EPOCH_DAYS_FROM_CE, + temporal_conversions::{ + utf8_to_naive_timestamp_ns as utf8_to_naive_timestamp_ns_, + utf8_to_timestamp_ns as utf8_to_timestamp_ns_, EPOCH_DAYS_FROM_CE, + }, }; -use super::utf8_to_timestamp_ns_scalar; +const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z"; /// Casts a [`Utf8Array`] to a [`PrimitiveArray`], making any uncastable value a Null. pub fn utf8_to_primitive(from: &Utf8Array, to: &DataType) -> PrimitiveArray @@ -92,18 +90,34 @@ pub fn utf8_to_dictionary( Ok(array.into()) } -pub(super) fn utf8_to_timestamp_ns_dyn(from: &dyn Array) -> Result> { +pub(super) fn utf8_to_naive_timestamp_ns_dyn( + from: &dyn Array, +) -> Result> { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(utf8_to_timestamp_ns::(from))) + Ok(Box::new(utf8_to_naive_timestamp_ns::(from))) } -/// The array version of [`utf8_to_timestamp_ns_scalar`]. 
-pub fn utf8_to_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { - let iter = from - .iter() - .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x).ok())); - PrimitiveArray::::from_trusted_len_iter(iter) - .to(DataType::Timestamp(TimeUnit::Nanosecond, None)) +/// [`crate::temporal_conversions::utf8_to_naive_timestamp_ns`] applied with the RFC3339 format. +pub fn utf8_to_naive_timestamp_ns(from: &Utf8Array) -> PrimitiveArray { + utf8_to_naive_timestamp_ns_(from, RFC3339) +} + +pub(super) fn utf8_to_timestamp_ns_dyn( + from: &dyn Array, + timezone: String, +) -> Result> { + let from = from.as_any().downcast_ref().unwrap(); + utf8_to_timestamp_ns::(from, timezone) + .map(Box::new) + .map(|x| x as Box) +} + +/// [`crate::temporal_conversions::utf8_to_timestamp_ns`] applied with the RFC3339 format; errors when `timezone` cannot be parsed. +pub fn utf8_to_timestamp_ns( + from: &Utf8Array, + timezone: String, +) -> Result> { + utf8_to_timestamp_ns_(from, RFC3339, timezone) } pub fn utf8_to_large_utf8(from: &Utf8Array) -> Utf8Array { diff --git a/src/temporal_conversions.rs b/src/temporal_conversions.rs index e9c3cf76038..a63865103e2 100644 --- a/src/temporal_conversions.rs +++ b/src/temporal_conversions.rs @@ -1,10 +1,15 @@ //! Conversion methods for dates and times. 
-use chrono::{FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; +use chrono::{ + format::{parse, Parsed, StrftimeItems}, + FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, +}; +use crate::datatypes::{DataType, TimeUnit}; +use crate::error::Result; use crate::{ - datatypes::TimeUnit, - error::{ArrowError, Result}, + array::{Int64Array, Offset, Utf8Array}, + error::ArrowError, }; /// Number of seconds in a day @@ -178,3 +183,116 @@ pub(crate) fn parse_offset(offset: &str) -> Result { Ok(FixedOffset::east(hours * 60 * 60 + minutes * 60)) } + +// not public to not expose TimeZone +/// Parses `value` according to `fmt` into a nanosecond timestamp. +/// A value carrying its own offset is converted using that offset; a value without one is interpreted as local to `tz`. +/// Returns `None` when `value` does not match `fmt` or does not form a valid datetime. +#[inline] +pub(crate) fn utf8_to_timestamp_ns_scalar( + value: &str, + fmt: &str, + tz: &T, +) -> Option { + let mut parsed = Parsed::new(); + let fmt = StrftimeItems::new(fmt); + parse(&mut parsed, value, fmt).ok()?; + // Prefer the offset parsed from the value: `to_datetime_with_timezone` alone rejects + // values whose offset differs from `tz` instead of converting them. + parsed + .to_datetime() + .map(|x| x.timestamp_nanos()) + .or_else(|_| { + parsed + .to_datetime_with_timezone(tz) + .map(|x| x.timestamp_nanos()) + }) + .ok() +} + +/// Parses `value` according to `fmt` into a naive nanosecond timestamp. +/// The result of the parse is discarded, so fields parsed before a mismatch with `fmt` are still used (e.g. a trailing offset that `fmt` does not cover is ignored). +/// Returns `None` when the parsed fields do not form a valid datetime. +#[inline] +pub fn utf8_to_naive_timestamp_ns_scalar(value: &str, fmt: &str) -> Option { + let fmt = StrftimeItems::new(fmt); + let mut parsed = Parsed::new(); + parse(&mut parsed, value, fmt).ok(); + parsed + .to_naive_datetime_with_offset(0) + .map(|x| x.timestamp_nanos()) + .ok() +} + +fn utf8_to_timestamp_ns_impl( + array: &Utf8Array, + fmt: &str, + timezone: String, + tz: T, +) -> Int64Array { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_timestamp_ns_scalar(x, fmt, &tz))); + + Int64Array::from_trusted_len_iter(iter) + .to(DataType::Timestamp(TimeUnit::Nanosecond, Some(timezone))) +} + +#[cfg(feature = "chrono-tz")] +fn chrono_tz_utf_to_timestamp_ns( + array: &Utf8Array, + fmt: &str, + timezone: String, +) -> Result { + let tz = timezone.as_str().parse::(); + if let Ok(tz) = tz { + Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed", + timezone + ))) + } +} + +#[cfg(not(feature = "chrono-tz"))] +fn 
chrono_tz_utf_to_timestamp_ns( + _: &Utf8Array, + _: &str, + timezone: String, +) -> Result { + Err(ArrowError::InvalidArgumentError(format!( + "timezone \"{}\" cannot be parsed (feature chrono-tz is not active)", + timezone + ))) +} + +/// Parses a [`Utf8Array`] to a timezone-aware timestamp, i.e. [`Int64Array`] with type `Timestamp(Nanosecond, Some(timezone))`. +/// When the value represents a string with another timezone, a conversion is applied. +/// Null elements remain null; non-parsable elements are set to null. +/// # Errors +/// This function errors iff `timezone` is neither parsable to an offset nor, when feature `chrono-tz` is active, parsable to a timezone. +pub fn utf8_to_timestamp_ns( + array: &Utf8Array, + fmt: &str, + timezone: String, +) -> Result { + let tz = parse_offset(timezone.as_str()); + + if let Ok(tz) = tz { + Ok(utf8_to_timestamp_ns_impl(array, fmt, timezone, tz)) + } else { + chrono_tz_utf_to_timestamp_ns(array, fmt, timezone) + } +} + +/// Parses a [`Utf8Array`] to a naive timestamp, i.e. [`Int64Array`] with type `Timestamp(Nanosecond, None)`. +/// Timezones are ignored. +/// Null elements remain null; non-parsable elements are set to null. 
+pub fn utf8_to_naive_timestamp_ns(array: &Utf8Array, fmt: &str) -> Int64Array { + let iter = array + .iter() + .map(|x| x.and_then(|x| utf8_to_naive_timestamp_ns_scalar(x, fmt))); + + Int64Array::from_trusted_len_iter(iter).to(DataType::Timestamp(TimeUnit::Nanosecond, None)) +} diff --git a/tests/it/main.rs b/tests/it/main.rs index ad9a9b97353..9f9208e7f23 100644 --- a/tests/it/main.rs +++ b/tests/it/main.rs @@ -4,6 +4,7 @@ mod bitmap; mod buffer; mod ffi; mod scalar; +mod temporal_conversions; mod io; mod test_util; diff --git a/tests/it/temporal_conversions.rs b/tests/it/temporal_conversions.rs new file mode 100644 index 00000000000..4f313b404c4 --- /dev/null +++ b/tests/it/temporal_conversions.rs @@ -0,0 +1,39 @@ +use arrow2::array::*; +use arrow2::temporal_conversions; + +#[test] +fn naive() { + let expected = "Timestamp(Nanosecond, None)[1996-12-19 16:39:57, 1996-12-19 13:39:57, ]"; + let fmt = "%Y-%m-%dT%H:%M:%S:z"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57-02:00", + "1996-12-19T13:39:57-03:00", + "1996-12-19 13:39:57-03:00", + ]); + let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + assert_eq!(format!("{}", r), expected); + + let fmt = "%Y-%m-%dT%H:%M:%S"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57-02:00", + "1996-12-19T13:39:57-03:00", + "1996-12-19 13:39:57-03:00", + ]); + let r = temporal_conversions::utf8_to_naive_timestamp_ns(&array, fmt); + assert_eq!(format!("{}", r), expected); +} + +#[test] +fn tz_aware() { + let tz = "-02:00".to_string(); + let expected = + "Timestamp(Nanosecond, Some(\"-02:00\"))[1996-12-19 16:39:57 -02:00, 1996-12-19 13:39:57 -02:00, ]"; + let fmt = "%Y-%m-%dT%H:%M:%S%.f%:z"; + let array = Utf8Array::::from_slice(&[ + "1996-12-19T16:39:57.0-02:00", + "1996-12-19T13:39:57.0-02:00", + "1996-12-19 13:39:57.0-03:00", + ]); + let r = temporal_conversions::utf8_to_timestamp_ns(&array, fmt, tz).unwrap(); + assert_eq!(format!("{}", r), expected); +}