From acf58c255d4f3b59adf1d22ad9e69b104fbc93c1 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 10 Aug 2021 22:14:34 +0000 Subject: [PATCH] Added support for MONTH_DAY_NANO intervals. --- src/array/equal/mod.rs | 7 +- src/array/ffi.rs | 3 + src/array/growable/mod.rs | 3 + src/array/mod.rs | 14 +++- src/array/primitive/display.rs | 18 ++++- src/array/primitive/mod.rs | 6 +- src/compute/aggregate/memory.rs | 3 +- src/datatypes/mod.rs | 7 ++ src/io/ipc/convert.rs | 2 + src/io/ipc/gen/Schema.rs | 136 +++++++++++++++++++++++--------- src/io/ipc/read/deserialize.rs | 15 +++- src/io/ipc/write/serialize.rs | 5 +- src/io/json/schema.rs | 4 + src/io/json_integration/read.rs | 47 +++++++++-- src/scalar/mod.rs | 7 +- src/types/mod.rs | 127 +++++++++++++++++++++++++++++ 16 files changed, 355 insertions(+), 49 deletions(-) diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index 45062e27c4f..40c976ea25e 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -2,7 +2,7 @@ use std::unimplemented; use crate::{ datatypes::{DataType, IntervalUnit}, - types::{days_ms, NativeType}, + types::{days_ms, months_days_ns, NativeType}, }; use super::{ @@ -224,6 +224,11 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool { let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let lhs = lhs.as_any().downcast_ref().unwrap(); + let rhs = rhs.as_any().downcast_ref().unwrap(); + primitive::equal::(lhs, rhs) + } DataType::Float16 => unreachable!(), DataType::Float32 => { let lhs = lhs.as_any().downcast_ref().unwrap(); diff --git a/src/array/ffi.rs b/src/array/ffi.rs index e707f98d6b0..1ce677408cf 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -63,6 +63,9 @@ pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { ffi_dyn!(array, PrimitiveArray) } DataType::Interval(IntervalUnit::DayTime) => ffi_dyn!(array, PrimitiveArray), + DataType::Interval(IntervalUnit::MonthDayNano) => { + ffi_dyn!(array, PrimitiveArray) + } DataType::Int64 | DataType::Date64 | DataType::Time64(_) diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index db54a6e12e4..8d63ee345eb 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -127,6 +127,9 @@ pub fn make_growable<'a>( DataType::Interval(IntervalUnit::DayTime) => { dyn_growable!(days_ms, arrays, use_validity, capacity) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + dyn_growable!(months_days_ns, arrays, use_validity, capacity) + } DataType::Decimal(_, _) => dyn_growable!(i128, arrays, use_validity, capacity), DataType::UInt8 => dyn_growable!(u8, arrays, use_validity, capacity), DataType::UInt16 => dyn_growable!(u16, arrays, use_validity, capacity), diff --git a/src/array/mod.rs b/src/array/mod.rs index 3505a6204a6..24101deb897 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::fmt::Display; use crate::error::Result; -use crate::types::days_ms; +use crate::types::{days_ms, months_days_ns}; use crate::{ bitmap::{Bitmap, MutableBitmap}, datatypes::{DataType, IntervalUnit}, @@ -163,6 +163,9 @@ impl Display for dyn Array { DataType::Interval(IntervalUnit::DayTime) => { fmt_dyn!(self, PrimitiveArray, f) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + fmt_dyn!(self, PrimitiveArray, f) + } DataType::Int64 | DataType::Date64 | DataType::Time64(_) @@ -217,6 +220,9 @@ pub fn new_empty_array(data_type: DataType) -> Box { DataType::Interval(IntervalUnit::DayTime) => { Box::new(PrimitiveArray::::new_empty(data_type)) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Box::new(PrimitiveArray::::new_empty(data_type)) + } DataType::Int64 | DataType::Date64 | DataType::Time64(_) @@ -271,6 +277,9 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { DataType::Interval(IntervalUnit::DayTime) => { Box::new(PrimitiveArray::::new_null(data_type, length)) } + DataType::Interval(IntervalUnit::MonthDayNano) => Box::new( + PrimitiveArray::::new_null(data_type, length), + ), DataType::Int64 | DataType::Date64 | DataType::Time64(_) @@ -332,6 +341,9 @@ pub fn clone(array: &dyn Array) -> Box { clone_dyn!(array, PrimitiveArray) } DataType::Interval(IntervalUnit::DayTime) => clone_dyn!(array, PrimitiveArray), + DataType::Interval(IntervalUnit::MonthDayNano) => { + clone_dyn!(array, PrimitiveArray) + } DataType::Int64 | DataType::Date64 | DataType::Time64(_) diff --git a/src/array/primitive/display.rs b/src/array/primitive/display.rs index a884f3cbeea..b7ecbaccd36 100644 --- a/src/array/primitive/display.rs +++ b/src/array/primitive/display.rs @@ -1,4 +1,8 @@ -use crate::{datatypes::*, temporal_conversions, types::days_ms}; +use crate::{ + datatypes::*, + temporal_conversions, + types::{days_ms, months_days_ns}, +}; use super::super::{display_fmt, Array}; use super::PrimitiveArray; @@ -159,6 +163,18 @@ impl std::fmt::Display for PrimitiveArray { } } +impl std::fmt::Display for PrimitiveArray { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let new_lines = false; + let head = &format!("{}", self.data_type()); + let iter = self.iter().map(|x| { + x.copied() + .map(|x| format!("{}m{}d{}ns", x.months(), x.days(), x.ns())) + }); + display_fmt(iter, head, f, new_lines) + } +} + macro_rules! display { ($ty:ty) => { impl std::fmt::Display for PrimitiveArray<$ty> { diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 5a394e817c4..ca7fd55f372 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -3,7 +3,7 @@ use crate::{ buffer::Buffer, datatypes::*, error::ArrowError, - types::{days_ms, NativeType}, + types::{days_ms, months_days_ns, NativeType}, }; use super::Array; @@ -167,6 +167,8 @@ pub type Int64Array = PrimitiveArray; pub type Int128Array = PrimitiveArray; /// A type definition [`PrimitiveArray`] for [`days_ms`] pub type DaysMsArray = PrimitiveArray; +/// A type definition [`PrimitiveArray`] for [`months_days_ns`] +pub type MonthsDaysNsArray = PrimitiveArray; /// A type definition [`PrimitiveArray`] for `f32` pub type Float32Array = PrimitiveArray; /// A type definition [`PrimitiveArray`] for `f64` @@ -192,6 +194,8 @@ pub type Int64Vec = MutablePrimitiveArray; pub type Int128Vec = MutablePrimitiveArray; /// A type definition [`MutablePrimitiveArray`] for [`days_ms`] pub type DaysMsVec = MutablePrimitiveArray; +/// A type definition [`MutablePrimitiveArray`] for [`months_days_ns`] +pub type MonthsDaysNsVec = MutablePrimitiveArray; /// A type definition [`MutablePrimitiveArray`] for `f32` pub type Float32Vec = MutablePrimitiveArray; /// A type definition [`MutablePrimitiveArray`] for `f64` diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 7b1eb239d00..045ed1c19a9 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -1,7 +1,7 @@ use crate::array::*; use crate::bitmap::Bitmap; use crate::datatypes::{DataType, IntervalUnit}; -use crate::types::days_ms; +use crate::types::{days_ms, months_days_ns}; fn validity_size(validity: &Option) -> usize { validity.as_ref().map(|b| b.as_slice().len()).unwrap_or(0) @@ -72,6 +72,7 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { Float64 => dyn_primitive!(array, f64), Decimal(_, _) => dyn_primitive!(array, i128), Interval(IntervalUnit::DayTime) => dyn_primitive!(array, days_ms), + Interval(IntervalUnit::MonthDayNano) => dyn_primitive!(array, months_days_ns), Binary => dyn_binary!(array, BinaryArray, i32), FixedSizeBinary(_) => { let array = array diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 6603ed80331..462f79a3aad 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -149,6 +149,13 @@ pub enum IntervalUnit { /// Indicates the number of elapsed days and milliseconds, /// stored as 2 contiguous 32-bit integers (8-bytes in total). DayTime, + /// The values are stored contiguously in 16 byte blocks. Months and + /// days are encoded as 32 bit integers and nanoseconds is encoded as a + /// 64 bit integer. All integers are signed. Each field is independent + /// (e.g. there is no constraint that nanoseconds have the same sign + /// as days or that the quantitiy of nanoseconds represents less + /// then a day's worth of time). + MonthDayNano, } impl DataType { diff --git a/src/io/ipc/convert.rs b/src/io/ipc/convert.rs index c5452db242d..33e7a4ec873 100644 --- a/src/io/ipc/convert.rs +++ b/src/io/ipc/convert.rs @@ -227,6 +227,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT match interval.unit() { ipc::IntervalUnit::YEAR_MONTH => DataType::Interval(IntervalUnit::YearMonth), ipc::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime), + ipc::IntervalUnit::MONTH_DAY_NANO => DataType::Interval(IntervalUnit::MonthDayNano), z => panic!("Interval type with unit of {:?} unsupported", z), } } @@ -513,6 +514,7 @@ pub(crate) fn get_fb_field_type<'a>( let interval_unit = match unit { IntervalUnit::YearMonth => ipc::IntervalUnit::YEAR_MONTH, IntervalUnit::DayTime => ipc::IntervalUnit::DAY_TIME, + IntervalUnit::MonthDayNano => ipc::IntervalUnit::MONTH_DAY_NANO, }; builder.add_unit(interval_unit); FbFieldType { diff --git a/src/io/ipc/gen/Schema.rs b/src/io/ipc/gen/Schema.rs index 594551f2103..25cd2880ecd 100644 --- a/src/io/ipc/gen/Schema.rs +++ b/src/io/ipc/gen/Schema.rs @@ -18,7 +18,7 @@ #![allow(dead_code)] #![allow(unused_imports)] -use flatbuffers::EndianScalar; +use flatbuffers::{EndianScalar, Follow}; use std::{cmp::Ordering, mem}; // automatically generated by the FlatBuffers compiler, do not modify @@ -45,7 +45,7 @@ pub const ENUM_VALUES_METADATA_VERSION: [MetadataVersion; 5] = [ MetadataVersion::V5, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct MetadataVersion(pub i16); #[allow(non_upper_case_globals)] @@ -104,7 +104,9 @@ impl flatbuffers::Push for MetadataVersion { type Output = MetadataVersion; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -115,6 +117,7 @@ impl flatbuffers::EndianScalar for MetadataVersion { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -127,7 +130,6 @@ impl<'a> flatbuffers::Verifiable for MetadataVersion { v: &mut flatbuffers::Verifier, pos: usize, ) -> Result<(), flatbuffers::InvalidFlatbuffer> { - use flatbuffers::Verifiable; i16::run_verifier(v, pos) } } @@ -171,7 +173,7 @@ pub const ENUM_VALUES_FEATURE: [Feature; 3] = [ /// Enums added to this list should be assigned power-of-two values /// to facilitate exchanging and comparing bitmaps for supported /// features. -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Feature(pub i64); #[allow(non_upper_case_globals)] @@ -225,7 +227,9 @@ impl flatbuffers::Push for Feature { type Output = Feature; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -236,6 +240,7 @@ impl flatbuffers::EndianScalar for Feature { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i64::from_le(self.0); Self(b) @@ -271,7 +276,7 @@ pub const ENUM_MAX_UNION_MODE: i16 = 1; #[allow(non_camel_case_types)] pub const ENUM_VALUES_UNION_MODE: [UnionMode; 2] = [UnionMode::Sparse, UnionMode::Dense]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct UnionMode(pub i16); #[allow(non_upper_case_globals)] @@ -313,7 +318,9 @@ impl flatbuffers::Push for UnionMode { type Output = UnionMode; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -324,6 +331,7 @@ impl flatbuffers::EndianScalar for UnionMode { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -360,7 +368,7 @@ pub const ENUM_MAX_PRECISION: i16 = 2; pub const ENUM_VALUES_PRECISION: [Precision; 3] = [Precision::HALF, Precision::SINGLE, Precision::DOUBLE]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Precision(pub i16); #[allow(non_upper_case_globals)] @@ -404,7 +412,9 @@ impl flatbuffers::Push for Precision { type Output = Precision; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -415,6 +425,7 @@ impl flatbuffers::EndianScalar for Precision { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -450,7 +461,7 @@ pub const ENUM_MAX_DATE_UNIT: i16 = 1; #[allow(non_camel_case_types)] pub const ENUM_VALUES_DATE_UNIT: [DateUnit; 2] = [DateUnit::DAY, DateUnit::MILLISECOND]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct DateUnit(pub i16); #[allow(non_upper_case_globals)] @@ -492,7 +503,9 @@ impl flatbuffers::Push for DateUnit { type Output = DateUnit; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -503,6 +516,7 @@ impl flatbuffers::EndianScalar for DateUnit { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -543,7 +557,7 @@ pub const ENUM_VALUES_TIME_UNIT: [TimeUnit; 4] = [ TimeUnit::NANOSECOND, ]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct TimeUnit(pub i16); #[allow(non_upper_case_globals)] @@ -594,7 +608,9 @@ impl flatbuffers::Push for TimeUnit { type Output = TimeUnit; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -605,6 +621,7 @@ impl flatbuffers::EndianScalar for TimeUnit { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -632,31 +649,37 @@ pub const ENUM_MIN_INTERVAL_UNIT: i16 = 0; since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] -pub const ENUM_MAX_INTERVAL_UNIT: i16 = 1; +pub const ENUM_MAX_INTERVAL_UNIT: i16 = 2; #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." )] #[allow(non_camel_case_types)] -pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 2] = - [IntervalUnit::YEAR_MONTH, IntervalUnit::DAY_TIME]; +pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 3] = [ + IntervalUnit::YEAR_MONTH, + IntervalUnit::DAY_TIME, + IntervalUnit::MONTH_DAY_NANO, +]; -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct IntervalUnit(pub i16); #[allow(non_upper_case_globals)] impl IntervalUnit { pub const YEAR_MONTH: Self = Self(0); pub const DAY_TIME: Self = Self(1); + pub const MONTH_DAY_NANO: Self = Self(2); pub const ENUM_MIN: i16 = 0; - pub const ENUM_MAX: i16 = 1; - pub const ENUM_VALUES: &'static [Self] = &[Self::YEAR_MONTH, Self::DAY_TIME]; + pub const ENUM_MAX: i16 = 2; + pub const ENUM_VALUES: &'static [Self] = + &[Self::YEAR_MONTH, Self::DAY_TIME, Self::MONTH_DAY_NANO]; /// Returns the variant's name or "" if unknown. pub fn variant_name(self) -> Option<&'static str> { match self { Self::YEAR_MONTH => Some("YEAR_MONTH"), Self::DAY_TIME => Some("DAY_TIME"), + Self::MONTH_DAY_NANO => Some("MONTH_DAY_NANO"), _ => None, } } @@ -683,7 +706,9 @@ impl flatbuffers::Push for IntervalUnit { type Output = IntervalUnit; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -694,6 +719,7 @@ impl flatbuffers::EndianScalar for IntervalUnit { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -755,7 +781,7 @@ pub const ENUM_VALUES_TYPE: [Type; 22] = [ /// ---------------------------------------------------------------------- /// Top-level Type value, enabling extensible type-specific metadata. We can /// add new logical types to Type without breaking backwards compatibility -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Type(pub u8); #[allow(non_upper_case_globals)] @@ -847,7 +873,6 @@ impl std::fmt::Debug for Type { } } } -pub struct TypeUnionTableOffset {} impl<'a> flatbuffers::Follow<'a> for Type { type Inner = Self; #[inline] @@ -861,7 +886,9 @@ impl flatbuffers::Push for Type { type Output = Type; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -872,6 +899,7 @@ impl flatbuffers::EndianScalar for Type { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = u8::from_le(self.0); Self(b) @@ -890,6 +918,8 @@ impl<'a> flatbuffers::Verifiable for Type { } impl flatbuffers::SimpleToVerifyInSlice for Type {} +pub struct TypeUnionTableOffset {} + #[deprecated( since = "2.0.0", note = "Use associated constants instead. This will no longer be generated in 2021." @@ -912,7 +942,7 @@ pub const ENUM_VALUES_DICTIONARY_KIND: [DictionaryKind; 1] = [DictionaryKind::De /// Maintained for forwards compatibility, in the future /// Dictionaries might be explicit maps between integers and values /// allowing for non-contiguous index values -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct DictionaryKind(pub i16); #[allow(non_upper_case_globals)] @@ -952,7 +982,9 @@ impl flatbuffers::Push for DictionaryKind { type Output = DictionaryKind; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -963,6 +995,7 @@ impl flatbuffers::EndianScalar for DictionaryKind { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -1000,7 +1033,7 @@ pub const ENUM_VALUES_ENDIANNESS: [Endianness; 2] = [Endianness::Little, Endiann /// ---------------------------------------------------------------------- /// Endianness of the platform producing the data -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] #[repr(transparent)] pub struct Endianness(pub i16); #[allow(non_upper_case_globals)] @@ -1042,7 +1075,9 @@ impl flatbuffers::Push for Endianness { type Output = Endianness; #[inline] fn push(&self, dst: &mut [u8], _rest: &[u8]) { - unsafe { flatbuffers::emplace_scalar::(dst, self.0) }; + unsafe { + flatbuffers::emplace_scalar::(dst, self.0); + } } } @@ -1053,6 +1088,7 @@ impl flatbuffers::EndianScalar for Endianness { Self(b) } #[inline] + #[allow(clippy::wrong_self_convention)] fn from_little_endian(self) -> Self { let b = i16::from_le(self.0); Self(b) @@ -1077,6 +1113,11 @@ impl flatbuffers::SimpleToVerifyInSlice for Endianness {} #[repr(transparent)] #[derive(Clone, Copy, PartialEq)] pub struct Buffer(pub [u8; 16]); +impl Default for Buffer { + fn default() -> Self { + Self([0; 16]) + } +} impl std::fmt::Debug for Buffer { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { f.debug_struct("Buffer") @@ -1134,7 +1175,7 @@ impl<'a> flatbuffers::Verifiable for Buffer { v.in_buffer::(pos) } } -impl Buffer { +impl<'a> Buffer { #[allow(clippy::too_many_arguments)] pub fn new(offset: i64, length: i64) -> Self { let mut s = Self([0; 16]); @@ -2911,8 +2952,33 @@ pub enum TimestampOffset {} /// leap seconds, as a 64-bit integer. Note that UNIX time does not include /// leap seconds. /// -/// The Timestamp metadata supports both "time zone naive" and "time zone -/// aware" timestamps. Read about the timezone attribute for more detail +/// Date & time libraries often have multiple different data types for temporal +/// data. In order to ease interoperability between different implementations the +/// Arrow project has some recommendations for encoding these types into a Timestamp +/// column. +/// +/// An "instant" represents a single moment in time that has no meaningful time zone +/// or the time zone is unknown. A column of instants can also contain values from +/// multiple time zones. To encode an instant set the timezone string to "UTC". +/// +/// A "zoned date-time" represents a single moment in time that has a meaningful +/// reference time zone. To encode a zoned date-time as a Timestamp set the timezone +/// string to the name of the timezone. There is some ambiguity between an instant +/// and a zoned date-time with the UTC time zone. Both of these are stored the same. +/// Typically, this distinction does not matter. If it does, then an application should +/// use custom metadata or an extension type to distinguish between the two cases. +/// +/// An "offset date-time" represents a single moment in time combined with a meaningful +/// offset from UTC. To encode an offset date-time as a Timestamp set the timezone string +/// to the numeric time zone offset string (e.g. "+03:00"). +/// +/// A "local date-time" does not represent a single moment in time. It represents a wall +/// clock time combined with a date. Because of daylight savings time there may multiple +/// instants that correspond to a single local date-time in any given time zone. A +/// local date-time is often stored as a struct or a Date32/Time64 pair. However, it can +/// also be encoded into a Timestamp column. To do so the value should be the the time +/// elapsed from the Unix epoch so that a wall clock in UTC would display the desired time. +/// The timezone string should be set to null or the empty string. pub struct Timestamp<'a> { pub _tab: flatbuffers::Table<'a>, } @@ -2963,11 +3029,9 @@ impl<'a> Timestamp<'a> { /// Whether a timezone string is present indicates different semantics about /// the data: /// - /// * If the time zone is null or equal to an empty string, the data is "time - /// zone naive" and shall be displayed *as is* to the user, not localized - /// to the locale of the user. This data can be though of as UTC but - /// without having "UTC" as the time zone, it is not considered to be - /// localized to any time zone + /// * If the time zone is null or an empty string, the data is a local date-time + /// and does not represent a single moment in time. Instead it represents a wall clock + /// time and care should be taken to avoid interpreting it semantically as an instant. /// /// * If the time zone is set to a valid value, values can be displayed as /// "localized" to that time zone, even though the underlying 64-bit diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index f244024bb0a..65a64c1bb56 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -12,7 +12,10 @@ use std::{ use crate::datatypes::{DataType, IntervalUnit}; use crate::error::Result; use crate::io::ipc::gen::Message::BodyCompression; -use crate::{array::*, types::days_ms}; +use crate::{ + array::*, + types::{days_ms, months_days_ns}, +}; use super::super::gen; use super::array::*; @@ -104,6 +107,16 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), + DataType::Interval(IntervalUnit::MonthDayNano) => read_primitive::( + field_nodes, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + compression, + ) + .map(|x| Arc::new(x) as Arc), DataType::UInt8 => read_primitive::( field_nodes, data_type, diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index 60f529395d0..52eb8aa76ca 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -25,7 +25,7 @@ use crate::{ endianess::is_native_little_endian, io::ipc::gen::Message, trusted_len::TrustedLen, - types::{days_ms, NativeType}, + types::{days_ms, months_days_ns, NativeType}, }; use crate::io::ipc::gen::Schema; @@ -412,6 +412,9 @@ pub fn write( DataType::Interval(IntervalUnit::DayTime) => { write_primitive::(array, buffers, arrow_data, offset, is_little_endian) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + write_primitive::(array, buffers, arrow_data, offset, is_little_endian) + } DataType::UInt8 => { write_primitive::(array, buffers, arrow_data, offset, is_little_endian) } diff --git a/src/io/json/schema.rs b/src/io/json/schema.rs index c1e0c89f2ed..564b16825c8 100644 --- a/src/io/json/schema.rs +++ b/src/io/json/schema.rs @@ -103,6 +103,7 @@ impl ToJson for DataType { DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { IntervalUnit::YearMonth => "YEAR_MONTH", IntervalUnit::DayTime => "DAY_TIME", + IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", }}), DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { TimeUnit::Second => "SECOND", @@ -251,6 +252,9 @@ impl TryFrom<&Value> for DataType { Some(s) if s == "interval" => match map.get("unit") { Some(p) if p == "DAY_TIME" => Ok(DataType::Interval(IntervalUnit::DayTime)), Some(p) if p == "YEAR_MONTH" => Ok(DataType::Interval(IntervalUnit::YearMonth)), + Some(p) if p == "MONTH_DAY_NANO" => { + Ok(DataType::Interval(IntervalUnit::MonthDayNano)) + } _ => Err(ArrowError::Schema( "interval unit missing or invalid".to_string(), )), diff --git a/src/io/json_integration/read.rs b/src/io/json_integration/read.rs index 6063335a95b..9374b6dfecc 100644 --- a/src/io/json_integration/read.rs +++ b/src/io/json_integration/read.rs @@ -27,7 +27,7 @@ use crate::{ datatypes::{DataType, Field, IntervalUnit, Schema}, error::{ArrowError, Result}, record_batch::RecordBatch, - types::{days_ms, NativeType}, + types::{days_ms, months_days_ns, NativeType}, }; use super::{ArrowJsonBatch, ArrowJsonColumn, ArrowJsonDictionaryBatch}; @@ -55,7 +55,7 @@ fn to_offsets(offsets: Option<&Vec>) -> Buffer { .collect() } -fn to_interval(value: &Value) -> days_ms { +fn to_days_ms(value: &Value) -> days_ms { if let Value::Object(v) = value { let days = v.get("days").unwrap(); let milliseconds = v.get("milliseconds").unwrap(); @@ -72,7 +72,26 @@ fn to_interval(value: &Value) -> days_ms { } } -fn to_primitive_interval( +fn to_months_days_ns(value: &Value) -> months_days_ns { + if let Value::Object(v) = value { + let months = v.get("months").unwrap(); + let days = v.get("days").unwrap(); + let nanoseconds = v.get("nanoseconds").unwrap(); + match (months, days, nanoseconds) { + (Value::Number(months), Value::Number(days), Value::Number(nanoseconds)) => { + let months = months.as_i64().unwrap() as i32; + let days = days.as_i64().unwrap() as i32; + let nanoseconds = nanoseconds.as_i64().unwrap(); + months_days_ns::new(months, days, nanoseconds) + } + (_, _, _) => panic!(), + } + } else { + panic!() + } +} + +fn to_primitive_days_ms( json_col: &ArrowJsonColumn, data_type: DataType, ) -> PrimitiveArray { @@ -82,11 +101,26 @@ fn to_primitive_interval( .as_ref() .unwrap() .iter() - .map(to_interval) + .map(to_days_ms) .collect(); PrimitiveArray::::from_data(data_type, values, validity) } +fn to_primitive_months_days_ns( + json_col: &ArrowJsonColumn, + data_type: DataType, +) -> PrimitiveArray { + let validity = to_validity(&json_col.validity); + let values = json_col + .data + .as_ref() + .unwrap() + .iter() + .map(to_months_days_ns) + .collect(); + PrimitiveArray::::from_data(data_type, values, validity) +} + fn to_decimal(json_col: &ArrowJsonColumn, data_type: DataType) -> PrimitiveArray { let validity = to_validity(&json_col.validity); let values = json_col @@ -252,8 +286,11 @@ pub fn to_array( | DataType::Timestamp(_, _) | DataType::Duration(_) => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), DataType::Interval(IntervalUnit::DayTime) => { - Ok(Arc::new(to_primitive_interval(json_col, data_type.clone()))) + Ok(Arc::new(to_primitive_days_ms(json_col, data_type.clone()))) } + DataType::Interval(IntervalUnit::MonthDayNano) => Ok(Arc::new( + to_primitive_months_days_ns(json_col, data_type.clone()), + )), DataType::Decimal(_, _) => Ok(Arc::new(to_decimal(json_col, data_type.clone()))), DataType::UInt8 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), DataType::UInt16 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs index 9ec6417ad66..26e93d1d2e3 100644 --- a/src/scalar/mod.rs +++ b/src/scalar/mod.rs @@ -1,6 +1,10 @@ use std::any::Any; -use crate::{array::*, datatypes::*, types::days_ms}; +use crate::{ + array::*, + datatypes::*, + types::{days_ms, months_days_ns}, +}; mod equal; mod primitive; @@ -101,6 +105,7 @@ pub fn new_scalar(array: &dyn Array, index: usize) -> Box { } Int64 | Date64 | Time64(_) | Duration(_) | Timestamp(_, _) => dyn_new!(array, index, i64), Interval(IntervalUnit::DayTime) => dyn_new!(array, index, days_ms), + Interval(IntervalUnit::MonthDayNano) => dyn_new!(array, index, months_days_ns), UInt8 => dyn_new!(array, index, u8), UInt16 => dyn_new!(array, index, u16), UInt32 => dyn_new!(array, index, u32), diff --git a/src/types/mod.rs b/src/types/mod.rs index fb96b099b22..d325eb05c18 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -121,6 +121,10 @@ natural_type!(i64, DataType::Int64); natural_type!(f32, DataType::Float32); natural_type!(f64, DataType::Float64); natural_type!(days_ms, DataType::Interval(IntervalUnit::DayTime)); +natural_type!( + months_days_ns, + DataType::Interval(IntervalUnit::MonthDayNano) +); natural_type!(i128, DataType::Decimal(32, 32)); // users should set the decimal when creating an array create_relation!(u8, &DataType::UInt8); @@ -246,3 +250,126 @@ impl PartialOrd for days_ms { Some(self.cmp(other)) } } + +/// The in-memory representation of the MonthDayNano variant of the "Interval" logical type. +#[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash)] +#[allow(non_camel_case_types)] +#[repr(C)] +pub struct months_days_ns(i32, i32, i64); + +impl std::fmt::Display for months_days_ns { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}m {}d {}ns", self.months(), self.days(), self.ns()) + } +} + +unsafe impl NativeType for months_days_ns { + type Bytes = [u8; 16]; + #[inline] + fn to_le_bytes(&self) -> Self::Bytes { + let months = self.months().to_le_bytes(); + let days = self.days().to_le_bytes(); + let ns = self.ns().to_le_bytes(); + let mut result = [0; 16]; + result[0] = months[0]; + result[1] = months[1]; + result[2] = months[2]; + result[3] = months[3]; + result[4] = days[0]; + result[5] = days[1]; + result[6] = days[2]; + result[7] = days[3]; + (0..8).for_each(|i| { + result[8 + i] = ns[i]; + }); + result + } + + #[inline] + fn to_be_bytes(&self) -> Self::Bytes { + let months = self.months().to_be_bytes(); + let days = self.days().to_be_bytes(); + let ns = self.ns().to_be_bytes(); + let mut result = [0; 16]; + result[0] = months[0]; + result[1] = months[1]; + result[2] = months[2]; + result[3] = months[3]; + result[4] = days[0]; + result[5] = days[1]; + result[6] = days[2]; + result[7] = days[3]; + (0..8).for_each(|i| { + result[8 + i] = ns[i]; + }); + result + } + + #[inline] + fn from_be_bytes(bytes: Self::Bytes) -> Self { + let mut months = [0; 4]; + months[0] = bytes[0]; + months[1] = bytes[1]; + months[2] = bytes[2]; + months[3] = bytes[3]; + let mut days = [0; 4]; + days[0] = bytes[4]; + days[1] = bytes[5]; + days[2] = bytes[6]; + days[3] = bytes[7]; + let mut ns = [0; 8]; + (0..8).for_each(|i| { + ns[i] = bytes[8 + i]; + }); + Self( + i32::from_be_bytes(months), + i32::from_be_bytes(days), + i64::from_be_bytes(ns), + ) + } +} + +create_relation!( + months_days_ns, + &DataType::Interval(IntervalUnit::MonthDayNano) +); + +impl months_days_ns { + #[inline] + pub fn new(months: i32, days: i32, nanoseconds: i64) -> Self { + Self(months, days, nanoseconds) + } + + #[inline] + pub fn months(&self) -> i32 { + self.0 + } + + #[inline] + pub fn days(&self) -> i32 { + self.1 + } + + #[inline] + pub fn ns(&self) -> i64 { + self.2 + } +} + +impl Ord for months_days_ns { + fn cmp(&self, other: &Self) -> Ordering { + match self.months().cmp(&other.months()) { + Ordering::Equal => match self.days().cmp(&other.days()) { + Ordering::Equal => self.ns().cmp(&other.ns()), + other => other, + }, + other => other, + } + } +} + +impl PartialOrd for months_days_ns { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +}