From b8b3ec4b6e51c547fb76f4d4677c6ccb7051e556 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 29 Aug 2021 17:33:11 +0000 Subject: [PATCH 01/12] Improved StructArray constructors. --- src/array/growable/structure.rs | 13 ++++++- src/array/mod.rs | 4 +- src/array/struct_.rs | 60 ++++++++++++++++++------------ src/compute/take/mod.rs | 4 +- src/compute/take/structure.rs | 2 +- src/io/ipc/read/array/struct_.rs | 2 +- src/io/json/read/deserialize.rs | 2 +- src/io/json_integration/read.rs | 2 +- src/record_batch.rs | 2 +- tests/it/array/growable/struct_.rs | 4 +- tests/it/io/json/mod.rs | 4 +- tests/it/io/json/read.rs | 4 +- tests/it/io/json/write.rs | 8 ++-- tests/it/io/print.rs | 2 +- 14 files changed, 67 insertions(+), 46 deletions(-) diff --git a/src/array/growable/structure.rs b/src/array/growable/structure.rs index f0f9053e952..620fe191509 100644 --- a/src/array/growable/structure.rs +++ b/src/array/growable/structure.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use crate::{ array::{Array, StructArray}, bitmap::MutableBitmap, + datatypes::DataType, }; use super::{ @@ -68,7 +69,11 @@ impl<'a> GrowableStruct<'a> { let values = std::mem::take(&mut self.values); let values = values.into_iter().map(|mut x| x.as_arc()).collect(); - StructArray::from_data(self.arrays[0].fields().to_vec(), values, validity.into()) + StructArray::from_data( + DataType::Struct(self.arrays[0].fields().to_vec()), + values, + validity.into(), + ) } } @@ -116,6 +121,10 @@ impl<'a> From> for StructArray { fn from(val: GrowableStruct<'a>) -> Self { let values = val.values.into_iter().map(|mut x| x.as_arc()).collect(); - StructArray::from_data(val.arrays[0].fields().to_vec(), values, val.validity.into()) + StructArray::from_data( + DataType::Struct(val.arrays[0].fields().to_vec()), + values, + val.validity.into(), + ) } } diff --git a/src/array/mod.rs b/src/array/mod.rs index 46bc3730d8f..e54bf718f69 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -249,7 +249,7 @@ pub fn 
new_empty_array(data_type: DataType) -> Box { DataType::List(_) => Box::new(ListArray::::new_empty(data_type)), DataType::LargeList(_) => Box::new(ListArray::::new_empty(data_type)), DataType::FixedSizeList(_, _) => Box::new(FixedSizeListArray::new_empty(data_type)), - DataType::Struct(fields) => Box::new(StructArray::new_empty(&fields)), + DataType::Struct(_) => Box::new(StructArray::new_empty(data_type)), DataType::Union(_, _, _) => Box::new(UnionArray::new_empty(data_type)), DataType::Dictionary(key_type, value_type) => { with_match_dictionary_key_type!(key_type.as_ref(), |$T| { @@ -298,7 +298,7 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { DataType::List(_) => Box::new(ListArray::::new_null(data_type, length)), DataType::LargeList(_) => Box::new(ListArray::::new_null(data_type, length)), DataType::FixedSizeList(_, _) => Box::new(FixedSizeListArray::new_null(data_type, length)), - DataType::Struct(fields) => Box::new(StructArray::new_null(&fields, length)), + DataType::Struct(_) => Box::new(StructArray::new_null(data_type, length)), DataType::Union(_, _, _) => Box::new(UnionArray::new_null(data_type, length)), DataType::Dictionary(key_type, value_type) => { with_match_dictionary_key_type!(key_type.as_ref(), |$T| { diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 88bb1057b66..488df1097d5 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -35,21 +35,29 @@ pub struct StructArray { impl StructArray { /// Creates an empty [`StructArray`]. 
- pub fn new_empty(fields: &[Field]) -> Self { - let values = fields - .iter() - .map(|field| new_empty_array(field.data_type().clone()).into()) - .collect(); - Self::from_data(fields.to_vec(), values, None) + pub fn new_empty(data_type: DataType) -> Self { + if let DataType::Struct(fields) = &data_type { + let values = fields + .iter() + .map(|field| new_empty_array(field.data_type().clone()).into()) + .collect(); + Self::from_data(data_type, values, None) + } else { + panic!("StructArray must be initialized with DataType::Struct"); + } } /// Creates a null [`StructArray`] of length `length`. - pub fn new_null(fields: &[Field], length: usize) -> Self { - let values = fields - .iter() - .map(|field| new_null_array(field.data_type().clone(), length).into()) - .collect(); - Self::from_data(fields.to_vec(), values, Some(Bitmap::new_zeroed(length))) + pub fn new_null(data_type: DataType, length: usize) -> Self { + if let DataType::Struct(fields) = &data_type { + let values = fields + .iter() + .map(|field| new_null_array(field.data_type().clone(), length).into()) + .collect(); + Self::from_data(data_type, values, Some(Bitmap::new_zeroed(length))) + } else { + panic!("StructArray must be initialized with DataType::Struct"); + } } /// Canonical method to create a [`StructArray`]. @@ -58,20 +66,24 @@ impl StructArray { /// * values's len is different from Fields' length. /// * any element of values has a different length than the first element. 
pub fn from_data( - fields: Vec, + data_type: DataType, values: Vec>, validity: Option, ) -> Self { - assert!(!fields.is_empty()); - assert_eq!(fields.len(), values.len()); - assert!(values.iter().all(|x| x.len() == values[0].len())); - if let Some(ref validity) = validity { - assert_eq!(values[0].len(), validity.len()); - } - Self { - data_type: DataType::Struct(fields), - values, - validity, + if let DataType::Struct(fields) = &data_type { + assert!(!fields.is_empty()); + assert_eq!(fields.len(), values.len()); + assert!(values.iter().all(|x| x.len() == values[0].len())); + if let Some(ref validity) = validity { + assert_eq!(values[0].len(), validity.len()); + } + Self { + data_type, + values, + validity, + } + } else { + panic!("StructArray must be initialized with DataType::Struct"); } } @@ -199,6 +211,6 @@ unsafe impl FromFfi for StructArray { if offset > 0 { validity = validity.map(|x| x.slice(offset, length)) } - Ok(Self::from_data(fields, values, validity)) + Ok(Self::from_data(DataType::Struct(fields), values, validity)) } } diff --git a/src/compute/take/mod.rs b/src/compute/take/mod.rs index 18685b0e382..ebbb172bb5a 100644 --- a/src/compute/take/mod.rs +++ b/src/compute/take/mod.rs @@ -253,7 +253,7 @@ mod tests { Field::new("b", DataType::Int32, true), ]; StructArray::from_data( - fields, + DataType::Struct(fields), vec![ Arc::new(boolean) as Arc, Arc::new(int) as Arc, @@ -277,7 +277,7 @@ mod tests { .collect::() .into(); let expected = StructArray::from_data( - array.fields().to_vec(), + array.data_type().clone(), vec![ Arc::new(boolean) as Arc, Arc::new(int) as Arc, diff --git a/src/compute/take/structure.rs b/src/compute/take/structure.rs index b724b28759c..47aacbae1f3 100644 --- a/src/compute/take/structure.rs +++ b/src/compute/take/structure.rs @@ -61,7 +61,7 @@ pub fn take(array: &StructArray, indices: &PrimitiveArray) -> Resul .collect::>()?; let validity = take_validity(array.validity(), indices)?; Ok(StructArray::from_data( - 
array.fields().to_vec(), + array.data_type().clone(), values, validity, )) diff --git a/src/io/ipc/read/array/struct_.rs b/src/io/ipc/read/array/struct_.rs index c259849c37a..aa14105dbba 100644 --- a/src/io/ipc/read/array/struct_.rs +++ b/src/io/ipc/read/array/struct_.rs @@ -51,7 +51,7 @@ pub fn read_struct( }) .collect::>>()?; - Ok(StructArray::from_data(fields.to_vec(), values, validity)) + Ok(StructArray::from_data(data_type, values, validity)) } pub fn skip_struct( diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index fb9e56ef1f6..92e30c3a53e 100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -188,7 +188,7 @@ fn read_struct(rows: &[&Value], data_type: DataType) -> StructArray { .map(|(_, (data_type, values))| read(&values, data_type.clone())) .collect::>(); - StructArray::from_data(fields.to_vec(), values, None) + StructArray::from_data(data_type, values, None) } fn read_dictionary(rows: &[&Value], data_type: DataType) -> DictionaryArray { diff --git a/src/io/json_integration/read.rs b/src/io/json_integration/read.rs index 188aa6f412b..66901eae1fe 100644 --- a/src/io/json_integration/read.rs +++ b/src/io/json_integration/read.rs @@ -301,7 +301,7 @@ pub fn to_array( .map(|(field, col)| to_array(field, col, dictionaries)) .collect::>>()?; - let array = StructArray::from_data(fields.clone(), values, validity); + let array = StructArray::from_data(data_type.clone(), values, validity); Ok(Arc::new(array)) } DataType::Dictionary(key_type, _) => { diff --git a/src/record_batch.rs b/src/record_batch.rs index 61686936231..40164d59eb8 100644 --- a/src/record_batch.rs +++ b/src/record_batch.rs @@ -360,7 +360,7 @@ impl From for StructArray { .zip(batch.columns.iter()) .map(|t| (t.0.clone(), t.1.clone())) .unzip(); - StructArray::from_data(fields, values, None) + StructArray::from_data(DataType::Struct(fields), values, None) } } diff --git a/tests/it/array/growable/struct_.rs 
b/tests/it/array/growable/struct_.rs index f26e4c08ae4..0dd85e0dcd1 100644 --- a/tests/it/array/growable/struct_.rs +++ b/tests/it/array/growable/struct_.rs @@ -7,7 +7,7 @@ use arrow2::array::{ use arrow2::bitmap::Bitmap; use arrow2::datatypes::{DataType, Field}; -fn some_values() -> (Vec, Vec>) { +fn some_values() -> (DataType, Vec>) { let strings: Arc = Arc::new(Utf8Array::::from(&[ Some("a"), Some("aa"), @@ -26,7 +26,7 @@ fn some_values() -> (Vec, Vec>) { Field::new("f1", DataType::Utf8, true), Field::new("f2", DataType::Int32, true), ]; - (fields, vec![strings, ints]) + (DataType::Struct(fields), vec![strings, ints]) } #[test] diff --git a/tests/it/io/json/mod.rs b/tests/it/io/json/mod.rs index b8bc9a5f7be..1bf3d0856cd 100644 --- a/tests/it/io/json/mod.rs +++ b/tests/it/io/json/mod.rs @@ -212,11 +212,11 @@ fn case_struct() -> (String, Schema, Vec>) { // build expected output let d = Utf8Array::::from(&vec![Some("text"), None, Some("text"), None]); - let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + let c = StructArray::from_data(DataType::Struct(vec![d_field]), vec![Arc::new(d)], None); let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]); let expected = StructArray::from_data( - vec![Field::new("b", DataType::Boolean, true), c_field], + DataType::Struct(vec![Field::new("b", DataType::Boolean, true), c_field]), vec![Arc::new(b), Arc::new(c)], None, ); diff --git a/tests/it/io/json/read.rs b/tests/it/io/json/read.rs index 89dc10e39b4..4a7c0b0d3d3 100644 --- a/tests/it/io/json/read.rs +++ b/tests/it/io/json/read.rs @@ -154,7 +154,7 @@ fn nested_list_arrays() { None, ]); - let c = StructArray::from_data(vec![d_field], vec![Arc::new(d)], None); + let c = StructArray::from_data(DataType::Struct(vec![d_field]), vec![Arc::new(d)], None); let b = BooleanArray::from(vec![ Some(true), @@ -165,7 +165,7 @@ fn nested_list_arrays() { Some(true), ]); let a_struct = StructArray::from_data( - vec![b_field, c_field], + 
DataType::Struct(vec![b_field, c_field]), vec![Arc::new(b) as Arc, Arc::new(c) as Arc], None, ); diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index 13337ec7bd3..93c6fb27f8d 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -51,11 +51,11 @@ fn write_nested_structs() { ]); let c1 = StructArray::from_data( - fields, + DataType::Struct(fields), vec![ Arc::new(Int32Array::from(&[Some(1), None, Some(5)])), Arc::new(StructArray::from_data( - vec![c121], + DataType::Struct(vec![c121]), vec![Arc::new(Utf8Array::::from(&vec![ Some("e"), Some("f"), @@ -192,11 +192,11 @@ fn write_list_of_struct() { let schema = Schema::new(vec![field_c1, field_c2]); let s = StructArray::from_data( - fields, + DataType::Struct(fields), vec![ Arc::new(Int32Array::from(&[Some(1), None, Some(5)])), Arc::new(StructArray::from_data( - inner, + DataType::Struct(inner), vec![Arc::new(Utf8Array::::from(&vec![ Some("e"), Some("f"), diff --git a/tests/it/io/print.rs b/tests/it/io/print.rs index a2ee1b97ef9..040e7c4b555 100644 --- a/tests/it/io/print.rs +++ b/tests/it/io/print.rs @@ -360,7 +360,7 @@ fn write_struct() -> Result<()> { let validity = Some(Bitmap::from(&[true, false, true])); - let array = StructArray::from_data(fields, values, validity); + let array = StructArray::from_data(DataType::Struct(fields), values, validity); let schema = Schema::new(vec![Field::new("a", array.data_type().clone(), true)]); From 106e355fe0becbc90d5754a3ec0fa684dd946105 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 29 Aug 2021 21:05:20 +0000 Subject: [PATCH 02/12] Dict migration. 
--- src/array/dictionary/mod.rs | 12 ++++++++---- src/array/mod.rs | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index beb4960fff1..db0fa39cef1 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -50,10 +50,14 @@ impl DictionaryArray { /// Returns an [`DictionaryArray`] whose all elements are null #[inline] pub fn new_null(data_type: DataType, length: usize) -> Self { - Self::from_data( - PrimitiveArray::::new_null(K::DATA_TYPE, length), - new_empty_array(data_type).into(), - ) + if let DataType::Dictionary(_, data_type) = data_type { + Self::from_data( + PrimitiveArray::::new_null(K::DATA_TYPE, length), + new_empty_array(data_type.as_ref().clone()).into(), + ) + } else { + panic!("DictionaryArray must be initialized with DataType::Dictionary"); + } } /// The canonical method to create a new [`DictionaryArray`]. diff --git a/src/array/mod.rs b/src/array/mod.rs index e54bf718f69..ae2d11d8b49 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -300,9 +300,9 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { DataType::FixedSizeList(_, _) => Box::new(FixedSizeListArray::new_null(data_type, length)), DataType::Struct(_) => Box::new(StructArray::new_null(data_type, length)), DataType::Union(_, _, _) => Box::new(UnionArray::new_null(data_type, length)), - DataType::Dictionary(key_type, value_type) => { + DataType::Dictionary(ref key_type, _) => { with_match_dictionary_key_type!(key_type.as_ref(), |$T| { - Box::new(DictionaryArray::<$T>::new_null(*value_type, length)) + Box::new(DictionaryArray::<$T>::new_null(data_type, length)) }) } } From 140a703996f3595e0e50f5c4f3e0e12831ac72f9 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 30 Aug 2021 10:41:59 +0000 Subject: [PATCH 03/12] Migrated utf8. 
--- src/array/dictionary/mod.rs | 12 ++++--- src/array/growable/utf8.rs | 8 ++++- src/array/mod.rs | 12 +++---- src/array/utf8/ffi.rs | 3 +- src/array/utf8/mod.rs | 39 +++++++++++++-------- src/array/utf8/mutable.rs | 44 +++++++++++++++++++++--- src/compute/cast/utf8_to.rs | 10 ++++-- src/compute/substring.rs | 7 +++- src/compute/take/utf8.rs | 3 +- src/io/ipc/read/array/utf8.rs | 6 +++- src/io/ipc/read/deserialize.rs | 2 ++ src/io/json_integration/read.rs | 8 ++--- src/io/parquet/read/binary/basic.rs | 2 ++ src/io/parquet/read/binary/dictionary.rs | 9 ++++- src/io/parquet/read/binary/nested.rs | 1 + src/io/parquet/read/mod.rs | 4 +-- tests/it/array/utf8/mod.rs | 25 ++++++++++---- 17 files changed, 147 insertions(+), 48 deletions(-) diff --git a/src/array/dictionary/mod.rs b/src/array/dictionary/mod.rs index db0fa39cef1..acab539d6cb 100644 --- a/src/array/dictionary/mod.rs +++ b/src/array/dictionary/mod.rs @@ -43,17 +43,21 @@ pub struct DictionaryArray { impl DictionaryArray { /// Returns a new empty [`DictionaryArray`]. 
pub fn new_empty(data_type: DataType) -> Self { - let values = new_empty_array(data_type).into(); - Self::from_data(PrimitiveArray::::new_empty(K::DATA_TYPE), values) + if let DataType::Dictionary(_, values) = data_type { + let values = new_empty_array(values.as_ref().clone()).into(); + Self::from_data(PrimitiveArray::::new_empty(K::DATA_TYPE), values) + } else { + panic!("DictionaryArray must be initialized with DataType::Dictionary"); + } } /// Returns an [`DictionaryArray`] whose all elements are null #[inline] pub fn new_null(data_type: DataType, length: usize) -> Self { - if let DataType::Dictionary(_, data_type) = data_type { + if let DataType::Dictionary(_, values) = data_type { Self::from_data( PrimitiveArray::::new_null(K::DATA_TYPE, length), - new_empty_array(data_type.as_ref().clone()).into(), + new_empty_array(values.as_ref().clone()).into(), ) } else { panic!("DictionaryArray must be initialized with DataType::Dictionary"); diff --git a/src/array/growable/utf8.rs b/src/array/growable/utf8.rs index 2d34c64866c..355d1e20121 100644 --- a/src/array/growable/utf8.rs +++ b/src/array/growable/utf8.rs @@ -56,7 +56,12 @@ impl<'a, O: Offset> GrowableUtf8<'a, O> { let values = std::mem::take(&mut self.values); unsafe { - Utf8Array::::from_data_unchecked(offsets.into(), values.into(), validity.into()) + Utf8Array::::from_data_unchecked( + self.arrays[0].data_type().clone(), + offsets.into(), + values.into(), + validity.into(), + ) } } } @@ -96,6 +101,7 @@ impl<'a, O: Offset> From> for Utf8Array { fn from(val: GrowableUtf8<'a, O>) -> Self { unsafe { Utf8Array::::from_data_unchecked( + val.arrays[0].data_type().clone(), val.offsets.into(), val.values.into(), val.validity.into(), diff --git a/src/array/mod.rs b/src/array/mod.rs index ae2d11d8b49..a9110351414 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -244,16 +244,16 @@ pub fn new_empty_array(data_type: DataType) -> Box { DataType::Binary => Box::new(BinaryArray::::new_empty()), DataType::LargeBinary => 
Box::new(BinaryArray::::new_empty()), DataType::FixedSizeBinary(_) => Box::new(FixedSizeBinaryArray::new_empty(data_type)), - DataType::Utf8 => Box::new(Utf8Array::::new_empty()), - DataType::LargeUtf8 => Box::new(Utf8Array::::new_empty()), + DataType::Utf8 => Box::new(Utf8Array::::new_empty(data_type)), + DataType::LargeUtf8 => Box::new(Utf8Array::::new_empty(data_type)), DataType::List(_) => Box::new(ListArray::::new_empty(data_type)), DataType::LargeList(_) => Box::new(ListArray::::new_empty(data_type)), DataType::FixedSizeList(_, _) => Box::new(FixedSizeListArray::new_empty(data_type)), DataType::Struct(_) => Box::new(StructArray::new_empty(data_type)), DataType::Union(_, _, _) => Box::new(UnionArray::new_empty(data_type)), - DataType::Dictionary(key_type, value_type) => { + DataType::Dictionary(ref key_type, _) => { with_match_dictionary_key_type!(key_type.as_ref(), |$T| { - Box::new(DictionaryArray::<$T>::new_empty(*value_type)) + Box::new(DictionaryArray::<$T>::new_empty(data_type)) }) } } @@ -293,8 +293,8 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { DataType::Binary => Box::new(BinaryArray::::new_null(length)), DataType::LargeBinary => Box::new(BinaryArray::::new_null(length)), DataType::FixedSizeBinary(_) => Box::new(FixedSizeBinaryArray::new_null(data_type, length)), - DataType::Utf8 => Box::new(Utf8Array::::new_null(length)), - DataType::LargeUtf8 => Box::new(Utf8Array::::new_null(length)), + DataType::Utf8 => Box::new(Utf8Array::::new_null(data_type, length)), + DataType::LargeUtf8 => Box::new(Utf8Array::::new_null(data_type, length)), DataType::List(_) => Box::new(ListArray::::new_null(data_type, length)), DataType::LargeList(_) => Box::new(ListArray::::new_null(data_type, length)), DataType::FixedSizeList(_, _) => Box::new(FixedSizeListArray::new_null(data_type, length)), diff --git a/src/array/utf8/ffi.rs b/src/array/utf8/ffi.rs index adc2a1ea86c..13631725916 100644 --- a/src/array/utf8/ffi.rs +++ b/src/array/utf8/ffi.rs @@ 
-32,6 +32,7 @@ unsafe impl FromFfi for Utf8Array { offsets = offsets.slice(offset, length); validity = validity.map(|x| x.slice(offset, length)) } - Ok(Self::from_data(offsets, values, validity)) + let data_type = Self::default_data_type(); + Ok(Self::from_data(data_type, offsets, values, validity)) } } diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index bf627d2464f..00e297e3cff 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -37,14 +37,17 @@ pub struct Utf8Array { impl Utf8Array { /// Returns a new empty [`Utf8Array`]. #[inline] - pub fn new_empty() -> Self { - unsafe { Self::from_data_unchecked(Buffer::from(&[O::zero()]), Buffer::new(), None) } + pub fn new_empty(data_type: DataType) -> Self { + unsafe { + Self::from_data_unchecked(data_type, Buffer::from(&[O::zero()]), Buffer::new(), None) + } } /// Returns a new [`Utf8Array`] whose all slots are null / `None`. #[inline] - pub fn new_null(length: usize) -> Self { + pub fn new_null(data_type: DataType, length: usize) -> Self { Self::from_data( + data_type, Buffer::new_zeroed(length + 1), Buffer::new(), Some(Bitmap::new_zeroed(length)), @@ -54,21 +57,23 @@ impl Utf8Array { /// The canonical method to create a [`Utf8Array`] out of low-end APIs. /// # Panics /// This function panics iff: + /// * The `data_type`'s physical type is not consistent with the offset `O`. /// * The `offsets` and `values` are consistent /// * The `values` between `offsets` are utf8 encoded /// * The validity is not `None` and its length is different from `offsets`'s length minus one. 
- pub fn from_data(offsets: Buffer, values: Buffer, validity: Option) -> Self { + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { check_offsets_and_utf8(&offsets, &values); if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } Self { - data_type: if O::is_large() { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }, + data_type, offsets, values, validity, @@ -76,10 +81,20 @@ impl Utf8Array { } } + /// Returns the default [`DataType`], `DataType::Utf8` or `DataType::LargeUtf8` + pub fn default_data_type() -> DataType { + if O::is_large() { + DataType::LargeUtf8 + } else { + DataType::Utf8 + } + } + /// The same as [`Utf8Array::from_data`] but does not check for utf8. /// # Safety /// `values` buffer must contain valid utf8 between every `offset` pub unsafe fn from_data_unchecked( + data_type: DataType, offsets: Buffer, values: Buffer, validity: Option, @@ -87,11 +102,7 @@ impl Utf8Array { check_offsets(&offsets, values.len()); Self { - data_type: if O::is_large() { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }, + data_type, offsets, values, validity, diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 61aa037aac5..fc7eb61c898 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -17,6 +17,7 @@ use super::Utf8Array; /// The mutable version of [`Utf8Array`]. See [`MutableArray`] for more details. 
#[derive(Debug)] pub struct MutableUtf8Array { + data_type: DataType, offsets: MutableBuffer, values: MutableBuffer, validity: Option, @@ -25,6 +26,7 @@ pub struct MutableUtf8Array { impl From> for Utf8Array { fn from(other: MutableUtf8Array) -> Self { Utf8Array::::from_data( + other.data_type, other.offsets.into(), other.values.into(), other.validity.map(|x| x.into()), @@ -44,6 +46,11 @@ impl MutableUtf8Array { let mut offsets = MutableBuffer::::new(); offsets.push(O::default()); Self { + data_type: if O::is_large() { + DataType::LargeUtf8 + } else { + DataType::Utf8 + }, offsets, values: MutableBuffer::::new(), validity: None, @@ -57,6 +64,7 @@ impl MutableUtf8Array { /// * The `values` between `offsets` are not utf8 encoded /// * The validity is not `None` and its length is different from `offsets`'s length minus one. pub fn from_data( + data_type: DataType, offsets: MutableBuffer, values: MutableBuffer, validity: Option, @@ -65,7 +73,13 @@ impl MutableUtf8Array { if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } + if O::is_large() { + assert_eq!(data_type, DataType::LargeUtf8) + } else { + assert_eq!(data_type, DataType::Utf8) + } Self { + data_type, offsets, values, validity, @@ -80,6 +94,7 @@ impl MutableUtf8Array { /// * The `offsets` and `values` are inconsistent /// * The validity is not `None` and its length is different from `offsets`'s length minus one. 
pub unsafe fn from_data_unchecked( + data_type: DataType, offsets: MutableBuffer, values: MutableBuffer, validity: Option, @@ -88,13 +103,27 @@ impl MutableUtf8Array { if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } + if O::is_large() { + assert_eq!(data_type, DataType::LargeUtf8) + } else { + assert_eq!(data_type, DataType::Utf8) + } Self { + data_type, offsets, values, validity, } } + fn default_data_type() -> DataType { + if O::is_large() { + DataType::LargeUtf8 + } else { + DataType::Utf8 + } + } + /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots. pub fn with_capacity(capacity: usize) -> Self { Self::with_capacities(capacity, 0) @@ -106,6 +135,7 @@ impl MutableUtf8Array { offsets.push(O::default()); Self { + data_type: Self::default_data_type(), offsets, values: MutableBuffer::::with_capacity(values), validity: None, @@ -171,6 +201,7 @@ impl MutableArray for MutableUtf8Array { fn as_arc(&mut self) -> Arc { Arc::new(Utf8Array::from_data( + Self::default_data_type(), std::mem::take(&mut self.offsets).into(), std::mem::take(&mut self.values).into(), std::mem::take(&mut self.validity).map(|x| x.into()), @@ -218,7 +249,7 @@ impl MutableUtf8Array { let (validity, offsets, values) = trusted_len_unzip(iterator); // soundness: P is `str` - Self::from_data_unchecked(offsets, values, validity) + Self::from_data_unchecked(Self::default_data_type(), offsets, values, validity) } /// Creates a [`MutableUtf8Array`] from an iterator of trusted length. @@ -240,7 +271,7 @@ impl MutableUtf8Array { // soundness: I is `TrustedLen` let (offsets, values) = unsafe { trusted_len_values_iter(iterator) }; // soundness: T is AsRef - unsafe { Self::from_data_unchecked(offsets, values, None) } + unsafe { Self::from_data_unchecked(Self::default_data_type(), offsets, values, None) } } /// Creates a new [`MutableUtf8Array`] from an iterator. 
@@ -275,7 +306,12 @@ impl MutableUtf8Array { let (validity, offsets, values) = try_trusted_len_unzip(iterator)?; // soundness: P is `str` - Ok(Self::from_data_unchecked(offsets, values, validity)) + Ok(Self::from_data_unchecked( + Self::default_data_type(), + offsets, + values, + validity, + )) } /// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length. @@ -293,7 +329,7 @@ impl MutableUtf8Array { pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { let (offsets, values) = values_iter(iterator); // soundness: T: AsRef - unsafe { Self::from_data_unchecked(offsets, values, None) } + unsafe { Self::from_data_unchecked(Self::default_data_type(), offsets, values, None) } } } diff --git a/src/compute/cast/utf8_to.rs b/src/compute/cast/utf8_to.rs index 5a46dd46360..e267199a015 100644 --- a/src/compute/cast/utf8_to.rs +++ b/src/compute/cast/utf8_to.rs @@ -107,18 +107,24 @@ pub fn utf8_to_timestamp_ns(from: &Utf8Array) -> PrimitiveArray) -> Utf8Array { + let data_type = Utf8Array::::default_data_type(); let values = from.values().clone(); let offsets = from.offsets().iter().map(|x| *x as i64); let offsets = Buffer::from_trusted_len_iter(offsets); - unsafe { Utf8Array::::from_data_unchecked(offsets, values, from.validity().clone()) } + unsafe { + Utf8Array::::from_data_unchecked(data_type, offsets, values, from.validity().clone()) + } } pub fn utf8_large_to_utf8(from: &Utf8Array) -> Result> { + let data_type = Utf8Array::::default_data_type(); let values = from.values().clone(); let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(ArrowError::from_external_error)?; let offsets = from.offsets().iter().map(|x| *x as i32); let offsets = Buffer::from_trusted_len_iter(offsets); - Ok(unsafe { Utf8Array::::from_data_unchecked(offsets, values, from.validity().clone()) }) + Ok(unsafe { + Utf8Array::::from_data_unchecked(data_type, offsets, values, from.validity().clone()) + }) } diff --git a/src/compute/substring.rs b/src/compute/substring.rs 
index 2e4fd209b6c..c82029b747f 100644 --- a/src/compute/substring.rs +++ b/src/compute/substring.rs @@ -60,7 +60,12 @@ fn utf8_substring(array: &Utf8Array, start: O, length: &Option) new_values.extend_from_slice(&values[start..start + length]); }); - Utf8Array::::from_data(new_offsets.into(), new_values.into(), validity.clone()) + Utf8Array::::from_data( + array.data_type().clone(), + new_offsets.into(), + new_values.into(), + validity.clone(), + ) } /// Returns an ArrayRef with a substring starting from `start` and with optional length `length` of each of the elements in `array`. diff --git a/src/compute/take/utf8.rs b/src/compute/take/utf8.rs index 9d95b86623f..99fc091f491 100644 --- a/src/compute/take/utf8.rs +++ b/src/compute/take/utf8.rs @@ -25,6 +25,7 @@ pub fn take( values: &Utf8Array, indices: &PrimitiveArray, ) -> Utf8Array { + let data_type = values.data_type().clone(); let indices_has_validity = indices.null_count() > 0; let values_has_validity = values.null_count() > 0; @@ -36,7 +37,7 @@ pub fn take( (false, true) => take_indices_validity(values.offsets(), values.values(), indices), (true, true) => take_values_indices_validity(values, indices), }; - unsafe { Utf8Array::::from_data_unchecked(offsets, values, validity) } + unsafe { Utf8Array::::from_data_unchecked(data_type, offsets, values, validity) } } #[cfg(test)] diff --git a/src/io/ipc/read/array/utf8.rs b/src/io/ipc/read/array/utf8.rs index 17ffed8da86..788c4c16cdc 100644 --- a/src/io/ipc/read/array/utf8.rs +++ b/src/io/ipc/read/array/utf8.rs @@ -4,6 +4,7 @@ use std::io::{Read, Seek}; use crate::array::{Offset, Utf8Array}; use crate::buffer::Buffer; +use crate::datatypes::DataType; use crate::error::Result; use crate::io::ipc::gen::Message::BodyCompression; use crate::types::NativeType; @@ -14,6 +15,7 @@ use super::super::read_basic::*; pub fn read_utf8( field_nodes: &mut VecDeque, + data_type: DataType, buffers: &mut VecDeque<&gen::Schema::Buffer>, reader: &mut R, block_offset: u64, @@ -55,7 +57,9 
@@ where compression, )?; - Ok(Utf8Array::::from_data(offsets, values, validity)) + Ok(Utf8Array::::from_data( + data_type, offsets, values, validity, + )) } pub fn skip_utf8(field_nodes: &mut VecDeque, buffers: &mut VecDeque<&gen::Schema::Buffer>) { diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index 9866aaf2687..dbd62bedee7 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -205,6 +205,7 @@ pub fn read( DataType::Utf8 => { let array = read_utf8::( field_nodes, + data_type, buffers, reader, block_offset, @@ -216,6 +217,7 @@ pub fn read( DataType::LargeUtf8 => { let array = read_utf8::( field_nodes, + data_type, buffers, reader, block_offset, diff --git a/src/io/json_integration/read.rs b/src/io/json_integration/read.rs index 66901eae1fe..3ec75626134 100644 --- a/src/io/json_integration/read.rs +++ b/src/io/json_integration/read.rs @@ -158,7 +158,7 @@ fn to_binary(json_col: &ArrowJsonColumn) -> Arc { Arc::new(BinaryArray::from_data(offsets, values, validity)) } -fn to_utf8(json_col: &ArrowJsonColumn) -> Arc { +fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc { let validity = to_validity(&json_col.validity); let offsets = to_offsets::(json_col.offset.as_ref()); let values = json_col @@ -169,7 +169,7 @@ fn to_utf8(json_col: &ArrowJsonColumn) -> Arc { .map(|value| value.as_str().unwrap().as_bytes().to_vec()) .flatten() .collect(); - Arc::new(Utf8Array::from_data(offsets, values, validity)) + Arc::new(Utf8Array::from_data(data_type, offsets, values, validity)) } fn to_list( @@ -257,8 +257,8 @@ pub fn to_array( DataType::Float64 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), DataType::Binary => Ok(to_binary::(json_col)), DataType::LargeBinary => Ok(to_binary::(json_col)), - DataType::Utf8 => Ok(to_utf8::(json_col)), - DataType::LargeUtf8 => Ok(to_utf8::(json_col)), + DataType::Utf8 => Ok(to_utf8::(json_col, data_type.clone())), + DataType::LargeUtf8 => Ok(to_utf8::(json_col, 
data_type.clone())), DataType::FixedSizeBinary(_) => { let validity = to_validity(&json_col.validity); diff --git a/src/io/parquet/read/binary/basic.rs b/src/io/parquet/read/binary/basic.rs index 5171cd87dd0..e27c8072a48 100644 --- a/src/io/parquet/read/binary/basic.rs +++ b/src/io/parquet/read/binary/basic.rs @@ -288,6 +288,7 @@ where validity.into(), )), DataType::LargeUtf8 | DataType::Utf8 => Box::new(Utf8Array::from_data( + data_type.clone(), offsets.into(), values.into(), validity.into(), @@ -332,6 +333,7 @@ where validity.into(), )), DataType::LargeUtf8 | DataType::Utf8 => Box::new(Utf8Array::from_data( + data_type.clone(), offsets.into(), values.into(), validity.into(), diff --git a/src/io/parquet/read/binary/dictionary.rs b/src/io/parquet/read/binary/dictionary.rs index 011a19dd642..fc74810795a 100644 --- a/src/io/parquet/read/binary/dictionary.rs +++ b/src/io/parquet/read/binary/dictionary.rs @@ -12,6 +12,7 @@ use crate::{ array::{Array, DictionaryArray, DictionaryKey, Offset, PrimitiveArray, Utf8Array}, bitmap::{utils::BitmapIter, MutableBitmap}, buffer::MutableBuffer, + datatypes::DataType, error::{ArrowError, Result}, }; @@ -124,6 +125,7 @@ where pub fn iter_to_array( mut iter: I, metadata: &ColumnChunkMetaData, + data_type: DataType, ) -> Result> where ArrowError: From, @@ -149,6 +151,11 @@ where } let keys = PrimitiveArray::from_data(K::DATA_TYPE, indices.into(), validity.into()); - let values = Arc::new(Utf8Array::from_data(offsets.into(), values.into(), None)); + let values = Arc::new(Utf8Array::from_data( + data_type, + offsets.into(), + values.into(), + None, + )); Ok(Box::new(DictionaryArray::::from_data(keys, values))) } diff --git a/src/io/parquet/read/binary/nested.rs b/src/io/parquet/read/binary/nested.rs index 139e66ae156..4e02acd1afd 100644 --- a/src/io/parquet/read/binary/nested.rs +++ b/src/io/parquet/read/binary/nested.rs @@ -194,6 +194,7 @@ where validity.into(), )) as Arc, DataType::LargeUtf8 | DataType::Utf8 => 
Arc::new(Utf8Array::from_data( + inner_data_type.clone(), offsets.into(), values.into(), validity.into(), diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs index 1ade626d39b..212677e0f52 100644 --- a/src/io/parquet/read/mod.rs +++ b/src/io/parquet/read/mod.rs @@ -152,8 +152,8 @@ fn dict_read< Int64 | Date64 | Time64(_) | Duration(_) | Timestamp(_, _) => { primitive::iter_to_dict_array::(iter, metadata, data_type, |x: i64| x) } - Utf8 => binary::iter_to_dict_array::(iter, metadata), - LargeUtf8 => binary::iter_to_dict_array::(iter, metadata), + Utf8 => binary::iter_to_dict_array::(iter, metadata, data_type), + LargeUtf8 => binary::iter_to_dict_array::(iter, metadata, data_type), other => Err(ArrowError::NotYetImplemented(format!( "Reading dictionaries of type {:?}", other diff --git a/tests/it/array/utf8/mod.rs b/tests/it/array/utf8/mod.rs index c9cc823a740..49c993d5751 100644 --- a/tests/it/array/utf8/mod.rs +++ b/tests/it/array/utf8/mod.rs @@ -1,4 +1,4 @@ -use arrow2::{array::*, bitmap::Bitmap, buffer::Buffer, error::Result}; +use arrow2::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result}; mod mutable; @@ -23,6 +23,7 @@ fn basics() { assert!(array.is_valid(2)); let array2 = Utf8Array::::from_data( + DataType::Utf8, array.offsets().clone(), array.values().clone(), array.validity().clone(), @@ -39,7 +40,7 @@ fn basics() { #[test] fn empty() { - let array = Utf8Array::::new_empty(); + let array = Utf8Array::::new_empty(DataType::Utf8); assert_eq!(array.values().as_slice(), b""); assert_eq!(array.offsets().as_slice(), &[0]); assert_eq!(array.validity(), &None); @@ -59,7 +60,10 @@ fn from_slice() { let offsets = Buffer::from(&[0, 1, 2, 4]); let values = Buffer::from("abcc".as_bytes()); - assert_eq!(b, Utf8Array::::from_data(offsets, values, None)); + assert_eq!( + b, + Utf8Array::::from_data(DataType::Utf8, offsets, values, None) + ); } #[test] @@ -68,7 +72,10 @@ fn from_iter_values() { let offsets = Buffer::from(&[0, 1, 
2, 4]); let values = Buffer::from("abcc".as_bytes()); - assert_eq!(b, Utf8Array::::from_data(offsets, values, None)); + assert_eq!( + b, + Utf8Array::::from_data(DataType::Utf8, offsets, values, None) + ); } #[test] @@ -78,7 +85,10 @@ fn from_trusted_len_iter() { let offsets = Buffer::from(&[0, 1, 2, 4]); let values = Buffer::from("abcc".as_bytes()); - assert_eq!(b, Utf8Array::::from_data(offsets, values, None)); + assert_eq!( + b, + Utf8Array::::from_data(DataType::Utf8, offsets, values, None) + ); } #[test] @@ -92,5 +102,8 @@ fn try_from_trusted_len_iter() { let offsets = Buffer::from(&[0, 1, 2, 4]); let values = Buffer::from("abcc".as_bytes()); - assert_eq!(b, Utf8Array::::from_data(offsets, values, None)); + assert_eq!( + b, + Utf8Array::::from_data(DataType::Utf8, offsets, values, None) + ); } From 49f4980048b677b26d44dd66df8422f64f458ebc Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 30 Aug 2021 10:50:42 +0000 Subject: [PATCH 04/12] Migrated Null --- src/array/growable/mod.rs | 2 +- src/array/growable/null.rs | 21 ++++++++++++++------- src/array/mod.rs | 4 ++-- src/array/null.rs | 12 ++++++------ src/compute/take/mod.rs | 5 ++++- src/io/ipc/read/array/null.rs | 9 ++++++--- src/io/ipc/read/deserialize.rs | 2 +- src/io/json/read/deserialize.rs | 2 +- src/io/json_integration/read.rs | 5 ++++- tests/it/array/growable/null.rs | 13 +++++++++---- 10 files changed, 48 insertions(+), 27 deletions(-) diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index ecc65035692..25993dfdb4a 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -97,7 +97,7 @@ pub fn make_growable<'a>( assert!(arrays.iter().all(|&item| item.data_type() == data_type)); match data_type { - DataType::Null => Box::new(null::GrowableNull::new()), + DataType::Null => Box::new(null::GrowableNull::new(data_type.clone())), DataType::Boolean => { let arrays = arrays .iter() diff --git a/src/array/growable/null.rs b/src/array/growable/null.rs index 
74c4498a897..725968521fe 100644 --- a/src/array/growable/null.rs +++ b/src/array/growable/null.rs @@ -1,23 +1,30 @@ use std::sync::Arc; -use crate::array::{Array, NullArray}; +use crate::{ + array::{Array, NullArray}, + datatypes::DataType, +}; use super::Growable; /// Concrete [`Growable`] for the [`NullArray`]. pub struct GrowableNull { + data_type: DataType, length: usize, } impl Default for GrowableNull { fn default() -> Self { - Self { length: 0 } + Self::new(DataType::Null) } } impl GrowableNull { - pub fn new() -> Self { - Self::default() + pub fn new(data_type: DataType) -> Self { + Self { + data_type, + length: 0, + } } } @@ -31,16 +38,16 @@ impl<'a> Growable<'a> for GrowableNull { } fn as_arc(&mut self) -> Arc { - Arc::new(NullArray::from_data(self.length)) + Arc::new(NullArray::from_data(self.data_type.clone(), self.length)) } fn as_box(&mut self) -> Box { - Box::new(NullArray::from_data(self.length)) + Box::new(NullArray::from_data(self.data_type.clone(), self.length)) } } impl From for NullArray { fn from(val: GrowableNull) -> Self { - NullArray::from_data(val.length) + NullArray::from_data(val.data_type, val.length) } } diff --git a/src/array/mod.rs b/src/array/mod.rs index a9110351414..7266f0384f0 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -215,7 +215,7 @@ impl Display for dyn Array { /// Creates a new [`Array`] with a [`Array::len`] of 0. pub fn new_empty_array(data_type: DataType) -> Box { match data_type { - DataType::Null => Box::new(NullArray::new_empty()), + DataType::Null => Box::new(NullArray::new_empty(data_type)), DataType::Boolean => Box::new(BooleanArray::new_empty()), DataType::Int8 => Box::new(PrimitiveArray::::new_empty(data_type)), DataType::Int16 => Box::new(PrimitiveArray::::new_empty(data_type)), @@ -264,7 +264,7 @@ pub fn new_empty_array(data_type: DataType) -> Box { /// for all types except Union, which does not have a validity. 
pub fn new_null_array(data_type: DataType, length: usize) -> Box { match data_type { - DataType::Null => Box::new(NullArray::new_null(length)), + DataType::Null => Box::new(NullArray::new_null(data_type, length)), DataType::Boolean => Box::new(BooleanArray::new_null(length)), DataType::Int8 => Box::new(PrimitiveArray::::new_null(data_type, length)), DataType::Int16 => Box::new(PrimitiveArray::::new_null(data_type, length)), diff --git a/src/array/null.rs b/src/array/null.rs index 4587e6bb174..dcddf92e512 100644 --- a/src/array/null.rs +++ b/src/array/null.rs @@ -12,19 +12,19 @@ pub struct NullArray { impl NullArray { /// Returns a new empty [`NullArray`]. - pub fn new_empty() -> Self { - Self::from_data(0) + pub fn new_empty(data_type: DataType) -> Self { + Self::from_data(data_type, 0) } /// Returns a new [`NullArray`]. - pub fn new_null(length: usize) -> Self { - Self::from_data(length) + pub fn new_null(data_type: DataType, length: usize) -> Self { + Self::from_data(data_type, length) } /// Returns a new [`NullArray`]. 
- pub fn from_data(length: usize) -> Self { + pub fn from_data(data_type: DataType, length: usize) -> Self { Self { - data_type: DataType::Null, + data_type, length, offset: 0, } diff --git a/src/compute/take/mod.rs b/src/compute/take/mod.rs index ebbb172bb5a..ada94c38fe5 100644 --- a/src/compute/take/mod.rs +++ b/src/compute/take/mod.rs @@ -59,7 +59,10 @@ pub fn take(values: &dyn Array, indices: &PrimitiveArray) -> Result } match values.data_type() { - DataType::Null => Ok(Box::new(NullArray::from_data(indices.len()))), + DataType::Null => Ok(Box::new(NullArray::from_data( + values.data_type().clone(), + indices.len(), + ))), DataType::Boolean => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(boolean::take::(values, indices))) diff --git a/src/io/ipc/read/array/null.rs b/src/io/ipc/read/array/null.rs index 7d937cc6494..d1cdcb0e499 100644 --- a/src/io/ipc/read/array/null.rs +++ b/src/io/ipc/read/array/null.rs @@ -1,11 +1,14 @@ use std::collections::VecDeque; -use crate::array::NullArray; +use crate::{array::NullArray, datatypes::DataType}; use super::super::deserialize::Node; -pub fn read_null(field_nodes: &mut VecDeque) -> NullArray { - NullArray::from_data(field_nodes.pop_front().unwrap().0.length() as usize) +pub fn read_null(field_nodes: &mut VecDeque, data_type: DataType) -> NullArray { + NullArray::from_data( + data_type, + field_nodes.pop_front().unwrap().0.length() as usize, + ) } pub fn skip_null(field_nodes: &mut VecDeque) { diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index dbd62bedee7..d209c726eaa 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -33,7 +33,7 @@ pub fn read( ) -> Result> { match data_type { DataType::Null => { - let array = read_null(field_nodes); + let array = read_null(field_nodes, data_type); Ok(Arc::new(array)) } DataType::Boolean => { diff --git a/src/io/json/read/deserialize.rs b/src/io/json/read/deserialize.rs index 92e30c3a53e..8ac22d6dca5 
100644 --- a/src/io/json/read/deserialize.rs +++ b/src/io/json/read/deserialize.rs @@ -224,7 +224,7 @@ fn read_dictionary(rows: &[&Value], data_type: DataType) -> Di pub fn read(rows: &[&Value], data_type: DataType) -> Arc { match &data_type { - DataType::Null => Arc::new(NullArray::from_data(rows.len())), + DataType::Null => Arc::new(NullArray::from_data(data_type, rows.len())), DataType::Boolean => Arc::new(read_boolean(rows)), DataType::Int8 => Arc::new(read_int::(rows, data_type)), DataType::Int16 => Arc::new(read_int::(rows, data_type)), diff --git a/src/io/json_integration/read.rs b/src/io/json_integration/read.rs index 3ec75626134..00a735d276e 100644 --- a/src/io/json_integration/read.rs +++ b/src/io/json_integration/read.rs @@ -220,7 +220,10 @@ pub fn to_array( ) -> Result> { let data_type = field.data_type(); match data_type { - DataType::Null => Ok(Arc::new(NullArray::from_data(json_col.count))), + DataType::Null => Ok(Arc::new(NullArray::from_data( + data_type.clone(), + json_col.count, + ))), DataType::Boolean => { let validity = to_validity(&json_col.validity); let values = json_col diff --git a/tests/it/array/growable/null.rs b/tests/it/array/growable/null.rs index 934f6a24c93..47e25ed3b9b 100644 --- a/tests/it/array/growable/null.rs +++ b/tests/it/array/growable/null.rs @@ -1,15 +1,20 @@ -use arrow2::array::growable::{Growable, GrowableNull}; -use arrow2::array::*; +use arrow2::{ + array::{ + growable::{Growable, GrowableNull}, + NullArray, + }, + datatypes::DataType, +}; #[test] fn null() { - let mut mutable = GrowableNull::new(); + let mut mutable = GrowableNull::default(); mutable.extend(0, 1, 2); mutable.extend(1, 0, 1); let result: NullArray = mutable.into(); - let expected = NullArray::from_data(3); + let expected = NullArray::from_data(DataType::Null, 3); assert_eq!(result, expected); } From 238508a23600c89020e705183432cf299bfc812b Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Mon, 30 Aug 2021 11:08:58 +0000 Subject: [PATCH 05/12] Migrated Boolean --- src/array/boolean/ffi.rs | 2 +- src/array/boolean/mod.rs | 18 +++++++------- src/array/boolean/mutable.rs | 35 +++++++++++++++++++++------ src/array/growable/boolean.rs | 8 ++++-- src/array/mod.rs | 4 +-- src/compute/aggregate/min_max.rs | 2 +- src/compute/boolean.rs | 9 ++++--- src/compute/boolean_kleene.rs | 13 ++++++++-- src/compute/cast/mod.rs | 20 +++++++-------- src/compute/cast/primitive_to.rs | 14 ++++++++--- src/compute/comparison/boolean.rs | 11 ++++++--- src/compute/comparison/primitive.rs | 15 +++++++++--- src/compute/comparison/utf8.rs | 7 +++--- src/compute/contains.rs | 4 +-- src/compute/like.rs | 9 +++++-- src/compute/regex_match.rs | 7 +++++- src/compute/take/boolean.rs | 3 ++- src/compute/utils.rs | 3 ++- src/io/ipc/read/array/boolean.rs | 4 ++- src/io/ipc/read/deserialize.rs | 13 +++++++--- src/io/json_integration/read.rs | 6 ++++- src/io/parquet/read/boolean/basic.rs | 13 ++++++++-- src/io/parquet/read/boolean/nested.rs | 17 ++++++++++++- 23 files changed, 169 insertions(+), 68 deletions(-) diff --git a/src/array/boolean/ffi.rs b/src/array/boolean/ffi.rs index 9dedfbc41de..745c27e54c0 100644 --- a/src/array/boolean/ffi.rs +++ b/src/array/boolean/ffi.rs @@ -34,6 +34,6 @@ unsafe impl FromFfi for BooleanArray { values = values.slice(offset, length); validity = validity.map(|x| x.slice(offset, length)) } - Ok(Self::from_data(values, validity)) + Ok(Self::from_data(data_type, values, validity)) } } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 3679a361ee9..d9aecc80acc 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -14,6 +14,7 @@ pub use mutable::*; /// Cloning and slicing this struct is `O(1)`. 
#[derive(Debug, Clone)] pub struct BooleanArray { + data_type: DataType, values: Bitmap, validity: Option, offset: usize, @@ -21,28 +22,26 @@ pub struct BooleanArray { impl BooleanArray { /// Returns a new empty [`BooleanArray`]. - #[inline] - pub fn new_empty() -> Self { - Self::from_data(Bitmap::new(), None) + pub fn new_empty(data_type: DataType) -> Self { + Self::from_data(data_type, Bitmap::new(), None) } /// Returns a new [`BooleanArray`] whose all slots are null / `None`. - #[inline] - pub fn new_null(length: usize) -> Self { + pub fn new_null(data_type: DataType, length: usize) -> Self { let bitmap = Bitmap::new_zeroed(length); - Self::from_data(bitmap.clone(), Some(bitmap)) + Self::from_data(data_type, bitmap.clone(), Some(bitmap)) } /// The canonical method to create a [`BooleanArray`] out of low-end APIs. /// # Panics /// This function panics iff: /// * The validity is not `None` and its length is different from `values`'s length - #[inline] - pub fn from_data(values: Bitmap, validity: Option) -> Self { + pub fn from_data(data_type: DataType, values: Bitmap, validity: Option) -> Self { if let Some(ref validity) = validity { assert_eq!(values.len(), validity.len()); } Self { + data_type, values, validity, offset: 0, @@ -58,6 +57,7 @@ impl BooleanArray { pub fn slice(&self, offset: usize, length: usize) -> Self { let validity = self.validity.clone().map(|x| x.slice(offset, length)); Self { + data_type: self.data_type.clone(), values: self.values.clone().slice(offset, length), validity, offset: self.offset + offset, @@ -100,7 +100,7 @@ impl Array for BooleanArray { #[inline] fn data_type(&self) -> &DataType { - &DataType::Boolean + &self.data_type } #[inline] diff --git a/src/array/boolean/mutable.rs b/src/array/boolean/mutable.rs index c0ccf5938e3..4be152a3515 100644 --- a/src/array/boolean/mutable.rs +++ b/src/array/boolean/mutable.rs @@ -17,13 +17,18 @@ use super::BooleanArray; /// This struct does not allocate a validity until one is required (i.e. 
push a null to it). #[derive(Debug)] pub struct MutableBooleanArray { + data_type: DataType, values: MutableBitmap, validity: Option, } impl From for BooleanArray { fn from(other: MutableBooleanArray) -> Self { - BooleanArray::from_data(other.values.into(), other.validity.map(|x| x.into())) + BooleanArray::from_data( + other.data_type, + other.values.into(), + other.validity.map(|x| x.into()), + ) } } @@ -49,6 +54,7 @@ impl MutableBooleanArray { /// Creates an new [`MutableBooleanArray`] with a capacity of values. pub fn with_capacity(capacity: usize) -> Self { Self { + data_type: DataType::Boolean, values: MutableBitmap::with_capacity(capacity), validity: None, } @@ -63,8 +69,16 @@ impl MutableBooleanArray { } /// Canonical method to create a new [`MutableBooleanArray`]. - pub fn from_data(values: MutableBitmap, validity: Option) -> Self { - Self { values, validity } + pub fn from_data( + data_type: DataType, + values: MutableBitmap, + validity: Option, + ) -> Self { + Self { + data_type, + values, + validity, + } } /// Pushes a new entry to [`MutableBooleanArray`]. @@ -137,7 +151,11 @@ impl MutableBooleanArray { /// Creates a new [`MutableBooleanArray`] from an [`TrustedLen`] of `bool`. #[inline] pub fn from_trusted_len_values_iter>(iterator: I) -> Self { - Self::from_data(MutableBitmap::from_trusted_len_iter(iterator), None) + Self::from_data( + DataType::Boolean, + MutableBitmap::from_trusted_len_iter(iterator), + None, + ) } /// Creates a new [`MutableBooleanArray`] from a slice of `bool`. @@ -166,7 +184,7 @@ impl MutableBooleanArray { None }; - Self::from_data(values, validity) + Self::from_data(DataType::Boolean, values, validity) } /// Creates a [`BooleanArray`] from a [`TrustedLen`]. @@ -199,7 +217,7 @@ impl MutableBooleanArray { None }; - Ok(Self::from_data(values, validity)) + Ok(Self::from_data(DataType::Boolean, values, validity)) } /// Creates a [`BooleanArray`] from a [`TrustedLen`]. 
@@ -304,7 +322,7 @@ impl>> FromIterator for MutableBoolea }) .collect(); - MutableBooleanArray::from_data(values, validity.into()) + MutableBooleanArray::from_data(DataType::Boolean, values, validity.into()) } } @@ -319,13 +337,14 @@ impl MutableArray for MutableBooleanArray { fn as_arc(&mut self) -> Arc { Arc::new(BooleanArray::from_data( + self.data_type.clone(), std::mem::take(&mut self.values).into(), std::mem::take(&mut self.validity).map(|x| x.into()), )) } fn data_type(&self) -> &DataType { - &DataType::Boolean + &self.data_type } fn as_any(&self) -> &dyn std::any::Any { diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index 04924652835..b3b858fa296 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use crate::{ array::{Array, BooleanArray}, bitmap::MutableBitmap, + datatypes::DataType, }; use super::{ @@ -13,6 +14,7 @@ use super::{ /// Concrete [`Growable`] for the [`BooleanArray`]. pub struct GrowableBoolean<'a> { arrays: Vec<&'a BooleanArray>, + data_type: DataType, validity: MutableBitmap, values: MutableBitmap, // function used to extend nulls from arrays. This function's lifetime is bound to the array @@ -24,6 +26,7 @@ impl<'a> GrowableBoolean<'a> { pub fn new(arrays: Vec<&'a BooleanArray>, mut use_validity: bool, capacity: usize) -> Self { // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. 
+ let data_type = arrays[0].data_type().clone(); if !use_validity & arrays.iter().any(|array| array.null_count() > 0) { use_validity = true; }; @@ -35,6 +38,7 @@ impl<'a> GrowableBoolean<'a> { Self { arrays, + data_type, values: MutableBitmap::with_capacity(capacity), validity: MutableBitmap::with_capacity(capacity), extend_null_bits, @@ -45,7 +49,7 @@ impl<'a> GrowableBoolean<'a> { let validity = std::mem::take(&mut self.validity); let values = std::mem::take(&mut self.values); - BooleanArray::from_data(values.into(), validity.into()) + BooleanArray::from_data(self.data_type.clone(), values.into(), validity.into()) } } @@ -76,6 +80,6 @@ impl<'a> Growable<'a> for GrowableBoolean<'a> { impl<'a> From> for BooleanArray { fn from(val: GrowableBoolean<'a>) -> Self { - BooleanArray::from_data(val.values.into(), val.validity.into()) + BooleanArray::from_data(val.data_type, val.values.into(), val.validity.into()) } } diff --git a/src/array/mod.rs b/src/array/mod.rs index 7266f0384f0..8dd3250e212 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -216,7 +216,7 @@ impl Display for dyn Array { pub fn new_empty_array(data_type: DataType) -> Box { match data_type { DataType::Null => Box::new(NullArray::new_empty(data_type)), - DataType::Boolean => Box::new(BooleanArray::new_empty()), + DataType::Boolean => Box::new(BooleanArray::new_empty(data_type)), DataType::Int8 => Box::new(PrimitiveArray::::new_empty(data_type)), DataType::Int16 => Box::new(PrimitiveArray::::new_empty(data_type)), DataType::Int32 @@ -265,7 +265,7 @@ pub fn new_empty_array(data_type: DataType) -> Box { pub fn new_null_array(data_type: DataType, length: usize) -> Box { match data_type { DataType::Null => Box::new(NullArray::new_null(data_type, length)), - DataType::Boolean => Box::new(BooleanArray::new_null(length)), + DataType::Boolean => Box::new(BooleanArray::new_null(data_type, length)), DataType::Int8 => Box::new(PrimitiveArray::::new_null(data_type, length)), DataType::Int16 => 
Box::new(PrimitiveArray::::new_null(data_type, length)), DataType::Int32 diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index 368f2cc6eb6..858a10e6eed 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -486,7 +486,7 @@ mod tests { #[test] fn test_boolean_min_max_empty() { - let a = BooleanArray::new_empty(); + let a = BooleanArray::new_empty(DataType::Boolean); assert_eq!(None, min_boolean(&a)); assert_eq!(None, max_boolean(&a)); } diff --git a/src/compute/boolean.rs b/src/compute/boolean.rs index 9fe1d0aac6f..03a55f48f27 100644 --- a/src/compute/boolean.rs +++ b/src/compute/boolean.rs @@ -17,6 +17,7 @@ use crate::array::{Array, BooleanArray}; use crate::bitmap::{Bitmap, MutableBitmap}; +use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use super::utils::combine_validities; @@ -39,7 +40,7 @@ where let values = op(left_buffer, right_buffer); - Ok(BooleanArray::from_data(values, validity)) + Ok(BooleanArray::from_data(DataType::Boolean, values, validity)) } /// Performs `AND` operation on two arrays. If either left or right value is null then the @@ -99,7 +100,7 @@ pub fn or(lhs: &BooleanArray, rhs: &BooleanArray) -> Result { pub fn not(array: &BooleanArray) -> BooleanArray { let values = !array.values(); let validity = array.validity().clone(); - BooleanArray::from_data(values, validity) + BooleanArray::from_data(DataType::Boolean, values, validity) } /// Returns a non-null [BooleanArray] with whether each value of the array is null. @@ -123,7 +124,7 @@ pub fn is_null(input: &dyn Array) -> BooleanArray { Some(buffer) => !buffer, }; - BooleanArray::from_data(values, None) + BooleanArray::from_data(DataType::Boolean, values, None) } /// Returns a non-null [BooleanArray] with whether each value of the array is not null. 
@@ -142,7 +143,7 @@ pub fn is_not_null(input: &dyn Array) -> BooleanArray { None => Bitmap::from_trusted_len_iter(std::iter::repeat(true).take(input.len())), Some(buffer) => buffer.clone(), }; - BooleanArray::from_data(values, None) + BooleanArray::from_data(DataType::Boolean, values, None) } #[cfg(test)] diff --git a/src/compute/boolean_kleene.rs b/src/compute/boolean_kleene.rs index 1cdd2b41c7e..4e31f449a55 100644 --- a/src/compute/boolean_kleene.rs +++ b/src/compute/boolean_kleene.rs @@ -1,3 +1,4 @@ +use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::{ array::{Array, BooleanArray}, @@ -87,7 +88,11 @@ pub fn or(lhs: &BooleanArray, rhs: &BooleanArray) -> Result { } (None, None) => None, }; - Ok(BooleanArray::from_data(lhs_values | rhs_values, validity)) + Ok(BooleanArray::from_data( + DataType::Boolean, + lhs_values | rhs_values, + validity, + )) } /// Logical 'and' with [Kleene logic](https://en.wikipedia.org/wiki/Three-valued_logic#Kleene_and_Priest_logics) @@ -172,7 +177,11 @@ pub fn and(lhs: &BooleanArray, rhs: &BooleanArray) -> Result { } (None, None) => None, }; - Ok(BooleanArray::from_data(lhs_values & rhs_values, validity)) + Ok(BooleanArray::from_data( + DataType::Boolean, + lhs_values & rhs_values, + validity, + )) } #[cfg(test)] diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 9dd5ef5f33d..7154265c305 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -426,16 +426,16 @@ fn cast_with_options( ))), }, (_, Boolean) => match from_type { - UInt8 => primitive_to_boolean_dyn::(array), - UInt16 => primitive_to_boolean_dyn::(array), - UInt32 => primitive_to_boolean_dyn::(array), - UInt64 => primitive_to_boolean_dyn::(array), - Int8 => primitive_to_boolean_dyn::(array), - Int16 => primitive_to_boolean_dyn::(array), - Int32 => primitive_to_boolean_dyn::(array), - Int64 => primitive_to_boolean_dyn::(array), - Float32 => primitive_to_boolean_dyn::(array), - Float64 => 
primitive_to_boolean_dyn::(array), + UInt8 => primitive_to_boolean_dyn::(array, to_type.clone()), + UInt16 => primitive_to_boolean_dyn::(array, to_type.clone()), + UInt32 => primitive_to_boolean_dyn::(array, to_type.clone()), + UInt64 => primitive_to_boolean_dyn::(array, to_type.clone()), + Int8 => primitive_to_boolean_dyn::(array, to_type.clone()), + Int16 => primitive_to_boolean_dyn::(array, to_type.clone()), + Int32 => primitive_to_boolean_dyn::(array, to_type.clone()), + Int64 => primitive_to_boolean_dyn::(array, to_type.clone()), + Float32 => primitive_to_boolean_dyn::(array, to_type.clone()), + Float64 => primitive_to_boolean_dyn::(array, to_type.clone()), _ => Err(ArrowError::NotYetImplemented(format!( "Casting from {:?} to {:?} not supported", from_type, to_type, diff --git a/src/compute/cast/primitive_to.rs b/src/compute/cast/primitive_to.rs index dcab5683300..ec8d02e850c 100644 --- a/src/compute/cast/primitive_to.rs +++ b/src/compute/cast/primitive_to.rs @@ -14,19 +14,25 @@ use super::CastOptions; /// Returns a [`BooleanArray`] where every element is different from zero. /// Validity is preserved. -pub fn primitive_to_boolean(from: &PrimitiveArray) -> BooleanArray { +pub fn primitive_to_boolean( + from: &PrimitiveArray, + to_type: DataType, +) -> BooleanArray { let iter = from.values().iter().map(|v| *v != T::default()); let values = Bitmap::from_trusted_len_iter(iter); - BooleanArray::from_data(values, from.validity().clone()) + BooleanArray::from_data(to_type, values, from.validity().clone()) } -pub(super) fn primitive_to_boolean_dyn(from: &dyn Array) -> Result> +pub(super) fn primitive_to_boolean_dyn( + from: &dyn Array, + to_type: DataType, +) -> Result> where T: NativeType, { let from = from.as_any().downcast_ref().unwrap(); - Ok(Box::new(primitive_to_boolean::(from))) + Ok(Box::new(primitive_to_boolean::(from, to_type))) } /// Returns a [`Utf8Array`] where every element is the utf8 representation of the number. 
diff --git a/src/compute/comparison/boolean.rs b/src/compute/comparison/boolean.rs index 4d0395918ec..647e3036931 100644 --- a/src/compute/comparison/boolean.rs +++ b/src/compute/comparison/boolean.rs @@ -1,6 +1,7 @@ use crate::array::*; use crate::bitmap::Bitmap; use crate::buffer::MutableBuffer; +use crate::datatypes::DataType; use crate::scalar::{BooleanScalar, Scalar}; use crate::{ bitmap::MutableBitmap, @@ -46,7 +47,11 @@ where let values = compare_values_op(lhs.values(), rhs.values(), op); - Ok(BooleanArray::from_data(values.into(), validity)) + Ok(BooleanArray::from_data( + DataType::Boolean, + values.into(), + validity, + )) } /// Evaluate `op(left, right)` for [`BooleanArray`] and scalar using @@ -67,7 +72,7 @@ where values.push(op(lhs_remainder, rhs)) }; let values = MutableBitmap::from_buffer(values, lhs.len()).into(); - BooleanArray::from_data(values, lhs.validity().clone()) + BooleanArray::from_data(DataType::Boolean, values, lhs.validity().clone()) } /// Perform `lhs == rhs` operation on two arrays. @@ -148,7 +153,7 @@ pub fn compare(lhs: &BooleanArray, rhs: &BooleanArray, op: Operator) -> Result BooleanArray { if !rhs.is_valid() { - return BooleanArray::new_null(lhs.len()); + return BooleanArray::new_null(DataType::Boolean, lhs.len()); } compare_scalar_non_null(lhs, rhs.value(), op) } diff --git a/src/compute/comparison/primitive.rs b/src/compute/comparison/primitive.rs index 65f752426f0..561868edeac 100644 --- a/src/compute/comparison/primitive.rs +++ b/src/compute/comparison/primitive.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::bitmap::Bitmap; +use crate::datatypes::DataType; use crate::scalar::{PrimitiveScalar, Scalar}; use crate::{array::*, types::NativeType}; use crate::{ @@ -72,7 +73,11 @@ where let values = compare_values_op(lhs.values(), rhs.values(), op); - Ok(BooleanArray::from_data(values.into(), validity)) + Ok(BooleanArray::from_data( + DataType::Boolean, + values.into(), + validity, + )) } /// Evaluate `op(left, right)` for [`PrimitiveArray`] and scalar using @@ -100,7 +105,11 @@ where values.push(op(lhs, rhs)) }; - BooleanArray::from_data(Bitmap::from_u8_buffer(values, lhs.len()), validity) + BooleanArray::from_data( + DataType::Boolean, + Bitmap::from_u8_buffer(values, lhs.len()), + validity, + ) } /// Perform `lhs == rhs` operation on two arrays. @@ -225,7 +234,7 @@ pub fn compare_scalar( op: Operator, ) -> BooleanArray { if !rhs.is_valid() { - return BooleanArray::new_null(lhs.len()); + return BooleanArray::new_null(DataType::Boolean, lhs.len()); } compare_scalar_non_null(lhs, rhs.value(), op) } diff --git a/src/compute/comparison/utf8.rs b/src/compute/comparison/utf8.rs index e36b89167ea..17f365ec23f 100644 --- a/src/compute/comparison/utf8.rs +++ b/src/compute/comparison/utf8.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::scalar::{Scalar, Utf8Scalar}; use crate::{array::*, bitmap::Bitmap}; @@ -42,7 +43,7 @@ where .map(|(lhs, rhs)| op(lhs, rhs)); let values = Bitmap::from_trusted_len_iter(values); - Ok(BooleanArray::from_data(values, validity)) + Ok(BooleanArray::from_data(DataType::Boolean, values, validity)) } /// Evaluate `op(lhs, rhs)` for [`PrimitiveArray`] and scalar using @@ -57,7 +58,7 @@ where let values = lhs.values_iter().map(|lhs| op(lhs, rhs)); let values = Bitmap::from_trusted_len_iter(values); - BooleanArray::from_data(values, validity) + BooleanArray::from_data(DataType::Boolean, values, validity) } /// Perform `lhs == rhs` operation on [`StringArray`] / [`LargeStringArray`]. @@ -141,7 +142,7 @@ pub fn compare_scalar( op: Operator, ) -> BooleanArray { if !rhs.is_valid() { - return BooleanArray::new_null(lhs.len()); + return BooleanArray::new_null(DataType::Boolean, lhs.len()); } compare_scalar_non_null(lhs, rhs.value(), op) } diff --git a/src/compute/contains.rs b/src/compute/contains.rs index d9452da03f6..44cc7a83639 100644 --- a/src/compute/contains.rs +++ b/src/compute/contains.rs @@ -59,7 +59,7 @@ where }); let values = Bitmap::from_trusted_len_iter(values); - Ok(BooleanArray::from_data(values, validity)) + Ok(BooleanArray::from_data(DataType::Boolean, values, validity)) } /// Checks if a [`GenericListArray`] contains a value in the [`Utf8Array`] @@ -93,7 +93,7 @@ where }); let values = Bitmap::from_trusted_len_iter(values); - Ok(BooleanArray::from_data(values, validity)) + Ok(BooleanArray::from_data(DataType::Boolean, values, validity)) } macro_rules! 
primitive { diff --git a/src/compute/like.rs b/src/compute/like.rs index 4ca328ab3e9..abda4f912e8 100644 --- a/src/compute/like.rs +++ b/src/compute/like.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use regex::Regex; +use crate::datatypes::DataType; use crate::{array::*, bitmap::Bitmap}; use crate::{ compute::utils::combine_validities, @@ -52,7 +53,7 @@ fn a_like_utf8 bool>( } }))?; - Ok(BooleanArray::from_data(values, validity)) + Ok(BooleanArray::from_data(DataType::Boolean, values, validity)) } /// Returns `lhs LIKE rhs` operation on two [`Utf8Array`]. @@ -112,7 +113,11 @@ fn a_like_utf8_scalar bool>( })?; Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(re.is_match(x)))) }; - Ok(BooleanArray::from_data(values, validity.clone())) + Ok(BooleanArray::from_data( + DataType::Boolean, + values, + validity.clone(), + )) } /// Returns `lhs LIKE rhs` operation. diff --git a/src/compute/regex_match.rs b/src/compute/regex_match.rs index 3e8d8220b67..12779067368 100644 --- a/src/compute/regex_match.rs +++ b/src/compute/regex_match.rs @@ -21,6 +21,7 @@ use regex::Regex; use super::utils::{combine_validities, unary_utf8_boolean}; use crate::array::{BooleanArray, Offset, Utf8Array}; +use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::{array::*, bitmap::Bitmap}; @@ -59,7 +60,11 @@ pub fn regex_match(values: &Utf8Array, regex: &Utf8Array) -> Re }); let new_values = Bitmap::try_from_trusted_len_iter(iterator)?; - Ok(BooleanArray::from_data(new_values, validity)) + Ok(BooleanArray::from_data( + DataType::Boolean, + new_values, + validity, + )) } /// Regex matches diff --git a/src/compute/take/boolean.rs b/src/compute/take/boolean.rs index dc1df7820db..2051b14223f 100644 --- a/src/compute/take/boolean.rs +++ b/src/compute/take/boolean.rs @@ -108,6 +108,7 @@ fn take_values_indices_validity( /// `take` implementation for boolean arrays pub fn take(values: &BooleanArray, indices: &PrimitiveArray) -> BooleanArray { + let data_type = 
values.data_type().clone(); let indices_has_validity = indices.null_count() > 0; let values_has_validity = values.null_count() > 0; @@ -118,7 +119,7 @@ pub fn take(values: &BooleanArray, indices: &PrimitiveArray) -> Boo (true, true) => take_values_indices_validity(values, indices), }; - BooleanArray::from_data(values, validity) + BooleanArray::from_data(data_type, values, validity) } #[cfg(test)] diff --git a/src/compute/utils.rs b/src/compute/utils.rs index 5683620b45e..6378d9fca37 100644 --- a/src/compute/utils.rs +++ b/src/compute/utils.rs @@ -18,6 +18,7 @@ use crate::{ array::{Array, BooleanArray, Offset, Utf8Array}, bitmap::Bitmap, + datatypes::DataType, }; pub fn combine_validities(lhs: &Option, rhs: &Option) -> Option { @@ -42,5 +43,5 @@ pub fn unary_utf8_boolean bool>( op(value.unwrap()) }); let values = Bitmap::from_trusted_len_iter(iterator); - BooleanArray::from_data(values, validity) + BooleanArray::from_data(DataType::Boolean, values, validity) } diff --git a/src/io/ipc/read/array/boolean.rs b/src/io/ipc/read/array/boolean.rs index 866b11e9022..b7bd4433692 100644 --- a/src/io/ipc/read/array/boolean.rs +++ b/src/io/ipc/read/array/boolean.rs @@ -2,6 +2,7 @@ use std::collections::VecDeque; use std::io::{Read, Seek}; use crate::array::BooleanArray; +use crate::datatypes::DataType; use crate::error::Result; use super::super::super::gen; @@ -10,6 +11,7 @@ use super::super::read_basic::*; pub fn read_boolean( field_nodes: &mut VecDeque, + data_type: DataType, buffers: &mut VecDeque<&gen::Schema::Buffer>, reader: &mut R, block_offset: u64, @@ -35,7 +37,7 @@ pub fn read_boolean( is_little_endian, None, )?; - Ok(BooleanArray::from_data(values, validity)) + Ok(BooleanArray::from_data(data_type, values, validity)) } pub fn skip_boolean( diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index d209c726eaa..76abca6f71a 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -36,10 +36,15 @@ pub fn read( let array 
= read_null(field_nodes, data_type); Ok(Arc::new(array)) } - DataType::Boolean => { - read_boolean(field_nodes, buffers, reader, block_offset, is_little_endian) - .map(|x| Arc::new(x) as Arc) - } + DataType::Boolean => read_boolean( + field_nodes, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + ) + .map(|x| Arc::new(x) as Arc), DataType::Int8 => read_primitive::( field_nodes, data_type, diff --git a/src/io/json_integration/read.rs b/src/io/json_integration/read.rs index 00a735d276e..a0396ca36c3 100644 --- a/src/io/json_integration/read.rs +++ b/src/io/json_integration/read.rs @@ -233,7 +233,11 @@ pub fn to_array( .iter() .map(|value| value.as_bool().unwrap()) .collect::(); - Ok(Arc::new(BooleanArray::from_data(values, validity))) + Ok(Arc::new(BooleanArray::from_data( + data_type.clone(), + values, + validity, + ))) } DataType::Int8 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), DataType::Int16 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), diff --git a/src/io/parquet/read/boolean/basic.rs b/src/io/parquet/read/boolean/basic.rs index f23f631b2ff..914b2921b9a 100644 --- a/src/io/parquet/read/boolean/basic.rs +++ b/src/io/parquet/read/boolean/basic.rs @@ -1,6 +1,7 @@ use crate::{ array::BooleanArray, bitmap::{utils::BitmapIter, MutableBitmap}, + datatypes::DataType, error::{ArrowError, Result}, }; @@ -85,7 +86,11 @@ where )? } - Ok(BooleanArray::from_data(values.into(), validity.into())) + Ok(BooleanArray::from_data( + DataType::Boolean, + values.into(), + validity.into(), + )) } pub async fn stream_to_array(pages: I, metadata: &ColumnChunkMetaData) -> Result @@ -109,7 +114,11 @@ where )? 
} - Ok(BooleanArray::from_data(values.into(), validity.into())) + Ok(BooleanArray::from_data( + DataType::Boolean, + values.into(), + validity.into(), + )) } fn extend_from_page( diff --git a/src/io/parquet/read/boolean/nested.rs b/src/io/parquet/read/boolean/nested.rs index 1cb0ffc7c52..fc48477ea88 100644 --- a/src/io/parquet/read/boolean/nested.rs +++ b/src/io/parquet/read/boolean/nested.rs @@ -157,7 +157,22 @@ where )? } - let values = Arc::new(BooleanArray::from_data(values.into(), validity.into())); + let inner_data_type = match data_type { + DataType::List(ref inner) => inner.data_type(), + DataType::LargeList(ref inner) => inner.data_type(), + _ => { + return Err(ArrowError::NotYetImplemented(format!( + "Read nested datatype {:?}", + data_type + ))) + } + }; + + let values = Arc::new(BooleanArray::from_data( + inner_data_type.clone(), + values.into(), + validity.into(), + )); create_list(data_type, &mut nested, values) } From 98c90cef72405f6f29d0b01ca4a22d1ed9230f87 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 30 Aug 2021 11:24:55 +0000 Subject: [PATCH 06/12] Migrated Binary. 
--- benches/filter_kernels.rs | 3 ++- src/array/binary/ffi.rs | 7 ++++- src/array/binary/from.rs | 2 +- src/array/binary/mod.rs | 33 +++++++++++++++++------- src/array/binary/mutable.rs | 10 +++---- src/array/growable/binary.rs | 12 ++++++--- src/array/growable/boolean.rs | 3 ++- src/array/mod.rs | 8 +++--- src/array/utf8/mod.rs | 4 +++ src/compute/cast/binary_to.rs | 11 +++++--- src/compute/cast/mod.rs | 7 +++-- src/compute/take/binary.rs | 3 ++- src/io/ipc/read/array/binary.rs | 6 ++++- src/io/ipc/read/deserialize.rs | 2 ++ src/io/json_integration/read.rs | 8 +++--- src/io/parquet/read/binary/basic.rs | 2 ++ src/io/parquet/read/binary/dictionary.rs | 1 + src/io/parquet/read/binary/nested.rs | 1 + tests/it/array/binary/mod.rs | 4 ++- tests/it/array/boolean/mod.rs | 9 +++++-- tests/it/array/boolean/mutable.rs | 7 ++++- 21 files changed, 103 insertions(+), 40 deletions(-) diff --git a/benches/filter_kernels.rs b/benches/filter_kernels.rs index 65aaad86751..9d2a59e9b0c 100644 --- a/benches/filter_kernels.rs +++ b/benches/filter_kernels.rs @@ -40,7 +40,8 @@ fn add_benchmark(c: &mut Criterion) { let size = 2usize.pow(log2_size); let filter_array = create_boolean_array(size, 0.0, 0.9); - let filter_array = BooleanArray::from_data(filter_array.values().clone(), None); + let filter_array = + BooleanArray::from_data(DataType::Boolean, filter_array.values().clone(), None); let arr_a = create_primitive_array::(size, DataType::Float32, 0.0); c.bench_function(&format!("filter 2^{} f32", log2_size), |b| { diff --git a/src/array/binary/ffi.rs b/src/array/binary/ffi.rs index 63d0ea66f36..9a4257cabf1 100644 --- a/src/array/binary/ffi.rs +++ b/src/array/binary/ffi.rs @@ -44,6 +44,11 @@ unsafe impl FromFfi for BinaryArray { validity = validity.map(|x| x.slice(offset, length)) } - Ok(Self::from_data(offsets, values, validity)) + Ok(Self::from_data( + Self::default_data_type(), + offsets, + values, + validity, + )) } } diff --git a/src/array/binary/from.rs b/src/array/binary/from.rs 
index ef9a287a9c6..60fda084131 100644 --- a/src/array/binary/from.rs +++ b/src/array/binary/from.rs @@ -31,7 +31,7 @@ impl BinaryArray { // soundness: I is `TrustedLen` let (validity, offsets, values) = unsafe { trusted_len_unzip(iterator) }; - Self::from_data(offsets, values, validity) + Self::from_data(Self::default_data_type(), offsets, values, validity) } } diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 7c9a2fc3dcb..fa336d2dc8c 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -25,14 +25,15 @@ pub struct BinaryArray { // constructors impl BinaryArray { /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. - pub fn new_empty() -> Self { - Self::from_data(Buffer::from(&[O::zero()]), Buffer::new(), None) + pub fn new_empty(data_type: DataType) -> Self { + Self::from_data(data_type, Buffer::from(&[O::zero()]), Buffer::new(), None) } /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`. #[inline] - pub fn new_null(length: usize) -> Self { + pub fn new_null(data_type: DataType, length: usize) -> Self { Self::from_data( + data_type, Buffer::new_zeroed(length + 1), Buffer::new(), Some(Bitmap::new_zeroed(length)), @@ -43,19 +44,24 @@ impl BinaryArray { /// # Panics /// * The length of the offset buffer must be larger than 1 /// * The length of the values must be equal to the last offset value - pub fn from_data(offsets: Buffer, values: Buffer, validity: Option) -> Self { + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { check_offsets(&offsets, values.len()); if let Some(validity) = &validity { assert_eq!(offsets.len() - 1, validity.len()); } + if data_type != Self::default_data_type() { + panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary") + } + Self { - data_type: if O::is_large() { - DataType::LargeBinary - } else { - DataType::Binary - }, + data_type, offsets, values, validity, @@ -63,6 
+69,15 @@ impl BinaryArray { } } + /// Returns the default [`DataType`], `DataType::Binary` or `DataType::LargeBinary` + pub fn default_data_type() -> DataType { + if O::is_large() { + DataType::LargeBinary + } else { + DataType::Binary + } + } + /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`]. /// # Implementation /// This function is `O(1)`: all data will be shared between both arrays. diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 3ae30d436d9..52ae198fa7e 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -16,6 +16,7 @@ use super::BinaryArray; /// This struct does not allocate a validity until one is required (i.e. push a null to it). #[derive(Debug)] pub struct MutableBinaryArray { + data_type: DataType, offsets: MutableBuffer, values: MutableBuffer, validity: Option, @@ -24,6 +25,7 @@ pub struct MutableBinaryArray { impl From> for BinaryArray { fn from(other: MutableBinaryArray) -> Self { BinaryArray::::from_data( + other.data_type, other.offsets.into(), other.values.into(), other.validity.map(|x| x.into()), @@ -52,6 +54,7 @@ impl MutableBinaryArray { let mut offsets = MutableBuffer::::with_capacity(capacity + 1); offsets.push(O::default()); Self { + data_type: BinaryArray::::default_data_type(), offsets, values: MutableBuffer::::new(), validity: None, @@ -114,6 +117,7 @@ impl MutableArray for MutableBinaryArray { fn as_arc(&mut self) -> Arc { Arc::new(BinaryArray::from_data( + self.data_type.clone(), std::mem::take(&mut self.offsets).into(), std::mem::take(&mut self.values).into(), std::mem::take(&mut self.validity).map(|x| x.into()), @@ -121,11 +125,7 @@ impl MutableArray for MutableBinaryArray { } fn data_type(&self) -> &DataType { - if O::is_large() { - &DataType::LargeUtf8 - } else { - &DataType::Utf8 - } + &self.data_type } fn as_any(&self) -> &dyn std::any::Any { diff --git a/src/array/growable/binary.rs b/src/array/growable/binary.rs index f12a3a2a78d..f51dd90a5bd 100644 --- 
a/src/array/growable/binary.rs +++ b/src/array/growable/binary.rs @@ -4,6 +4,7 @@ use crate::{ array::{Array, BinaryArray, Offset}, bitmap::MutableBitmap, buffer::MutableBuffer, + datatypes::DataType, }; use super::{ @@ -14,6 +15,7 @@ use super::{ /// Concrete [`Growable`] for the [`BinaryArray`]. pub struct GrowableBinary<'a, O: Offset> { arrays: Vec<&'a BinaryArray>, + data_type: DataType, validity: MutableBitmap, values: MutableBuffer, offsets: MutableBuffer, @@ -25,8 +27,10 @@ pub struct GrowableBinary<'a, O: Offset> { impl<'a, O: Offset> GrowableBinary<'a, O> { /// # Panics - /// This function panics if any of the `arrays` is not downcastable to `PrimitiveArray`. + /// If `arrays` is empty. pub fn new(arrays: Vec<&'a BinaryArray>, mut use_validity: bool, capacity: usize) -> Self { + let data_type = arrays[0].data_type().clone(); + // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. if !use_validity & arrays.iter().any(|array| array.null_count() > 0) { @@ -44,6 +48,7 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { Self { arrays, + data_type, values: MutableBuffer::with_capacity(0), offsets, length, @@ -53,11 +58,12 @@ impl<'a, O: Offset> GrowableBinary<'a, O> { } fn to(&mut self) -> BinaryArray { + let data_type = self.data_type.clone(); let validity = std::mem::take(&mut self.validity); let offsets = std::mem::take(&mut self.offsets); let values = std::mem::take(&mut self.values); - BinaryArray::::from_data(offsets.into(), values.into(), validity.into()) + BinaryArray::::from_data(data_type, offsets.into(), values.into(), validity.into()) } } @@ -94,6 +100,6 @@ impl<'a, O: Offset> Growable<'a> for GrowableBinary<'a, O> { impl<'a, O: Offset> From> for BinaryArray { fn from(val: GrowableBinary<'a, O>) -> Self { - BinaryArray::::from_data(val.offsets.into(), val.values.into(), val.validity.into()) + BinaryArray::::from_data(val.data_type, val.offsets.into(), val.values.into(), 
val.validity.into()) } } diff --git a/src/array/growable/boolean.rs b/src/array/growable/boolean.rs index b3b858fa296..1dca6c1b1eb 100644 --- a/src/array/growable/boolean.rs +++ b/src/array/growable/boolean.rs @@ -24,9 +24,10 @@ pub struct GrowableBoolean<'a> { impl<'a> GrowableBoolean<'a> { pub fn new(arrays: Vec<&'a BooleanArray>, mut use_validity: bool, capacity: usize) -> Self { + let data_type = arrays[0].data_type().clone(); + // if any of the arrays has nulls, insertions from any array requires setting bits // as there is at least one array with nulls. - let data_type = arrays[0].data_type().clone(); if !use_validity & arrays.iter().any(|array| array.null_count() > 0) { use_validity = true; }; diff --git a/src/array/mod.rs b/src/array/mod.rs index 8dd3250e212..d35b27b5207 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -241,8 +241,8 @@ pub fn new_empty_array(data_type: DataType) -> Box { DataType::Float16 => unreachable!(), DataType::Float32 => Box::new(PrimitiveArray::::new_empty(data_type)), DataType::Float64 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Binary => Box::new(BinaryArray::::new_empty()), - DataType::LargeBinary => Box::new(BinaryArray::::new_empty()), + DataType::Binary => Box::new(BinaryArray::::new_empty(data_type)), + DataType::LargeBinary => Box::new(BinaryArray::::new_empty(data_type)), DataType::FixedSizeBinary(_) => Box::new(FixedSizeBinaryArray::new_empty(data_type)), DataType::Utf8 => Box::new(Utf8Array::::new_empty(data_type)), DataType::LargeUtf8 => Box::new(Utf8Array::::new_empty(data_type)), @@ -290,8 +290,8 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { DataType::Float16 => unreachable!(), DataType::Float32 => Box::new(PrimitiveArray::::new_null(data_type, length)), DataType::Float64 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Binary => Box::new(BinaryArray::::new_null(length)), - DataType::LargeBinary => Box::new(BinaryArray::::new_null(length)), + 
DataType::Binary => Box::new(BinaryArray::::new_null(data_type, length)), + DataType::LargeBinary => Box::new(BinaryArray::::new_null(data_type, length)), DataType::FixedSizeBinary(_) => Box::new(FixedSizeBinaryArray::new_null(data_type, length)), DataType::Utf8 => Box::new(Utf8Array::::new_null(data_type, length)), DataType::LargeUtf8 => Box::new(Utf8Array::::new_null(data_type, length)), diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 00e297e3cff..2e022b51439 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -72,6 +72,10 @@ impl Utf8Array { assert_eq!(offsets.len() - 1, validity.len()); } + if data_type != Self::default_data_type() { + panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") + } + Self { data_type, offsets, diff --git a/src/compute/cast/binary_to.rs b/src/compute/cast/binary_to.rs index 77492ad60cc..79edc2af5da 100644 --- a/src/compute/cast/binary_to.rs +++ b/src/compute/cast/binary_to.rs @@ -1,16 +1,20 @@ use std::convert::TryFrom; +use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::{array::*, buffer::Buffer}; -pub fn binary_to_large_binary(from: &BinaryArray) -> BinaryArray { +pub fn binary_to_large_binary(from: &BinaryArray, to_data_type: DataType) -> BinaryArray { let values = from.values().clone(); let offsets = from.offsets().iter().map(|x| *x as i64); let offsets = Buffer::from_trusted_len_iter(offsets); - BinaryArray::::from_data(offsets, values, from.validity().clone()) + BinaryArray::::from_data(to_data_type, offsets, values, from.validity().clone()) } -pub fn binary_large_to_binary(from: &BinaryArray) -> Result> { +pub fn binary_large_to_binary( + from: &BinaryArray, + to_data_type: DataType, +) -> Result> { let values = from.values().clone(); let _ = i32::try_from(*from.offsets().last().unwrap()).map_err(ArrowError::from_external_error)?; @@ -18,6 +22,7 @@ pub fn binary_large_to_binary(from: &BinaryArray) -> Result::from_data( + 
to_data_type, offsets, values, from.validity().clone(), diff --git a/src/compute/cast/mod.rs b/src/compute/cast/mod.rs index 7154265c305..451002b7b07 100644 --- a/src/compute/cast/mod.rs +++ b/src/compute/cast/mod.rs @@ -562,9 +562,12 @@ fn cast_with_options( (Binary, LargeBinary) => Ok(Box::new(binary_to_large_binary( array.as_any().downcast_ref().unwrap(), + to_type.clone(), ))), - (LargeBinary, Binary) => binary_large_to_binary(array.as_any().downcast_ref().unwrap()) - .map(|x| Box::new(x) as Box), + (LargeBinary, Binary) => { + binary_large_to_binary(array.as_any().downcast_ref().unwrap(), to_type.clone()) + .map(|x| Box::new(x) as Box) + } // start numeric casts (UInt8, UInt16) => primitive_to_primitive_dyn::(array, to_type, as_options), diff --git a/src/compute/take/binary.rs b/src/compute/take/binary.rs index 3208899467a..4bfb46abeda 100644 --- a/src/compute/take/binary.rs +++ b/src/compute/take/binary.rs @@ -25,6 +25,7 @@ pub fn take( values: &BinaryArray, indices: &PrimitiveArray, ) -> BinaryArray { + let data_type = values.data_type().clone(); let indices_has_validity = indices.null_count() > 0; let values_has_validity = values.null_count() > 0; @@ -36,5 +37,5 @@ pub fn take( (false, true) => take_indices_validity(values.offsets(), values.values(), indices), (true, true) => take_values_indices_validity(values, indices), }; - BinaryArray::::from_data(offsets, values, validity) + BinaryArray::::from_data(data_type, offsets, values, validity) } diff --git a/src/io/ipc/read/array/binary.rs b/src/io/ipc/read/array/binary.rs index 0e1ed3cd193..a232a16708b 100644 --- a/src/io/ipc/read/array/binary.rs +++ b/src/io/ipc/read/array/binary.rs @@ -4,6 +4,7 @@ use std::io::{Read, Seek}; use crate::array::{BinaryArray, Offset}; use crate::buffer::Buffer; +use crate::datatypes::DataType; use crate::error::Result; use crate::io::ipc::gen::Message::BodyCompression; use crate::types::NativeType; @@ -14,6 +15,7 @@ use super::super::read_basic::*; pub fn read_binary( 
field_nodes: &mut VecDeque, + data_type: DataType, buffers: &mut VecDeque<&gen::Schema::Buffer>, reader: &mut R, block_offset: u64, @@ -55,7 +57,9 @@ where compression, )?; - Ok(BinaryArray::::from_data(offsets, values, validity)) + Ok(BinaryArray::::from_data( + data_type, offsets, values, validity, + )) } pub fn skip_binary(field_nodes: &mut VecDeque, buffers: &mut VecDeque<&gen::Schema::Buffer>) { diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index 76abca6f71a..865b79059e6 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -176,6 +176,7 @@ pub fn read( DataType::Binary => { let array = read_binary::( field_nodes, + data_type, buffers, reader, block_offset, @@ -187,6 +188,7 @@ pub fn read( DataType::LargeBinary => { let array = read_binary::( field_nodes, + data_type, buffers, reader, block_offset, diff --git a/src/io/json_integration/read.rs b/src/io/json_integration/read.rs index a0396ca36c3..7810967ffc1 100644 --- a/src/io/json_integration/read.rs +++ b/src/io/json_integration/read.rs @@ -144,7 +144,7 @@ fn to_primitive( PrimitiveArray::::from_data(data_type, values, validity) } -fn to_binary(json_col: &ArrowJsonColumn) -> Arc { +fn to_binary(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc { let validity = to_validity(&json_col.validity); let offsets = to_offsets::(json_col.offset.as_ref()); let values = json_col @@ -155,7 +155,7 @@ fn to_binary(json_col: &ArrowJsonColumn) -> Arc { .map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap()) .flatten() .collect(); - Arc::new(BinaryArray::from_data(offsets, values, validity)) + Arc::new(BinaryArray::from_data(data_type, offsets, values, validity)) } fn to_utf8(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc { @@ -262,8 +262,8 @@ pub fn to_array( DataType::UInt64 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), DataType::Float32 => Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), DataType::Float64 => 
Ok(Arc::new(to_primitive::(json_col, data_type.clone()))), - DataType::Binary => Ok(to_binary::(json_col)), - DataType::LargeBinary => Ok(to_binary::(json_col)), + DataType::Binary => Ok(to_binary::(json_col, data_type.clone())), + DataType::LargeBinary => Ok(to_binary::(json_col, data_type.clone())), DataType::Utf8 => Ok(to_utf8::(json_col, data_type.clone())), DataType::LargeUtf8 => Ok(to_utf8::(json_col, data_type.clone())), DataType::FixedSizeBinary(_) => { diff --git a/src/io/parquet/read/binary/basic.rs b/src/io/parquet/read/binary/basic.rs index e27c8072a48..bf07f06e0f6 100644 --- a/src/io/parquet/read/binary/basic.rs +++ b/src/io/parquet/read/binary/basic.rs @@ -283,6 +283,7 @@ where Ok(match data_type { DataType::LargeBinary | DataType::Binary => Box::new(BinaryArray::from_data( + data_type.clone(), offsets.into(), values.into(), validity.into(), @@ -328,6 +329,7 @@ where Ok(match data_type { DataType::LargeBinary | DataType::Binary => Box::new(BinaryArray::from_data( + data_type.clone(), offsets.into(), values.into(), validity.into(), diff --git a/src/io/parquet/read/binary/dictionary.rs b/src/io/parquet/read/binary/dictionary.rs index fc74810795a..b0aa38f3f1e 100644 --- a/src/io/parquet/read/binary/dictionary.rs +++ b/src/io/parquet/read/binary/dictionary.rs @@ -151,6 +151,7 @@ where } let keys = PrimitiveArray::from_data(K::DATA_TYPE, indices.into(), validity.into()); + let data_type = DictionaryArray::::get_child(&data_type).clone(); let values = Arc::new(Utf8Array::from_data( data_type, offsets.into(), diff --git a/src/io/parquet/read/binary/nested.rs b/src/io/parquet/read/binary/nested.rs index 4e02acd1afd..211a79ef958 100644 --- a/src/io/parquet/read/binary/nested.rs +++ b/src/io/parquet/read/binary/nested.rs @@ -189,6 +189,7 @@ where let values = match inner_data_type { DataType::LargeBinary | DataType::Binary => Arc::new(BinaryArray::from_data( + inner_data_type.clone(), offsets.into(), values.into(), validity.into(), diff --git 
a/tests/it/array/binary/mod.rs b/tests/it/array/binary/mod.rs index 1f48730acd6..a7c64010348 100644 --- a/tests/it/array/binary/mod.rs +++ b/tests/it/array/binary/mod.rs @@ -1,6 +1,7 @@ use arrow2::{ array::{Array, BinaryArray}, bitmap::Bitmap, + datatypes::DataType, }; #[test] @@ -24,6 +25,7 @@ fn basics() { assert!(array.is_valid(2)); let array2 = BinaryArray::::from_data( + DataType::Binary, array.offsets().clone(), array.values().clone(), array.validity().clone(), @@ -40,7 +42,7 @@ fn basics() { #[test] fn empty() { - let array = BinaryArray::::new_empty(); + let array = BinaryArray::::new_empty(DataType::Binary); assert_eq!(array.values().as_slice(), b""); assert_eq!(array.offsets().as_slice(), &[0]); assert_eq!(array.validity(), &None); diff --git a/tests/it/array/boolean/mod.rs b/tests/it/array/boolean/mod.rs index ce2cb815d13..f2376878120 100644 --- a/tests/it/array/boolean/mod.rs +++ b/tests/it/array/boolean/mod.rs @@ -1,6 +1,7 @@ use arrow2::{ array::{Array, BooleanArray}, bitmap::Bitmap, + datatypes::DataType, }; mod mutable; @@ -23,7 +24,11 @@ fn basics() { assert!(!array.is_valid(1)); assert!(array.is_valid(2)); - let array2 = BooleanArray::from_data(array.values().clone(), array.validity().clone()); + let array2 = BooleanArray::from_data( + DataType::Boolean, + array.values().clone(), + array.validity().clone(), + ); assert_eq!(array, array2); let array = array.slice(1, 2); @@ -33,7 +38,7 @@ fn basics() { #[test] fn empty() { - let array = BooleanArray::new_empty(); + let array = BooleanArray::new_empty(DataType::Boolean); assert_eq!(array.values().len(), 0); assert_eq!(array.validity(), &None); } diff --git a/tests/it/array/boolean/mutable.rs b/tests/it/array/boolean/mutable.rs index 70ca6845f29..d3d8a3228e3 100644 --- a/tests/it/array/boolean/mutable.rs +++ b/tests/it/array/boolean/mutable.rs @@ -1,5 +1,6 @@ use arrow2::array::{MutableArray, MutableBooleanArray}; use arrow2::bitmap::MutableBitmap; +use arrow2::datatypes::DataType; use 
arrow2::error::Result; #[test] @@ -53,7 +54,11 @@ fn try_from_trusted_len_iter() { #[test] fn reserve() { - let mut a = MutableBooleanArray::from_data(MutableBitmap::new(), Some(MutableBitmap::new())); + let mut a = MutableBooleanArray::from_data( + DataType::Boolean, + MutableBitmap::new(), + Some(MutableBitmap::new()), + ); a.reserve(10); assert!(a.validity().as_ref().unwrap().capacity() > 0); From 445cd79c224615f59166733990cf450365f00dbb Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 30 Aug 2021 12:12:50 +0000 Subject: [PATCH 07/12] Added (private) PhysicalType. --- src/array/equal/mod.rs | 68 ++++---- src/array/ffi.rs | 67 ++++---- src/array/mod.rs | 285 +++++++++++++++------------------ src/compute/filter.rs | 133 +++++++-------- src/compute/take/mod.rs | 61 ++++--- src/datatypes/mod.rs | 49 ++++++ src/datatypes/physical_type.rs | 88 ++++++++++ src/ffi/array.rs | 72 ++++----- src/io/ipc/read/deserialize.rs | 111 +++++-------- src/io/ipc/write/serialize.rs | 99 ++++-------- src/types/mod.rs | 46 ++---- 11 files changed, 522 insertions(+), 557 deletions(-) create mode 100644 src/datatypes/physical_type.rs diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index 765907ab4d2..aff33cfd23d 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -1,7 +1,4 @@ -use crate::{ - datatypes::{DataType, IntervalUnit}, - types::{days_ms, NativeType}, -}; +use crate::types::{days_ms, NativeType}; use super::*; @@ -164,138 +161,131 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool { return false; } - match lhs.data_type() { - DataType::Null => { + use crate::datatypes::PhysicalType::*; + match lhs.data_type().to_physical_type() { + Null => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); null::equal(lhs, rhs) } - DataType::Boolean => { + Boolean => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); boolean::equal(lhs, rhs) } - 
DataType::UInt8 => { + UInt8 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::UInt16 => { + UInt16 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::UInt32 => { + UInt32 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::UInt64 => { + UInt64 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Int8 => { + Int8 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Int16 => { + Int16 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { + Int32 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { + Int64 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Decimal(_, _) => { + Int128 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Interval(IntervalUnit::DayTime) => { + DaysMs => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Float16 => unreachable!(), - DataType::Float32 => { + Float32 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = 
rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Float64 => { + Float64 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); primitive::equal::(lhs, rhs) } - DataType::Utf8 => { + Utf8 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); utf8::equal::(lhs, rhs) } - DataType::LargeUtf8 => { + LargeUtf8 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); utf8::equal::(lhs, rhs) } - DataType::Binary => { + Binary => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); binary::equal::(lhs, rhs) } - DataType::LargeBinary => { + LargeBinary => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); binary::equal::(lhs, rhs) } - DataType::List(_) => { + List => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); list::equal::(lhs, rhs) } - DataType::LargeList(_) => { + LargeList => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); list::equal::(lhs, rhs) } - DataType::Struct(_) => { + Struct => { let lhs = lhs.as_any().downcast_ref::().unwrap(); let rhs = rhs.as_any().downcast_ref::().unwrap(); struct_::equal(lhs, rhs) } - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); dictionary::equal::<$T>(lhs, rhs) }) } - DataType::FixedSizeBinary(_) => { + FixedSizeBinary => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); fixed_size_binary::equal(lhs, rhs) } - DataType::FixedSizeList(_, _) => { + FixedSizeList => { let lhs = 
lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); fixed_size_list::equal(lhs, rhs) } - DataType::Union(_, _, _) => { + Union => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); union::equal(lhs, rhs) diff --git a/src/array/ffi.rs b/src/array/ffi.rs index ea03602336f..a921550fb0d 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use crate::datatypes::DataType; +use crate::datatypes::PhysicalType; use crate::{array::*, ffi}; use crate::error::Result; @@ -41,43 +41,34 @@ type BuffersChildren = ( ); pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { - match array.data_type() { - DataType::Null => ffi_dyn!(array, NullArray), - DataType::Boolean => ffi_dyn!(array, BooleanArray), - DataType::Int8 => ffi_dyn!(array, PrimitiveArray), - DataType::Int16 => ffi_dyn!(array, PrimitiveArray), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - ffi_dyn!(array, PrimitiveArray) - } - DataType::Interval(IntervalUnit::DayTime) => ffi_dyn!(array, PrimitiveArray), - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => ffi_dyn!(array, PrimitiveArray), - DataType::Decimal(_, _) => ffi_dyn!(array, PrimitiveArray), - DataType::UInt8 => ffi_dyn!(array, PrimitiveArray), - DataType::UInt16 => ffi_dyn!(array, PrimitiveArray), - DataType::UInt32 => ffi_dyn!(array, PrimitiveArray), - DataType::UInt64 => ffi_dyn!(array, PrimitiveArray), - DataType::Float16 => unreachable!(), - DataType::Float32 => ffi_dyn!(array, PrimitiveArray), - DataType::Float64 => ffi_dyn!(array, PrimitiveArray), - DataType::Binary => ffi_dyn!(array, BinaryArray), - DataType::LargeBinary => ffi_dyn!(array, BinaryArray), - DataType::FixedSizeBinary(_) => ffi_dyn!(array, FixedSizeBinaryArray), - DataType::Utf8 => ffi_dyn!(array, Utf8Array::), - 
DataType::LargeUtf8 => ffi_dyn!(array, Utf8Array::), - DataType::List(_) => ffi_dyn!(array, ListArray::), - DataType::LargeList(_) => ffi_dyn!(array, ListArray::), - DataType::FixedSizeList(_, _) => ffi_dyn!(array, FixedSizeListArray), - DataType::Struct(_) => ffi_dyn!(array, StructArray), - DataType::Union(_, _, _) => ffi_dyn!(array, UnionArray), - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + use PhysicalType::*; + match array.data_type().to_physical_type() { + Null => ffi_dyn!(array, NullArray), + Boolean => ffi_dyn!(array, BooleanArray), + Int8 => ffi_dyn!(array, PrimitiveArray), + Int16 => ffi_dyn!(array, PrimitiveArray), + Int32 => ffi_dyn!(array, PrimitiveArray), + DaysMs => ffi_dyn!(array, PrimitiveArray), + Int64 => ffi_dyn!(array, PrimitiveArray), + Int128 => ffi_dyn!(array, PrimitiveArray), + UInt8 => ffi_dyn!(array, PrimitiveArray), + UInt16 => ffi_dyn!(array, PrimitiveArray), + UInt32 => ffi_dyn!(array, PrimitiveArray), + UInt64 => ffi_dyn!(array, PrimitiveArray), + Float32 => ffi_dyn!(array, PrimitiveArray), + Float64 => ffi_dyn!(array, PrimitiveArray), + Binary => ffi_dyn!(array, BinaryArray), + LargeBinary => ffi_dyn!(array, BinaryArray), + FixedSizeBinary => ffi_dyn!(array, FixedSizeBinaryArray), + Utf8 => ffi_dyn!(array, Utf8Array::), + LargeUtf8 => ffi_dyn!(array, Utf8Array::), + List => ffi_dyn!(array, ListArray::), + LargeList => ffi_dyn!(array, ListArray::), + FixedSizeList => ffi_dyn!(array, FixedSizeListArray), + Struct => ffi_dyn!(array, StructArray), + Union => ffi_dyn!(array, UnionArray), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { let array = array.as_any().downcast_ref::>().unwrap(); ( array.buffers(), diff --git a/src/array/mod.rs b/src/array/mod.rs index d35b27b5207..1b054ceb75a 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -20,7 +20,7 @@ use crate::error::Result; use crate::types::days_ms; use crate::{ bitmap::{Bitmap, 
MutableBitmap}, - datatypes::{DataType, IntervalUnit}, + datatypes::DataType, }; /// A trait representing an immutable Arrow array. Arrow arrays are trait objects @@ -164,47 +164,53 @@ macro_rules! with_match_dictionary_key_type {( } })} +macro_rules! with_match_physical_dictionary_key_type {( + $key_type:expr, | $_:tt $T:ident | $($body:tt)* +) => ({ + macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )} + use crate::datatypes::DictionaryIndexType::*; + match $key_type { + Int8 => __with_ty__! { i8 }, + Int16 => __with_ty__! { i16 }, + Int32 => __with_ty__! { i32 }, + Int64 => __with_ty__! { i64 }, + UInt8 => __with_ty__! { u8 }, + UInt16 => __with_ty__! { u16 }, + UInt32 => __with_ty__! { u32 }, + UInt64 => __with_ty__! { u64 }, + } +})} + impl Display for dyn Array { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.data_type() { - DataType::Null => fmt_dyn!(self, NullArray, f), - DataType::Boolean => fmt_dyn!(self, BooleanArray, f), - DataType::Int8 => fmt_dyn!(self, PrimitiveArray, f), - DataType::Int16 => fmt_dyn!(self, PrimitiveArray, f), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - fmt_dyn!(self, PrimitiveArray, f) - } - DataType::Interval(IntervalUnit::DayTime) => { - fmt_dyn!(self, PrimitiveArray, f) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => fmt_dyn!(self, PrimitiveArray, f), - DataType::Decimal(_, _) => fmt_dyn!(self, PrimitiveArray, f), - DataType::UInt8 => fmt_dyn!(self, PrimitiveArray, f), - DataType::UInt16 => fmt_dyn!(self, PrimitiveArray, f), - DataType::UInt32 => fmt_dyn!(self, PrimitiveArray, f), - DataType::UInt64 => fmt_dyn!(self, PrimitiveArray, f), - DataType::Float16 => unreachable!(), - DataType::Float32 => fmt_dyn!(self, PrimitiveArray, f), - DataType::Float64 => fmt_dyn!(self, PrimitiveArray, f), - DataType::Binary => fmt_dyn!(self, BinaryArray, 
f), - DataType::LargeBinary => fmt_dyn!(self, BinaryArray, f), - DataType::FixedSizeBinary(_) => fmt_dyn!(self, FixedSizeBinaryArray, f), - DataType::Utf8 => fmt_dyn!(self, Utf8Array::, f), - DataType::LargeUtf8 => fmt_dyn!(self, Utf8Array::, f), - DataType::List(_) => fmt_dyn!(self, ListArray::, f), - DataType::LargeList(_) => fmt_dyn!(self, ListArray::, f), - DataType::FixedSizeList(_, _) => fmt_dyn!(self, FixedSizeListArray, f), - DataType::Struct(_) => fmt_dyn!(self, StructArray, f), - DataType::Union(_, _, _) => fmt_dyn!(self, UnionArray, f), - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + use crate::datatypes::PhysicalType::*; + match self.data_type().to_physical_type() { + Null => fmt_dyn!(self, NullArray, f), + Boolean => fmt_dyn!(self, BooleanArray, f), + Int8 => fmt_dyn!(self, PrimitiveArray, f), + Int16 => fmt_dyn!(self, PrimitiveArray, f), + Int32 => fmt_dyn!(self, PrimitiveArray, f), + DaysMs => fmt_dyn!(self, PrimitiveArray, f), + Int64 => fmt_dyn!(self, PrimitiveArray, f), + Int128 => fmt_dyn!(self, PrimitiveArray, f), + UInt8 => fmt_dyn!(self, PrimitiveArray, f), + UInt16 => fmt_dyn!(self, PrimitiveArray, f), + UInt32 => fmt_dyn!(self, PrimitiveArray, f), + UInt64 => fmt_dyn!(self, PrimitiveArray, f), + Float32 => fmt_dyn!(self, PrimitiveArray, f), + Float64 => fmt_dyn!(self, PrimitiveArray, f), + Binary => fmt_dyn!(self, BinaryArray, f), + LargeBinary => fmt_dyn!(self, BinaryArray, f), + FixedSizeBinary => fmt_dyn!(self, FixedSizeBinaryArray, f), + Utf8 => fmt_dyn!(self, Utf8Array::, f), + LargeUtf8 => fmt_dyn!(self, Utf8Array::, f), + List => fmt_dyn!(self, ListArray::, f), + LargeList => fmt_dyn!(self, ListArray::, f), + FixedSizeList => fmt_dyn!(self, FixedSizeListArray, f), + Struct => fmt_dyn!(self, StructArray, f), + Union => fmt_dyn!(self, UnionArray, f), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { fmt_dyn!(self, DictionaryArray::<$T>, f) }) } 
@@ -214,45 +220,34 @@ impl Display for dyn Array { /// Creates a new [`Array`] with a [`Array::len`] of 0. pub fn new_empty_array(data_type: DataType) -> Box { - match data_type { - DataType::Null => Box::new(NullArray::new_empty(data_type)), - DataType::Boolean => Box::new(BooleanArray::new_empty(data_type)), - DataType::Int8 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Int16 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - Box::new(PrimitiveArray::::new_empty(data_type)) - } - DataType::Interval(IntervalUnit::DayTime) => { - Box::new(PrimitiveArray::::new_empty(data_type)) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Decimal(_, _) => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::UInt8 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::UInt16 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::UInt32 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::UInt64 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Float16 => unreachable!(), - DataType::Float32 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Float64 => Box::new(PrimitiveArray::::new_empty(data_type)), - DataType::Binary => Box::new(BinaryArray::::new_empty(data_type)), - DataType::LargeBinary => Box::new(BinaryArray::::new_empty(data_type)), - DataType::FixedSizeBinary(_) => Box::new(FixedSizeBinaryArray::new_empty(data_type)), - DataType::Utf8 => Box::new(Utf8Array::::new_empty(data_type)), - DataType::LargeUtf8 => Box::new(Utf8Array::::new_empty(data_type)), - DataType::List(_) => Box::new(ListArray::::new_empty(data_type)), - DataType::LargeList(_) => Box::new(ListArray::::new_empty(data_type)), - DataType::FixedSizeList(_, _) => 
Box::new(FixedSizeListArray::new_empty(data_type)), - DataType::Struct(_) => Box::new(StructArray::new_empty(data_type)), - DataType::Union(_, _, _) => Box::new(UnionArray::new_empty(data_type)), - DataType::Dictionary(ref key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + use crate::datatypes::PhysicalType::*; + match data_type.to_physical_type() { + Null => Box::new(NullArray::new_empty(data_type)), + Boolean => Box::new(BooleanArray::new_empty(data_type)), + Int8 => Box::new(PrimitiveArray::::new_empty(data_type)), + Int16 => Box::new(PrimitiveArray::::new_empty(data_type)), + Int32 => Box::new(PrimitiveArray::::new_empty(data_type)), + DaysMs => Box::new(PrimitiveArray::::new_empty(data_type)), + Int64 => Box::new(PrimitiveArray::::new_empty(data_type)), + Int128 => Box::new(PrimitiveArray::::new_empty(data_type)), + UInt8 => Box::new(PrimitiveArray::::new_empty(data_type)), + UInt16 => Box::new(PrimitiveArray::::new_empty(data_type)), + UInt32 => Box::new(PrimitiveArray::::new_empty(data_type)), + UInt64 => Box::new(PrimitiveArray::::new_empty(data_type)), + Float32 => Box::new(PrimitiveArray::::new_empty(data_type)), + Float64 => Box::new(PrimitiveArray::::new_empty(data_type)), + Binary => Box::new(BinaryArray::::new_empty(data_type)), + LargeBinary => Box::new(BinaryArray::::new_empty(data_type)), + FixedSizeBinary => Box::new(FixedSizeBinaryArray::new_empty(data_type)), + Utf8 => Box::new(Utf8Array::::new_empty(data_type)), + LargeUtf8 => Box::new(Utf8Array::::new_empty(data_type)), + List => Box::new(ListArray::::new_empty(data_type)), + LargeList => Box::new(ListArray::::new_empty(data_type)), + FixedSizeList => Box::new(FixedSizeListArray::new_empty(data_type)), + Struct => Box::new(StructArray::new_empty(data_type)), + Union => Box::new(UnionArray::new_empty(data_type)), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::new_empty(data_type)) }) } @@ 
-263,45 +258,34 @@ pub fn new_empty_array(data_type: DataType) -> Box { /// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`] /// for all types except Union, which does not have a validity. pub fn new_null_array(data_type: DataType, length: usize) -> Box { - match data_type { - DataType::Null => Box::new(NullArray::new_null(data_type, length)), - DataType::Boolean => Box::new(BooleanArray::new_null(data_type, length)), - DataType::Int8 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Int16 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - Box::new(PrimitiveArray::::new_null(data_type, length)) - } - DataType::Interval(IntervalUnit::DayTime) => { - Box::new(PrimitiveArray::::new_null(data_type, length)) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Decimal(_, _) => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::UInt8 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::UInt16 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::UInt32 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::UInt64 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Float16 => unreachable!(), - DataType::Float32 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Float64 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DataType::Binary => Box::new(BinaryArray::::new_null(data_type, length)), - DataType::LargeBinary => Box::new(BinaryArray::::new_null(data_type, length)), - DataType::FixedSizeBinary(_) => Box::new(FixedSizeBinaryArray::new_null(data_type, length)), - DataType::Utf8 => Box::new(Utf8Array::::new_null(data_type, length)), - 
DataType::LargeUtf8 => Box::new(Utf8Array::::new_null(data_type, length)), - DataType::List(_) => Box::new(ListArray::::new_null(data_type, length)), - DataType::LargeList(_) => Box::new(ListArray::::new_null(data_type, length)), - DataType::FixedSizeList(_, _) => Box::new(FixedSizeListArray::new_null(data_type, length)), - DataType::Struct(_) => Box::new(StructArray::new_null(data_type, length)), - DataType::Union(_, _, _) => Box::new(UnionArray::new_null(data_type, length)), - DataType::Dictionary(ref key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + use crate::datatypes::PhysicalType::*; + match data_type.to_physical_type() { + Null => Box::new(NullArray::new_null(data_type, length)), + Boolean => Box::new(BooleanArray::new_null(data_type, length)), + Int8 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Int16 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Int32 => Box::new(PrimitiveArray::::new_null(data_type, length)), + DaysMs => Box::new(PrimitiveArray::::new_null(data_type, length)), + Int64 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Int128 => Box::new(PrimitiveArray::::new_null(data_type, length)), + UInt8 => Box::new(PrimitiveArray::::new_null(data_type, length)), + UInt16 => Box::new(PrimitiveArray::::new_null(data_type, length)), + UInt32 => Box::new(PrimitiveArray::::new_null(data_type, length)), + UInt64 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Float32 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Float64 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Binary => Box::new(BinaryArray::::new_null(data_type, length)), + LargeBinary => Box::new(BinaryArray::::new_null(data_type, length)), + FixedSizeBinary => Box::new(FixedSizeBinaryArray::new_null(data_type, length)), + Utf8 => Box::new(Utf8Array::::new_null(data_type, length)), + LargeUtf8 => Box::new(Utf8Array::::new_null(data_type, length)), + List => 
Box::new(ListArray::::new_null(data_type, length)), + LargeList => Box::new(ListArray::::new_null(data_type, length)), + FixedSizeList => Box::new(FixedSizeListArray::new_null(data_type, length)), + Struct => Box::new(StructArray::new_null(data_type, length)), + Union => Box::new(UnionArray::new_null(data_type, length)), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::new_null(data_type, length)) }) } @@ -320,43 +304,34 @@ macro_rules! clone_dyn { /// This operation is `O(1)` over `len`, as it amounts to increase two ref counts /// and moving the concrete struct under a `Box`. pub fn clone(array: &dyn Array) -> Box { - match array.data_type() { - DataType::Null => clone_dyn!(array, NullArray), - DataType::Boolean => clone_dyn!(array, BooleanArray), - DataType::Int8 => clone_dyn!(array, PrimitiveArray), - DataType::Int16 => clone_dyn!(array, PrimitiveArray), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - clone_dyn!(array, PrimitiveArray) - } - DataType::Interval(IntervalUnit::DayTime) => clone_dyn!(array, PrimitiveArray), - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => clone_dyn!(array, PrimitiveArray), - DataType::Decimal(_, _) => clone_dyn!(array, PrimitiveArray), - DataType::UInt8 => clone_dyn!(array, PrimitiveArray), - DataType::UInt16 => clone_dyn!(array, PrimitiveArray), - DataType::UInt32 => clone_dyn!(array, PrimitiveArray), - DataType::UInt64 => clone_dyn!(array, PrimitiveArray), - DataType::Float16 => unreachable!(), - DataType::Float32 => clone_dyn!(array, PrimitiveArray), - DataType::Float64 => clone_dyn!(array, PrimitiveArray), - DataType::Binary => clone_dyn!(array, BinaryArray), - DataType::LargeBinary => clone_dyn!(array, BinaryArray), - DataType::FixedSizeBinary(_) => clone_dyn!(array, FixedSizeBinaryArray), - DataType::Utf8 => 
clone_dyn!(array, Utf8Array::), - DataType::LargeUtf8 => clone_dyn!(array, Utf8Array::), - DataType::List(_) => clone_dyn!(array, ListArray::), - DataType::LargeList(_) => clone_dyn!(array, ListArray::), - DataType::FixedSizeList(_, _) => clone_dyn!(array, FixedSizeListArray), - DataType::Struct(_) => clone_dyn!(array, StructArray), - DataType::Union(_, _, _) => clone_dyn!(array, UnionArray), - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + use crate::datatypes::PhysicalType::*; + match array.data_type().to_physical_type() { + Null => clone_dyn!(array, NullArray), + Boolean => clone_dyn!(array, BooleanArray), + Int8 => clone_dyn!(array, PrimitiveArray), + Int16 => clone_dyn!(array, PrimitiveArray), + Int32 => clone_dyn!(array, PrimitiveArray), + DaysMs => clone_dyn!(array, PrimitiveArray), + Int64 => clone_dyn!(array, PrimitiveArray), + Int128 => clone_dyn!(array, PrimitiveArray), + UInt8 => clone_dyn!(array, PrimitiveArray), + UInt16 => clone_dyn!(array, PrimitiveArray), + UInt32 => clone_dyn!(array, PrimitiveArray), + UInt64 => clone_dyn!(array, PrimitiveArray), + Float32 => clone_dyn!(array, PrimitiveArray), + Float64 => clone_dyn!(array, PrimitiveArray), + Binary => clone_dyn!(array, BinaryArray), + LargeBinary => clone_dyn!(array, BinaryArray), + FixedSizeBinary => clone_dyn!(array, FixedSizeBinaryArray), + Utf8 => clone_dyn!(array, Utf8Array::), + LargeUtf8 => clone_dyn!(array, Utf8Array::), + List => clone_dyn!(array, ListArray::), + LargeList => clone_dyn!(array, ListArray::), + FixedSizeList => clone_dyn!(array, FixedSizeListArray), + Struct => clone_dyn!(array, StructArray), + Union => clone_dyn!(array, UnionArray), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { clone_dyn!(array, DictionaryArray::<$T>) }) } diff --git a/src/compute/filter.rs b/src/compute/filter.rs index f7132cc4721..2e92ef0083e 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ 
-15,12 +15,9 @@ // specific language governing permissions and limitations // under the License. +use crate::array::growable::make_growable; use crate::array::growable::Growable; use crate::record_batch::RecordBatch; -use crate::{ - array::growable::make_growable, - datatypes::{DataType, IntervalUnit}, -}; use crate::{array::*, bitmap::Bitmap, types::NativeType}; use crate::{ bitmap::{utils::SlicesIterator, MutableBitmap}, @@ -88,7 +85,7 @@ fn filter_growable<'a>(growable: &mut impl Growable<'a>, chunks: &[(usize, usize .for_each(|(start, len)| growable.extend(0, *start, *len)); } -macro_rules! dyn_build_filter { +macro_rules! dyn_filter { ($ty:ty, $array:expr, $filter_count:expr, $chunks:expr) => {{ let array = $array.as_any().downcast_ref().unwrap(); let mut growable = @@ -109,60 +106,42 @@ pub fn build_filter(filter: &BooleanArray) -> Result { let filter_count = iter.slots(); let chunks = iter.collect::>(); - Ok(Box::new(move |array: &dyn Array| match array.data_type() { - DataType::UInt8 => { - dyn_build_filter!(u8, array, filter_count, chunks) - } - DataType::UInt16 => { - dyn_build_filter!(u16, array, filter_count, chunks) - } - DataType::UInt32 => { - dyn_build_filter!(u32, array, filter_count, chunks) - } - DataType::UInt64 => { - dyn_build_filter!(u64, array, filter_count, chunks) - } - DataType::Int8 => { - dyn_build_filter!(i8, array, filter_count, chunks) - } - DataType::Int16 => { - dyn_build_filter!(i16, array, filter_count, chunks) - } - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - dyn_build_filter!(i32, array, filter_count, chunks) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - dyn_build_filter!(i64, array, filter_count, chunks) - } - DataType::Interval(IntervalUnit::DayTime) => { - dyn_build_filter!(days_ms, array, filter_count, chunks) - } - DataType::Float32 => { - dyn_build_filter!(f32, array, 
filter_count, chunks) - } - DataType::Float64 => { - dyn_build_filter!(f64, array, filter_count, chunks) - } - DataType::Utf8 => { - let array = array.as_any().downcast_ref::>().unwrap(); - let mut growable = growable::GrowableUtf8::::new(vec![array], false, filter_count); - filter_growable(&mut growable, &chunks); - let array: Utf8Array = growable.into(); - Box::new(array) - } - _ => { - let mut mutable = make_growable(&[array], false, filter_count); - chunks - .iter() - .for_each(|(start, len)| mutable.extend(0, *start, *len)); - mutable.as_box() + use crate::datatypes::PhysicalType::*; + Ok(Box::new(move |array: &dyn Array| { + match array.data_type().to_physical_type() { + UInt8 => dyn_filter!(u8, array, filter_count, chunks), + UInt16 => dyn_filter!(u16, array, filter_count, chunks), + UInt32 => dyn_filter!(u32, array, filter_count, chunks), + UInt64 => dyn_filter!(u64, array, filter_count, chunks), + Int8 => dyn_filter!(i8, array, filter_count, chunks), + Int16 => dyn_filter!(i16, array, filter_count, chunks), + Int32 => dyn_filter!(i32, array, filter_count, chunks), + Int64 => dyn_filter!(i64, array, filter_count, chunks), + Int128 => dyn_filter!(i128, array, filter_count, chunks), + DaysMs => dyn_filter!(days_ms, array, filter_count, chunks), + Float32 => dyn_filter!(f32, array, filter_count, chunks), + Float64 => dyn_filter!(f64, array, filter_count, chunks), + Utf8 => { + let array = array.as_any().downcast_ref::>().unwrap(); + let mut growable = growable::GrowableUtf8::new(vec![array], false, filter_count); + filter_growable(&mut growable, &chunks); + let array: Utf8Array = growable.into(); + Box::new(array) + } + LargeUtf8 => { + let array = array.as_any().downcast_ref::>().unwrap(); + let mut growable = growable::GrowableUtf8::new(vec![array], false, filter_count); + filter_growable(&mut growable, &chunks); + let array: Utf8Array = growable.into(); + Box::new(array) + } + _ => { + let mut mutable = make_growable(&[array], false, filter_count); + chunks 
+ .iter() + .for_each(|(start, len)| mutable.extend(0, *start, *len)); + mutable.as_box() + } } })) } @@ -185,55 +164,53 @@ pub fn build_filter(filter: &BooleanArray) -> Result { /// # } /// ``` pub fn filter(array: &dyn Array, filter: &BooleanArray) -> Result> { - match array.data_type() { - DataType::UInt8 => { + use crate::datatypes::PhysicalType::*; + match array.data_type().to_physical_type() { + UInt8 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::UInt16 => { + UInt16 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::UInt32 => { + UInt32 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::UInt64 => { + UInt64 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Int8 => { + Int8 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Int16 => { + Int16 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { + Int32 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { + Int64 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Interval(IntervalUnit::DayTime) => { + Int128 => { + let array = array.as_any().downcast_ref().unwrap(); + Ok(Box::new(filter_primitive::(array, filter))) + } + DaysMs => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Float32 => { + 
Float32 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } - DataType::Float64 => { + Float64 => { let array = array.as_any().downcast_ref().unwrap(); Ok(Box::new(filter_primitive::(array, filter))) } diff --git a/src/compute/take/mod.rs b/src/compute/take/mod.rs index ada94c38fe5..01296bc3df9 100644 --- a/src/compute/take/mod.rs +++ b/src/compute/take/mod.rs @@ -19,7 +19,7 @@ use crate::{ array::{new_empty_array, Array, NullArray, PrimitiveArray}, - datatypes::{DataType, IntervalUnit}, + datatypes::DataType, error::Result, types::{days_ms, Index}, }; @@ -58,65 +58,58 @@ pub fn take(values: &dyn Array, indices: &PrimitiveArray) -> Result return Ok(new_empty_array(values.data_type().clone())); } - match values.data_type() { - DataType::Null => Ok(Box::new(NullArray::from_data( + use crate::datatypes::PhysicalType::*; + match values.data_type().to_physical_type() { + Null => Ok(Box::new(NullArray::from_data( values.data_type().clone(), indices.len(), ))), - DataType::Boolean => { + Boolean => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(boolean::take::(values, indices))) } - DataType::Int8 => downcast_take!(i8, values, indices), - DataType::Int16 => downcast_take!(i16, values, indices), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => downcast_take!(i32, values, indices), - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Timestamp(_, _) => downcast_take!(i64, values, indices), - DataType::Interval(IntervalUnit::DayTime) => downcast_take!(days_ms, values, indices), - DataType::UInt8 => downcast_take!(u8, values, indices), - DataType::UInt16 => downcast_take!(u16, values, indices), - DataType::UInt32 => downcast_take!(u32, values, indices), - DataType::UInt64 => downcast_take!(u64, values, indices), - DataType::Float16 => unreachable!(), - DataType::Float32 => 
downcast_take!(f32, values, indices), - DataType::Float64 => downcast_take!(f64, values, indices), - DataType::Decimal(_, _) => downcast_take!(i128, values, indices), - DataType::Utf8 => { + Int8 => downcast_take!(i8, values, indices), + Int16 => downcast_take!(i16, values, indices), + Int32 => downcast_take!(i32, values, indices), + Int64 => downcast_take!(i64, values, indices), + Int128 => downcast_take!(i128, values, indices), + DaysMs => downcast_take!(days_ms, values, indices), + UInt8 => downcast_take!(u8, values, indices), + UInt16 => downcast_take!(u16, values, indices), + UInt32 => downcast_take!(u32, values, indices), + UInt64 => downcast_take!(u64, values, indices), + Float32 => downcast_take!(f32, values, indices), + Float64 => downcast_take!(f64, values, indices), + Utf8 => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(utf8::take::(values, indices))) } - DataType::LargeUtf8 => { + LargeUtf8 => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(utf8::take::(values, indices))) } - DataType::Binary => { + Binary => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(binary::take::(values, indices))) } - DataType::LargeBinary => { + LargeBinary => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(binary::take::(values, indices))) } - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { downcast_dict_take!($T, values, indices) }) } - DataType::Struct(_) => { + Struct => { let array = values.as_any().downcast_ref().unwrap(); Ok(Box::new(structure::take::<_>(array, indices)?)) } - DataType::List(_) => { + List => { let array = values.as_any().downcast_ref().unwrap(); Ok(Box::new(list::take::(array, indices))) } - DataType::LargeList(_) => { + LargeList => { let array = values.as_any().downcast_ref().unwrap(); Ok(Box::new(list::take::(array, indices))) } @@ 
-183,7 +176,7 @@ pub fn can_take(data_type: &DataType) -> bool { mod tests { use std::sync::Arc; - use crate::datatypes::Field; + use crate::datatypes::{Field, IntervalUnit}; use crate::{array::*, bitmap::MutableBitmap, types::NativeType}; use super::*; diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 7aa4a886ed3..c955b274b23 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -1,8 +1,10 @@ //! Metadata declarations such as [`DataType`], [`Field`] and [`Schema`]. mod field; +mod physical_type; mod schema; pub use field::Field; +pub(crate) use physical_type::*; pub use schema::Schema; /// The set of datatypes that are supported by this implementation of Apache Arrow. @@ -169,6 +171,53 @@ impl DataType { _ => self == other, } } + + /// Returns the physical type of the logical type + pub(crate) fn to_physical_type(&self) -> PhysicalType { + use DataType::*; + match self { + Null => PhysicalType::Null, + Boolean => PhysicalType::Boolean, + Int8 => PhysicalType::Int8, + Int16 => PhysicalType::Int16, + Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => PhysicalType::Int32, + Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => PhysicalType::Int64, + Decimal(_, _) => PhysicalType::Int128, + UInt8 => PhysicalType::UInt8, + UInt16 => PhysicalType::UInt16, + UInt32 => PhysicalType::UInt32, + UInt64 => PhysicalType::UInt64, + Float16 => unreachable!(), + Float32 => PhysicalType::Float32, + Float64 => PhysicalType::Float64, + Interval(IntervalUnit::DayTime) => PhysicalType::DaysMs, + Binary => PhysicalType::Binary, + FixedSizeBinary(_) => PhysicalType::FixedSizeBinary, + LargeBinary => PhysicalType::LargeBinary, + Utf8 => PhysicalType::Utf8, + LargeUtf8 => PhysicalType::LargeUtf8, + List(_) => PhysicalType::List, + FixedSizeList(_, _) => PhysicalType::FixedSizeList, + LargeList(_) => PhysicalType::LargeList, + Struct(_) => PhysicalType::Struct, + Union(_, _, _) => PhysicalType::Union, + Dictionary(key, _) => 
PhysicalType::Dictionary(to_dictionary_index_type(key.as_ref())), + } + } +} + +fn to_dictionary_index_type(data_type: &DataType) -> DictionaryIndexType { + match data_type { + DataType::Int8 => DictionaryIndexType::Int8, + DataType::Int16 => DictionaryIndexType::Int16, + DataType::Int32 => DictionaryIndexType::Int32, + DataType::Int64 => DictionaryIndexType::Int64, + DataType::UInt8 => DictionaryIndexType::UInt8, + DataType::UInt16 => DictionaryIndexType::UInt16, + DataType::UInt32 => DictionaryIndexType::UInt32, + DataType::UInt64 => DictionaryIndexType::UInt64, + _ => ::core::unreachable!("A dictionary key type can only be of integer types"), + } } // backward compatibility diff --git a/src/datatypes/physical_type.rs b/src/datatypes/physical_type.rs new file mode 100644 index 00000000000..701bf237018 --- /dev/null +++ b/src/datatypes/physical_type.rs @@ -0,0 +1,88 @@ +/// Represents a physical type: a unique in-memory representation. +/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum DictionaryIndexType { + /// A signed 8-bit integer. + Int8, + /// A signed 16-bit integer. + Int16, + /// A signed 32-bit integer. + Int32, + /// A signed 64-bit integer. + Int64, + /// An unsigned 8-bit integer. + UInt8, + /// An unsigned 16-bit integer. + UInt16, + /// An unsigned 32-bit integer. + UInt32, + /// An unsigned 64-bit integer. + UInt64, +} + +/// Represents a physical type: a unique in-memory representation of an Arrow array. +/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and +/// a one-to-one mapping with all structs in this crate that implement [`crate::array::Array`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum PhysicalType { + Null, + /// A boolean datatype representing the values `true` and `false`. + Boolean, + /// A signed 8-bit integer. + Int8, + /// A signed 16-bit integer. 
+ Int16, + /// A signed 32-bit integer. + Int32, + /// A signed 64-bit integer. + Int64, + /// A signed 128-bit integer. + Int128, + /// An unsigned 8-bit integer. + UInt8, + /// An unsigned 16-bit integer. + UInt16, + /// An unsigned 32-bit integer. + UInt32, + /// An unsigned 64-bit integer. + UInt64, + /// A 32-bit floating point number. + Float32, + /// A 64-bit floating point number. + Float64, + /// Two i32 representing days and ms + DaysMs, + /// Opaque binary data of variable length. + Binary, + /// Opaque binary data of fixed size. + /// Enum parameter specifies the number of bytes per value. + FixedSizeBinary, + /// Opaque binary data of variable length and 64-bit offsets. + LargeBinary, + /// A variable-length string in Unicode with UTF-8 encoding. + Utf8, + /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets. + LargeUtf8, + /// A list of some logical data type with variable length. + List, + /// A list of some logical data type with fixed length. + FixedSizeList, + /// A list of some logical data type with variable length and 64-bit offsets. + LargeList, + /// A nested datatype that contains a number of sub-fields. + Struct, + /// A nested datatype that can represent slots of differing types. + /// Third argument represents sparseness + Union, + /// A dictionary encoded array (`key_type`, `value_type`), where + /// each array element is an index of `key_type` into an + /// associated dictionary of `value_type`. + /// + /// Dictionary arrays are used to store columns of `value_type` + /// that contain many repeated values using less memory, but with + /// a higher CPU overhead for some operations. + /// + /// This type is mostly used to represent low cardinality string + /// arrays or a limited set of primitive types as integers.
+ Dictionary(DictionaryIndexType), +} diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 611160f525c..66702a336ef 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -21,10 +21,7 @@ use super::ffi::ArrowArrayRef; use crate::array::{BooleanArray, FromFfi}; use crate::error::{ArrowError, Result}; use crate::types::days_ms; -use crate::{ - array::*, - datatypes::{DataType, IntervalUnit}, -}; +use crate::{array::*, datatypes::PhysicalType}; /// Reads a valid `ffi` interface into a `Box` /// # Errors @@ -32,52 +29,39 @@ use crate::{ /// * the data type is not supported /// * the interface is not valid (e.g. a null pointer) pub fn try_from(array: A) -> Result> { - let array: Box = match array.field().data_type() { - DataType::Boolean => Box::new(BooleanArray::try_from_ffi(array)?), - DataType::Int8 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::Int16 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - Box::new(PrimitiveArray::::try_from_ffi(array)?) - } - DataType::Interval(IntervalUnit::DayTime) => { - Box::new(PrimitiveArray::::try_from_ffi(array)?) 
- } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::Decimal(_, _) => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::UInt8 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::UInt16 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::UInt32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::UInt64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::Float16 => unreachable!(), - DataType::Float32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::Float64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DataType::Utf8 => Box::new(Utf8Array::::try_from_ffi(array)?), - DataType::LargeUtf8 => Box::new(Utf8Array::::try_from_ffi(array)?), - DataType::Binary => Box::new(BinaryArray::::try_from_ffi(array)?), - DataType::LargeBinary => Box::new(BinaryArray::::try_from_ffi(array)?), - DataType::List(_) => Box::new(ListArray::::try_from_ffi(array)?), - DataType::LargeList(_) => Box::new(ListArray::::try_from_ffi(array)?), - DataType::Struct(_) => Box::new(StructArray::try_from_ffi(array)?), - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + use PhysicalType::*; + Ok(match array.field().data_type().to_physical_type() { + Boolean => Box::new(BooleanArray::try_from_ffi(array)?), + Int8 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Int16 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Int32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + DaysMs => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Int64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Int128 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + UInt8 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + UInt16 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + UInt32 => 
Box::new(PrimitiveArray::::try_from_ffi(array)?), + UInt64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Float32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Float64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Utf8 => Box::new(Utf8Array::::try_from_ffi(array)?), + LargeUtf8 => Box::new(Utf8Array::::try_from_ffi(array)?), + Binary => Box::new(BinaryArray::::try_from_ffi(array)?), + LargeBinary => Box::new(BinaryArray::::try_from_ffi(array)?), + List => Box::new(ListArray::::try_from_ffi(array)?), + LargeList => Box::new(ListArray::::try_from_ffi(array)?), + Struct => Box::new(StructArray::try_from_ffi(array)?), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { Box::new(DictionaryArray::<$T>::try_from_ffi(array)?) }) } - DataType::Union(_, _, _) => Box::new(UnionArray::try_from_ffi(array)?), + Union => Box::new(UnionArray::try_from_ffi(array)?), data_type => { return Err(ArrowError::NotYetImplemented(format!( - "Reading DataType \"{}\" is not yet supported.", + "Importing PhysicalType \"{:?}\" is not yet supported.", data_type ))) } - }; - - Ok(array) + }) } diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index 865b79059e6..386fd980fc5 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -11,7 +11,7 @@ use std::{ use gen::Schema::MetadataVersion; -use crate::datatypes::{DataType, IntervalUnit}; +use crate::datatypes::{DataType, PhysicalType}; use crate::error::Result; use crate::io::ipc::gen::Message::BodyCompression; use crate::{array::*, types::days_ms}; @@ -31,12 +31,13 @@ pub fn read( compression: Option, version: MetadataVersion, ) -> Result> { - match data_type { - DataType::Null => { + use PhysicalType::*; + match data_type.to_physical_type() { + Null => { let array = read_null(field_nodes, data_type); Ok(Arc::new(array)) } - DataType::Boolean => read_boolean( + Boolean => read_boolean( field_nodes, data_type, buffers, @@ -45,7 +46,7 
@@ pub fn read( is_little_endian, ) .map(|x| Arc::new(x) as Arc), - DataType::Int8 => read_primitive::( + Int8 => read_primitive::( field_nodes, data_type, buffers, @@ -55,7 +56,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Int16 => read_primitive::( + Int16 => read_primitive::( field_nodes, data_type, buffers, @@ -65,10 +66,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => read_primitive::( + Int32 => read_primitive::( field_nodes, data_type, buffers, @@ -78,11 +76,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => read_primitive::( + Int64 => read_primitive::( field_nodes, data_type, buffers, @@ -92,7 +86,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Decimal(_, _) => read_primitive::( + Int128 => read_primitive::( field_nodes, data_type, buffers, @@ -102,7 +96,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Interval(IntervalUnit::DayTime) => read_primitive::( + DaysMs => read_primitive::( field_nodes, data_type, buffers, @@ -112,7 +106,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::UInt8 => read_primitive::( + UInt8 => read_primitive::( field_nodes, data_type, buffers, @@ -122,7 +116,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::UInt16 => read_primitive::( + UInt16 => read_primitive::( field_nodes, data_type, buffers, @@ -132,7 +126,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::UInt32 => read_primitive::( + UInt32 => read_primitive::( field_nodes, data_type, buffers, @@ -142,7 +136,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::UInt64 => read_primitive::( + UInt64 => read_primitive::( field_nodes, data_type, 
buffers, @@ -152,8 +146,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Float16 => unreachable!(), - DataType::Float32 => read_primitive::( + Float32 => read_primitive::( field_nodes, data_type, buffers, @@ -163,7 +156,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Float64 => read_primitive::( + Float64 => read_primitive::( field_nodes, data_type, buffers, @@ -173,7 +166,7 @@ pub fn read( compression, ) .map(|x| Arc::new(x) as Arc), - DataType::Binary => { + Binary => { let array = read_binary::( field_nodes, data_type, @@ -185,7 +178,7 @@ pub fn read( )?; Ok(Arc::new(array)) } - DataType::LargeBinary => { + LargeBinary => { let array = read_binary::( field_nodes, data_type, @@ -197,7 +190,7 @@ pub fn read( )?; Ok(Arc::new(array)) } - DataType::FixedSizeBinary(_) => { + FixedSizeBinary => { let array = read_fixed_size_binary( field_nodes, data_type, @@ -209,7 +202,7 @@ pub fn read( )?; Ok(Arc::new(array)) } - DataType::Utf8 => { + Utf8 => { let array = read_utf8::( field_nodes, data_type, @@ -221,7 +214,7 @@ pub fn read( )?; Ok(Arc::new(array)) } - DataType::LargeUtf8 => { + LargeUtf8 => { let array = read_utf8::( field_nodes, data_type, @@ -233,7 +226,7 @@ pub fn read( )?; Ok(Arc::new(array)) } - DataType::List(_) => read_list::( + List => read_list::( field_nodes, data_type, buffers, @@ -244,7 +237,7 @@ pub fn read( version, ) .map(|x| Arc::new(x) as Arc), - DataType::LargeList(_) => read_list::( + LargeList => read_list::( field_nodes, data_type, buffers, @@ -255,7 +248,7 @@ pub fn read( version, ) .map(|x| Arc::new(x) as Arc), - DataType::FixedSizeList(_, _) => read_fixed_size_list( + FixedSizeList => read_fixed_size_list( field_nodes, data_type, buffers, @@ -266,7 +259,7 @@ pub fn read( version, ) .map(|x| Arc::new(x) as Arc), - DataType::Struct(_) => read_struct( + Struct => read_struct( field_nodes, data_type, buffers, @@ -277,8 +270,8 @@ pub fn read( version, ) .map(|x| Arc::new(x) as Arc), - 
DataType::Dictionary(ref key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { read_dictionary::<$T, _>( field_nodes, buffers, @@ -289,7 +282,7 @@ pub fn read( .map(|x| Arc::new(x) as Arc) }) } - DataType::Union(_, _, _) => read_union( + Union => read_union( field_nodes, data_type, buffers, @@ -308,36 +301,20 @@ pub fn skip( data_type: &DataType, buffers: &mut VecDeque<&gen::Schema::Buffer>, ) { - match data_type { - DataType::Null => skip_null(field_nodes), - DataType::Boolean => skip_boolean(field_nodes, buffers), - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(_) - | DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Decimal(_, _) - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float32 - | DataType::Float16 - | DataType::Float64 => skip_primitive(field_nodes, buffers), - DataType::LargeBinary | DataType::Binary => skip_binary(field_nodes, buffers), - DataType::LargeUtf8 | DataType::Utf8 => skip_utf8(field_nodes, buffers), - DataType::FixedSizeBinary(_) => skip_fixed_size_binary(field_nodes, buffers), - DataType::List(_) => skip_list::(field_nodes, data_type, buffers), - DataType::LargeList(_) => skip_list::(field_nodes, data_type, buffers), - DataType::FixedSizeList(_, _) => skip_fixed_size_list(field_nodes, data_type, buffers), - DataType::Struct(_) => skip_struct(field_nodes, data_type, buffers), - DataType::Dictionary(_, _) => skip_dictionary(field_nodes, buffers), - DataType::Union(_, _, _) => skip_union(field_nodes, data_type, buffers), + use PhysicalType::*; + match data_type.to_physical_type() { + Null => skip_null(field_nodes), + Boolean => skip_boolean(field_nodes, buffers), + Int8 | Int16 | Int32 | Int64 | Int128 | UInt8 | UInt16 | UInt32 | 
UInt64 | Float32 + | Float64 | DaysMs => skip_primitive(field_nodes, buffers), + LargeBinary | Binary => skip_binary(field_nodes, buffers), + LargeUtf8 | Utf8 => skip_utf8(field_nodes, buffers), + FixedSizeBinary => skip_fixed_size_binary(field_nodes, buffers), + List => skip_list::(field_nodes, data_type, buffers), + LargeList => skip_list::(field_nodes, data_type, buffers), + FixedSizeList => skip_fixed_size_list(field_nodes, data_type, buffers), + Struct => skip_struct(field_nodes, data_type, buffers), + Dictionary(_) => skip_dictionary(field_nodes, buffers), + Union => skip_union(field_nodes, data_type, buffers), } } diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index b238e1b88d7..ba121564b35 100644 --- a/src/io/ipc/write/serialize.rs +++ b/src/io/ipc/write/serialize.rs @@ -18,7 +18,7 @@ use crate::{ array::*, bitmap::Bitmap, - datatypes::{DataType, IntervalUnit}, + datatypes::{DataType, PhysicalType}, endianess::is_native_little_endian, io::ipc::gen::Message, trusted_len::TrustedLen, @@ -346,79 +346,36 @@ pub fn write( array.len() as i64, array.null_count() as i64, )); - match array.data_type() { - DataType::Null => (), - DataType::Boolean => write_boolean(array, buffers, arrow_data, offset, is_little_endian), - DataType::Int8 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Int16 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Decimal(_, _) => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - 
DataType::Interval(IntervalUnit::DayTime) => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::UInt8 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::UInt16 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::UInt32 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::UInt64 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Float16 => unreachable!(), - DataType::Float32 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Float64 => { - write_primitive::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::Binary => { - write_binary::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::LargeBinary => { - write_binary::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::FixedSizeBinary(_) => { + use PhysicalType::*; + match array.data_type().to_physical_type() { + Null => (), + Boolean => write_boolean(array, buffers, arrow_data, offset, is_little_endian), + Int8 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Int16 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Int32 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Int64 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Int128 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + DaysMs => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + UInt8 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + UInt16 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + UInt32 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + UInt64 => write_primitive::(array, buffers, arrow_data, 
offset, is_little_endian), + Float32 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Float64 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Binary => write_binary::(array, buffers, arrow_data, offset, is_little_endian), + LargeBinary => write_binary::(array, buffers, arrow_data, offset, is_little_endian), + FixedSizeBinary => { write_fixed_size_binary(array, buffers, arrow_data, offset, is_little_endian) } - DataType::Utf8 => write_utf8::(array, buffers, arrow_data, offset, is_little_endian), - DataType::LargeUtf8 => { - write_utf8::(array, buffers, arrow_data, offset, is_little_endian) - } - DataType::List(_) => { - write_list::(array, buffers, arrow_data, nodes, offset, is_little_endian) - } - DataType::LargeList(_) => { - write_list::(array, buffers, arrow_data, nodes, offset, is_little_endian) - } - DataType::FixedSizeList(_, _) => { + Utf8 => write_utf8::(array, buffers, arrow_data, offset, is_little_endian), + LargeUtf8 => write_utf8::(array, buffers, arrow_data, offset, is_little_endian), + List => write_list::(array, buffers, arrow_data, nodes, offset, is_little_endian), + LargeList => write_list::(array, buffers, arrow_data, nodes, offset, is_little_endian), + FixedSizeList => { write_fixed_size_list(array, buffers, arrow_data, nodes, offset, is_little_endian) } - DataType::Struct(_) => { - write_struct(array, buffers, arrow_data, nodes, offset, is_little_endian) - } - DataType::Dictionary(_, _) => { + Struct => write_struct(array, buffers, arrow_data, nodes, offset, is_little_endian), + Dictionary(_) => { write_dictionary( array, buffers, @@ -429,7 +386,7 @@ pub fn write( true, ); } - DataType::Union(_, _, _) => { + Union => { write_union(array, buffers, arrow_data, nodes, offset, is_little_endian); } } diff --git a/src/types/mod.rs b/src/types/mod.rs index 3a30ee7801d..31b2996aadc 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -15,7 +15,7 @@ mod index; pub mod simd; pub use index::*; 
-use crate::datatypes::{DataType, IntervalUnit, TimeUnit}; +use crate::datatypes::{DataType, IntervalUnit, PhysicalType}; /// Trait denoting anything that has a natural logical [`DataType`]. /// For example, [`DataType::Int32`] for `i32`. @@ -35,7 +35,8 @@ macro_rules! create_relation { unsafe impl Relation for $native_ty { #[inline] fn is_valid(data_type: &DataType) -> bool { - matches!(data_type, $($impl_pattern)|+) + let physical_type = data_type.to_physical_type(); + matches!(physical_type, $($impl_pattern)|+) } } }; @@ -127,34 +128,17 @@ natural_type!(f64, DataType::Float64); natural_type!(days_ms, DataType::Interval(IntervalUnit::DayTime)); natural_type!(i128, DataType::Decimal(32, 32)); // users should set the decimal when creating an array -create_relation!(u8, &DataType::UInt8); -create_relation!(u16, &DataType::UInt16); -create_relation!(u32, &DataType::UInt32); -create_relation!(u64, &DataType::UInt64); -create_relation!(i8, &DataType::Int8); -create_relation!(i16, &DataType::Int16); -create_relation!( - i32, - &DataType::Int32 - | &DataType::Date32 - | &DataType::Time32(TimeUnit::Millisecond) - | &DataType::Time32(TimeUnit::Second) - | &DataType::Interval(IntervalUnit::YearMonth) -); - -create_relation!( - i64, - &DataType::Int64 - | &DataType::Date64 - | &DataType::Time64(TimeUnit::Microsecond) - | &DataType::Time64(TimeUnit::Nanosecond) - | &DataType::Timestamp(_, _) - | &DataType::Duration(_) -); - -create_relation!(i128, &DataType::Decimal(_, _)); -create_relation!(f32, &DataType::Float32); -create_relation!(f64, &DataType::Float64); +create_relation!(u8, PhysicalType::UInt8); +create_relation!(u16, PhysicalType::UInt16); +create_relation!(u32, PhysicalType::UInt32); +create_relation!(u64, PhysicalType::UInt64); +create_relation!(i8, PhysicalType::Int8); +create_relation!(i16, PhysicalType::Int16); +create_relation!(i32, PhysicalType::Int32); +create_relation!(i64, PhysicalType::Int64); +create_relation!(i128, PhysicalType::Int128); 
+create_relation!(f32, PhysicalType::Float32); +create_relation!(f64, PhysicalType::Float64); /// The in-memory representation of the DayMillisecond variant of arrow's "Interval" logical type. #[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash)] @@ -217,7 +201,7 @@ unsafe impl NativeType for days_ms { } } -create_relation!(days_ms, &DataType::Interval(IntervalUnit::DayTime)); +create_relation!(days_ms, PhysicalType::DaysMs); impl days_ms { /// A new [`days_ms`]. From 1343b1d30bae9b9d5e0eea23898f77a9603dfad4 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 30 Aug 2021 12:29:30 +0000 Subject: [PATCH 08/12] migrated. --- src/compute/aggregate/memory.rs | 43 ++++++++++++--------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 8ec120da5c1..1174a1d4967 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -1,6 +1,6 @@ use crate::array::*; use crate::bitmap::Bitmap; -use crate::datatypes::{DataType, IntervalUnit}; +use crate::datatypes::PhysicalType; use crate::types::days_ms; fn validity_size(validity: &Option) -> usize { @@ -50,8 +50,8 @@ macro_rules! dyn_dict { /// /// FFI buffers are included in this estimation. 
pub fn estimated_bytes_size(array: &dyn Array) -> usize { - use DataType::*; - match array.data_type() { + use PhysicalType::*; + match array.data_type().to_physical_type() { Null => 0, Boolean => { let array = array.as_any().downcast_ref::().unwrap(); @@ -59,21 +59,18 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { } Int8 => dyn_primitive!(array, i8), Int16 => dyn_primitive!(array, i16), - Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => { - dyn_primitive!(array, i32) - } - Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => dyn_primitive!(array, i64), + Int32 => dyn_primitive!(array, i32), + Int64 => dyn_primitive!(array, i64), + Int128 => dyn_primitive!(array, i128), + DaysMs => dyn_primitive!(array, days_ms), UInt8 => dyn_primitive!(array, u16), UInt16 => dyn_primitive!(array, u16), UInt32 => dyn_primitive!(array, u32), UInt64 => dyn_primitive!(array, u64), - Float16 => unreachable!(), Float32 => dyn_primitive!(array, f32), Float64 => dyn_primitive!(array, f64), - Decimal(_, _) => dyn_primitive!(array, i128), - Interval(IntervalUnit::DayTime) => dyn_primitive!(array, days_ms), Binary => dyn_binary!(array, BinaryArray, i32), - FixedSizeBinary(_) => { + FixedSizeBinary => { let array = array .as_any() .downcast_ref::() @@ -83,23 +80,23 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { LargeBinary => dyn_binary!(array, BinaryArray, i64), Utf8 => dyn_binary!(array, Utf8Array, i32), LargeUtf8 => dyn_binary!(array, Utf8Array, i64), - List(_) => { + List => { let array = array.as_any().downcast_ref::>().unwrap(); estimated_bytes_size(array.values().as_ref()) + array.offsets().len() * std::mem::size_of::() + validity_size(array.validity()) } - FixedSizeList(_, _) => { + FixedSizeList => { let array = array.as_any().downcast_ref::>().unwrap(); estimated_bytes_size(array.values().as_ref()) + validity_size(array.validity()) } - LargeList(_) => { + LargeList => { let array = array.as_any().downcast_ref::>().unwrap(); 
estimated_bytes_size(array.values().as_ref()) + array.offsets().len() * std::mem::size_of::() + validity_size(array.validity()) } - Struct(_) => { + Struct => { let array = array.as_any().downcast_ref::().unwrap(); array .values() @@ -109,7 +106,7 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { .sum::() + validity_size(array.validity()) } - Union(_, _, _) => { + Union => { let array = array.as_any().downcast_ref::().unwrap(); let types = array.types().len() * std::mem::size_of::(); let offsets = array @@ -125,17 +122,9 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { .sum::(); types + offsets + fields } - Dictionary(keys, _) => match keys.as_ref() { - Int8 => dyn_dict!(array, i8), - Int16 => dyn_dict!(array, i16), - Int32 => dyn_dict!(array, i32), - Int64 => dyn_dict!(array, i64), - UInt8 => dyn_dict!(array, u8), - UInt16 => dyn_dict!(array, u16), - UInt32 => dyn_dict!(array, u32), - UInt64 => dyn_dict!(array, u64), - _ => unreachable!(), - }, + Dictionary(key_type) => with_match_physical_dictionary_key_type!(key_type, |$T| { + dyn_dict!(array, $T) + }), } } From c4c34fbfd9605119003d1d8cfc42f0d63d94df7c Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 30 Aug 2021 14:44:36 +0000 Subject: [PATCH 09/12] Migrate. 
--- src/array/binary/mod.rs | 2 +- src/array/boolean/mod.rs | 8 ++- src/array/boolean/mutable.rs | 5 +- src/array/fixed_size_binary/mod.rs | 9 ++- src/array/fixed_size_list/mod.rs | 18 +++--- src/array/growable/mod.rs | 70 ++++++++++-------------- src/array/list/mod.rs | 14 ++--- src/array/struct_.rs | 2 +- src/array/utf8/mod.rs | 6 +- src/array/utf8/mutable.rs | 24 ++------ src/datatypes/mod.rs | 6 +- src/datatypes/physical_type.rs | 33 ++++------- src/io/ipc/read/array/fixed_size_list.rs | 6 +- src/types/mod.rs | 5 +- 14 files changed, 93 insertions(+), 115 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index fa336d2dc8c..a268d010957 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -56,7 +56,7 @@ impl BinaryArray { assert_eq!(offsets.len() - 1, validity.len()); } - if data_type != Self::default_data_type() { + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary") } diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index d9aecc80acc..8c1207e3902 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -1,4 +1,7 @@ -use crate::{bitmap::Bitmap, datatypes::DataType}; +use crate::{ + bitmap::Bitmap, + datatypes::{DataType, PhysicalType}, +}; use super::{display_fmt, Array}; @@ -40,6 +43,9 @@ impl BooleanArray { if let Some(ref validity) = validity { assert_eq!(values.len(), validity.len()); } + if data_type.to_physical_type() != PhysicalType::Boolean { + panic!("BooleanArray can only be initialized with DataType::Boolean") + } Self { data_type, values, diff --git a/src/array/boolean/mutable.rs b/src/array/boolean/mutable.rs index 4be152a3515..8ab6d01faa1 100644 --- a/src/array/boolean/mutable.rs +++ b/src/array/boolean/mutable.rs @@ -4,7 +4,7 @@ use std::sync::Arc; use crate::{ array::{Array, MutableArray, TryExtend, TryPush}, bitmap::MutableBitmap, - 
datatypes::DataType, + datatypes::{DataType, PhysicalType}, error::Result, trusted_len::TrustedLen, }; @@ -74,6 +74,9 @@ impl MutableBooleanArray { values: MutableBitmap, validity: Option, ) -> Self { + if data_type.to_physical_type() != PhysicalType::Boolean { + panic!("MutableBooleanArray can only be initialized with DataType::Boolean") + } Self { data_type, values, diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 71a9a687513..2a91db09ce6 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -40,7 +40,7 @@ impl FixedSizeBinaryArray { Self { size, - data_type: DataType::FixedSizeBinary(size), + data_type, values, validity, offset: 0, @@ -97,10 +97,9 @@ impl FixedSizeBinaryArray { impl FixedSizeBinaryArray { pub(crate) fn get_size(data_type: &DataType) -> &i32 { - if let DataType::FixedSizeBinary(size) = data_type { - size - } else { - panic!("Wrong DataType") + match data_type { + DataType::FixedSizeBinary(size) => size, + _ => panic!("Wrong DataType"), } } } diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 4147de5b6da..6b5e1e4e7b1 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -26,13 +26,18 @@ pub struct FixedSizeListArray { impl FixedSizeListArray { /// Returns a new empty [`FixedSizeListArray`]. pub fn new_empty(data_type: DataType) -> Self { - let values = new_empty_array(Self::get_child_and_size(&data_type).0.clone()).into(); + let values = + new_empty_array(Self::get_child_and_size(&data_type).0.data_type().clone()).into(); Self::from_data(data_type, values, None) } /// Returns a new null [`FixedSizeListArray`]. 
pub fn new_null(data_type: DataType, length: usize) -> Self { - let values = new_null_array(Self::get_child_and_size(&data_type).0.clone(), length).into(); + let values = new_null_array( + Self::get_child_and_size(&data_type).0.data_type().clone(), + length, + ) + .into(); Self::from_data(data_type, values, Some(Bitmap::new_zeroed(length))) } @@ -88,11 +93,10 @@ impl FixedSizeListArray { } impl FixedSizeListArray { - pub(crate) fn get_child_and_size(data_type: &DataType) -> (&DataType, &i32) { - if let DataType::FixedSizeList(field, size) = data_type { - (field.data_type(), size) - } else { - panic!("Wrong DataType") + pub(crate) fn get_child_and_size(data_type: &DataType) -> (&Field, &i32) { + match data_type { + DataType::FixedSizeList(child, size) => (child.as_ref(), size), + _ => panic!("Wrong DataType"), } } diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 25993dfdb4a..4a67b90a02f 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -96,9 +96,10 @@ pub fn make_growable<'a>( let data_type = arrays[0].data_type(); assert!(arrays.iter().all(|&item| item.data_type() == data_type)); - match data_type { - DataType::Null => Box::new(null::GrowableNull::new(data_type.clone())), - DataType::Boolean => { + use PhysicalType::*; + match data_type.to_physical_type() { + Null => Box::new(null::GrowableNull::new(data_type.clone())), + Boolean => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -109,33 +110,19 @@ pub fn make_growable<'a>( capacity, )) } - DataType::Int8 => dyn_growable!(i8, arrays, use_validity, capacity), - DataType::Int16 => dyn_growable!(i16, arrays, use_validity, capacity), - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - dyn_growable!(i32, arrays, use_validity, capacity) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - 
dyn_growable!(i64, arrays, use_validity, capacity) - } - DataType::Interval(IntervalUnit::DayTime) => { - dyn_growable!(days_ms, arrays, use_validity, capacity) - } - DataType::Decimal(_, _) => dyn_growable!(i128, arrays, use_validity, capacity), - DataType::UInt8 => dyn_growable!(u8, arrays, use_validity, capacity), - DataType::UInt16 => dyn_growable!(u16, arrays, use_validity, capacity), - DataType::UInt32 => dyn_growable!(u32, arrays, use_validity, capacity), - DataType::UInt64 => dyn_growable!(u64, arrays, use_validity, capacity), - DataType::Float16 => unreachable!(), - DataType::Float32 => dyn_growable!(f32, arrays, use_validity, capacity), - DataType::Float64 => dyn_growable!(f64, arrays, use_validity, capacity), - DataType::Utf8 => { + Int8 => dyn_growable!(i8, arrays, use_validity, capacity), + Int16 => dyn_growable!(i16, arrays, use_validity, capacity), + Int32 => dyn_growable!(i32, arrays, use_validity, capacity), + Int64 => dyn_growable!(i64, arrays, use_validity, capacity), + Int128 => dyn_growable!(i128, arrays, use_validity, capacity), + DaysMs => dyn_growable!(days_ms, arrays, use_validity, capacity), + UInt8 => dyn_growable!(u8, arrays, use_validity, capacity), + UInt16 => dyn_growable!(u16, arrays, use_validity, capacity), + UInt32 => dyn_growable!(u32, arrays, use_validity, capacity), + UInt64 => dyn_growable!(u64, arrays, use_validity, capacity), + Float32 => dyn_growable!(f32, arrays, use_validity, capacity), + Float64 => dyn_growable!(f64, arrays, use_validity, capacity), + Utf8 => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -146,7 +133,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::LargeUtf8 => { + LargeUtf8 => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -157,7 +144,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::Binary => { + Binary => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -168,7 +155,7 @@ 
pub fn make_growable<'a>( capacity, )) } - DataType::LargeBinary => { + LargeBinary => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -179,7 +166,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::FixedSizeBinary(_) => { + FixedSizeBinary => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -190,8 +177,7 @@ pub fn make_growable<'a>( capacity, )) } - - DataType::List(_) => { + List => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -202,7 +188,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::LargeList(_) => { + LargeList => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -213,7 +199,7 @@ pub fn make_growable<'a>( capacity, )) } - DataType::Struct(_) => { + Struct => { let arrays = arrays .iter() .map(|array| array.as_any().downcast_ref().unwrap()) @@ -224,10 +210,10 @@ pub fn make_growable<'a>( capacity, )) } - DataType::FixedSizeList(_, _) => todo!(), - DataType::Union(_, _, _) => todo!(), - DataType::Dictionary(key_type, _) => { - with_match_dictionary_key_type!(key_type.as_ref(), |$T| { + FixedSizeList => todo!(), + Union => todo!(), + Dictionary(key_type) => { + with_match_physical_dictionary_key_type!(key_type, |$T| { dyn_dict_growable!($T, arrays, use_validity, capacity) }) } diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 8e767dda0a8..95808e2c191 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -133,15 +133,15 @@ impl ListArray { #[inline] pub fn get_child_field(data_type: &DataType) -> &Field { if O::is_large() { - if let DataType::LargeList(child) = data_type { - child.as_ref() - } else { - panic!("Wrong DataType") + match data_type { + DataType::LargeList(child) => child.as_ref(), + _ => panic!("Wrong DataType"), } - } else if let DataType::List(child) = data_type { - child.as_ref() } else { - panic!("Wrong DataType") + match data_type { + 
DataType::List(child) => child.as_ref(), + _ => panic!("Wrong DataType"), + } } } diff --git a/src/array/struct_.rs b/src/array/struct_.rs index 488df1097d5..a18ee3bdd56 100644 --- a/src/array/struct_.rs +++ b/src/array/struct_.rs @@ -24,7 +24,7 @@ use super::{ffi::ToFfi, new_empty_array, new_null_array, Array, FromFfi}; /// Field::new("c", DataType::Int32, false), /// ]; /// -/// let array = StructArray::from_data(fields, vec![boolean, int], None); +/// let array = StructArray::from_data(DataType::Struct(fields), vec![boolean, int], None); /// ``` #[derive(Debug, Clone)] pub struct StructArray { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 2e022b51439..a5cb62b57a2 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -72,7 +72,7 @@ impl Utf8Array { assert_eq!(offsets.len() - 1, validity.len()); } - if data_type != Self::default_data_type() { + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") } @@ -105,6 +105,10 @@ impl Utf8Array { ) -> Self { check_offsets(&offsets, values.len()); + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") + } + Self { data_type, offsets, diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index fc7eb61c898..80e1e292ae6 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -46,11 +46,7 @@ impl MutableUtf8Array { let mut offsets = MutableBuffer::::new(); offsets.push(O::default()); Self { - data_type: if O::is_large() { - DataType::LargeUtf8 - } else { - DataType::Utf8 - }, + data_type: Self::default_data_type(), offsets, values: MutableBuffer::::new(), validity: None, @@ -73,10 +69,8 @@ impl MutableUtf8Array { if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } - if O::is_large() { - 
assert_eq!(data_type, DataType::LargeUtf8) - } else { - assert_eq!(data_type, DataType::Utf8) + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") } Self { data_type, @@ -103,10 +97,8 @@ impl MutableUtf8Array { if let Some(ref validity) = validity { assert_eq!(offsets.len() - 1, validity.len()); } - if O::is_large() { - assert_eq!(data_type, DataType::LargeUtf8) - } else { - assert_eq!(data_type, DataType::Utf8) + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + panic!("MutableUtf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8") } Self { data_type, @@ -117,11 +109,7 @@ impl MutableUtf8Array { } fn default_data_type() -> DataType { - if O::is_large() { - DataType::LargeUtf8 - } else { - DataType::Utf8 - } + Utf8Array::::default_data_type() } /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots. diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index c955b274b23..01e5d03c2cb 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -4,7 +4,7 @@ mod physical_type; mod schema; pub use field::Field; -pub(crate) use physical_type::*; +pub use physical_type::*; pub use schema::Schema; /// The set of datatypes that are supported by this implementation of Apache Arrow. @@ -172,8 +172,8 @@ impl DataType { } } - /// Returns the physical type of the logical type - pub(crate) fn to_physical_type(&self) -> PhysicalType { + /// the [`PhysicalType`] of this [`DataType`]. + pub fn to_physical_type(&self) -> PhysicalType { use DataType::*; match self { Null => PhysicalType::Null, diff --git a/src/datatypes/physical_type.rs b/src/datatypes/physical_type.rs index 701bf237018..6c5f47fa7b8 100644 --- a/src/datatypes/physical_type.rs +++ b/src/datatypes/physical_type.rs @@ -1,5 +1,4 @@ -/// Represents a physical type: a unique in-memory representation. 
-/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`]. +/// the set of valid indices used to index a dictionary-encoded Array. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum DictionaryIndexType { /// A signed 8-bit integer. @@ -20,13 +19,14 @@ pub enum DictionaryIndexType { UInt64, } -/// Represents a physical type: a unique in-memory representation of an Arrow array. +/// The set of physical types: unique in-memory representations of an Arrow array. /// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and -/// a one-to-one mapping with all structs in this crate that implement [`crate::array::Array`]. +/// a one-to-one mapping with each struct in this crate that implements [`crate::array::Array`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum PhysicalType { + /// A Null with no allocation. Null, - /// A boolean datatype representing the values `true` and `false`. + /// A boolean represented as a single bit. Boolean, /// A signed 8-bit integer. Int8, @@ -55,7 +55,6 @@ pub enum PhysicalType { /// Opaque binary data of variable length. Binary, /// Opaque binary data of fixed size. - /// Enum parameter specifies the number of bytes per value. FixedSizeBinary, /// Opaque binary data of variable length and 64-bit offsets. LargeBinary, @@ -63,26 +62,16 @@ pub enum PhysicalType { Utf8, /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. LargeUtf8, - /// A list of some logical data type with variable length. + /// A list of some data type with variable length. List, - /// A list of some logical data type with fixed length. + /// A list of some data type with fixed length. FixedSizeList, - /// A list of some logical data type with variable length and 64-bit offsets. + /// A list of some data type with variable length and 64-bit offsets. LargeList, - /// A nested datatype that contains a number of sub-fields. 
+ /// A nested type that contains an arbitrary number of fields. Struct, - /// A nested datatype that can represent slots of differing types. - /// Third argument represents sparsness + /// A nested type that represents slots of differing types. Union, - /// A dictionary encoded array (`key_type`, `value_type`), where - /// each array element is an index of `key_type` into an - /// associated dictionary of `value_type`. - /// - /// Dictionary arrays are used to store columns of `value_type` - /// that contain many repeated values using less memory, but with - /// a higher CPU overhead for some operations. - /// - /// This type mostly used to represent low cardinality string - /// arrays or a limited set of primitive types as integers. + /// A dictionary encoded array by `DictionaryIndexType`. Dictionary(DictionaryIndexType), } diff --git a/src/io/ipc/read/array/fixed_size_list.rs b/src/io/ipc/read/array/fixed_size_list.rs index 8fb9b45cbcd..3527665fcc0 100644 --- a/src/io/ipc/read/array/fixed_size_list.rs +++ b/src/io/ipc/read/array/fixed_size_list.rs @@ -37,7 +37,7 @@ pub fn read_fixed_size_list( let values = read( field_nodes, - value_data_type.clone(), + value_data_type.data_type().clone(), buffers, reader, block_offset, @@ -57,7 +57,7 @@ pub fn skip_fixed_size_list( let _ = buffers.pop_front().unwrap(); - let (data_type, _) = FixedSizeListArray::get_child_and_size(data_type); + let (field, _) = FixedSizeListArray::get_child_and_size(data_type); - skip(field_nodes, data_type, buffers) + skip(field_nodes, field.data_type(), buffers) } diff --git a/src/types/mod.rs b/src/types/mod.rs index 31b2996aadc..fe030d73424 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -31,12 +31,11 @@ pub unsafe trait Relation { } macro_rules! 
create_relation { - ($native_ty:ty, $($impl_pattern:pat)|+) => { + ($native_ty:ty, $physical_ty:expr) => { unsafe impl Relation for $native_ty { #[inline] fn is_valid(data_type: &DataType) -> bool { - let physical_type = data_type.to_physical_type(); - matches!(physical_type, $($impl_pattern)|+) + data_type.to_physical_type() == $physical_ty } } }; From 5f86cc7100f951b553cef9318f292bafec17cec5 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 31 Aug 2021 09:37:46 +0000 Subject: [PATCH 10/12] Updated Guide. --- guide/src/high_level.md | 185 ++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 102 deletions(-) diff --git a/guide/src/high_level.md b/guide/src/high_level.md index 8d6e93aded8..9cc1c86f0e6 100644 --- a/guide/src/high_level.md +++ b/guide/src/high_level.md @@ -4,59 +4,75 @@ The simplest way to think about an arrow `Array` is that it represents `Vec>` and has a logical type (see [metadata](../metadata.md))) associated with it. Probably the simplest array in this crate is `PrimitiveArray`. It can be constructed -from an iterator as follows: +from a slice as follows: + +```rust +# use arrow2::array::{Array, PrimitiveArray}; +# fn main() { +let array = PrimitiveArray::from([Some(1), None, Some(123)]); +assert_eq!(array.len(), 3) +# } +``` + +from a slice of values, + +```rust +# use arrow2::array::{Array, PrimitiveArray}; +# fn main() { +let array = PrimitiveArray::from_slice([1, 0, 123]); +assert_eq!(array.len(), 3) +# } +``` + +or from an iterator ```rust # use arrow2::array::{Array, PrimitiveArray}; -# use arrow2::datatypes::DataType; # fn main() { -let array = [Some(1), None, Some(123)] - .iter() - .collect::>(); +let array: PrimitiveArray = [Some(1), None, Some(123)].iter().collect(); assert_eq!(array.len(), 3) # } ``` A `PrimitiveArray` has 3 components: -1. A physical type (`i32`) +1. A physical type (e.g. `i32`) 2. A logical type (e.g. `DataType::Int32`) 3. 
Data The main differences from a `Vec>` are: -* Its data is laid out in memory as a `Buffer` and an `Option`. -* It has an associated logical datatype. +* Its data is laid out in memory as a `Buffer<T>` and an `Option<Bitmap>` (see [low-level API](../low_level.md)) +* It has an associated logical type (`DataType`). -The first difference allows interoperability with Arrow's ecosystem and efficient SIMD operations (we will re-visit this below); the second difference is that it gives semantic meaning to the array. In the example +The first allows interoperability with Arrow's ecosystem and efficient SIMD operations +(we will re-visit this below); the second gives semantic meaning to the array. +In the example ```rust # use arrow2::array::PrimitiveArray; # use arrow2::datatypes::DataType; # fn main() { let ints = PrimitiveArray::::from([Some(1), None]); -let dates = PrimitiveArray::::from([Some(1), None]); +let dates = PrimitiveArray::<i32>::from([Some(1), None]).to(DataType::Date32); # } ``` -`ints` and `dates` have the same in-memory representation but different logic representations (e.g. dates are usually represented as a string). +`ints` and `dates` have the same in-memory representation but different logical +representations (e.g. dates are usually printed to users as "yyyy-mm-dd"). -Some physical types (e.g. `i32`) have a "natural" logical `DataType` (e.g. `DataType::Int32`). -These types support a more compact notation: +All physical types (e.g. `i32`) have a "natural" logical `DataType` (e.g. `DataType::Int32`) +which is assigned when allocating arrays from iterators, slices, etc. 
```rust # use arrow2::array::{Array, Int32Array, PrimitiveArray}; # use arrow2::datatypes::DataType; # fn main() { -/// Int32Array = PrimitiveArray -let array = [Some(1), None, Some(123)].iter().collect::(); -assert_eq!(array.len(), 3); -let array = Int32Array::from(&[Some(1), None, Some(123)]); -assert_eq!(array.len(), 3); -let array = Int32Array::from_slice(&[1, 123]); -assert_eq!(array.len(), 2); +let array = PrimitiveArray::from_slice([1, 0, 123]); +assert_eq!(array.data_type(), &DataType::Int32); # } ``` +Arrays can be cheaply converted to another logical type via `.to(DataType)`. The following arrays are supported: @@ -68,7 +84,8 @@ The following arrays are supported: * `FixedSizeBinaryArray` (like `BinaryArray`, but fixed size) * `ListArray` and `ListArray` (nested arrays) * `FixedSizeListArray` (nested arrays of fixed size) -* `StructArray` (when each row has different logical types) +* `StructArray` (every row has multiple logical types) +* `UnionArray` (every row has a different logical type) * `DictionaryArray` (nested array with encoded values) ## Dynamic Array @@ -78,107 +95,71 @@ implement the trait `Array` and can be cast to `&dyn Array`, i.e. they can be tu a trait object. This enables arrays to have types that are dynamic in nature. ```rust -# use std::sync::Arc; # use arrow2::array::{Array, PrimitiveArray}; -# use arrow2::datatypes::DataType; # fn main() { -let data = vec![ - Some(vec![Some(1i32), Some(2), Some(3)]), - None, - Some(vec![Some(4), None, Some(6)]), -]; - let a = PrimitiveArray::::from(&[Some(1), None]); let a: &dyn Array = &a; # } ``` -Note how we have not specified the inner type explicitly in the signature `ListArray`. -Instead, `ListArray` has an inner `Array` representing all its values (available via `.values()`). 
- ### Downcast and `as_any` -Given a trait object `&dyn Array`, we know its logical type via `Array::data_type()` and can use it to downcast the array to its concrete type: +Given a trait object `array: &dyn Array`, we know its physical type via +`array.data_type().to_physical_type()`, which we use to downcast the array +to its concrete type: ```rust # use arrow2::array::{Array, PrimitiveArray}; -# use arrow2::datatypes::DataType; +# use arrow2::datatypes::PhysicalType; # fn main() { -let array = [Some(1), None, Some(123)] - .iter() - .collect::>(); +let array = PrimitiveArray::<i32>::from(&[Some(1), None]); let array = &array as &dyn Array; -let array = array.as_any().downcast_ref::>().unwrap(); +match array.data_type().to_physical_type() { + PhysicalType::Int32 => { + let array = array.as_any().downcast_ref::<PrimitiveArray<i32>>().unwrap(); + let values: &[i32] = array.values(); + assert_eq!(values, &[1, 0]); + } + _ => todo!() +} # } ``` There is a many-to-one relationship between `DataType` and an Array (i.e. a physical representation). 
The relationship is the following: -| `DataType` | `PhysicalType` | -|-----------------------|---------------------------| -| `UInt8` | `PrimitiveArray` | -| `UInt16` | `PrimitiveArray` | -| `UInt32` | `PrimitiveArray` | -| `UInt64` | `PrimitiveArray` | -| `Int8` | `PrimitiveArray` | -| `Int16` | `PrimitiveArray` | -| `Int32` | `PrimitiveArray` | -| `Int64` | `PrimitiveArray` | -| `Float32` | `PrimitiveArray` | -| `Float64` | `PrimitiveArray` | -| `Decimal(_,_)` | `PrimitiveArray` | -| `Date32` | `PrimitiveArray` | -| `Date64` | `PrimitiveArray` | -| `Time32(_)` | `PrimitiveArray` | -| `Time64(_)` | `PrimitiveArray` | -| `Timestamp(_,_)` | `PrimitiveArray` | -| `Interval(YearMonth)` | `PrimitiveArray` | -| `Interval(DayTime)` | `PrimitiveArray` | -| `Duration(_)` | `PrimitiveArray` | -| `Binary` | `BinaryArray` | -| `LargeBinary` | `BinaryArray` | -| `Utf8` | `Utf8Array` | -| `LargeUtf8` | `Utf8Array` | -| `List` | `ListArray` | -| `LargeList` | `ListArray` | -| `FixedSizeBinary(_)` | `FixedSizeBinaryArray` | -| `FixedSizeList(_,_)` | `FixedSizeListArray` | -| `Struct(_)` | `StructArray` | -| `Union(_,_,_)` | `UnionArray` | -| `Dictionary(UInt8,_)` | `DictionaryArray` | -| `Dictionary(UInt16,_)`| `DictionaryArray` | -| `Dictionary(UInt32,_)`| `DictionaryArray` | -| `Dictionary(UInt64,_)`| `DictionaryArray` | -| `Dictionary(Int8,_)` | `DictionaryArray` | -| `Dictionary(Int16,_)` | `DictionaryArray` | -| `Dictionary(Int32,_)` | `DictionaryArray` | -| `Dictionary(Int64,_)` | `DictionaryArray` | - -In this context, a common pattern to write operators that receive `&dyn Array` is: - -```rust -use arrow2::datatypes::DataType; -use arrow2::array::{Array, PrimitiveArray}; - -fn float_operator(array: &dyn Array) -> Result, String> { - match array.data_type() { - DataType::Float32 => { - let array = array.as_any().downcast_ref::>().unwrap(); - // let array = f32-specific operator - let array = array.clone(); - Ok(Box::new(array)) - } - DataType::Float64 => { - let array = 
array.as_any().downcast_ref::>().unwrap(); - // let array = f64-specific operator - let array = array.clone(); - Ok(Box::new(array)) - } - _ => Err("This operator is only valid for float point.".to_string()), - } -} -``` +| `PhysicalType` | `Array` | +|----------------------|---------------------------| +| `UInt8` | `PrimitiveArray<u8>` | +| `UInt16` | `PrimitiveArray<u16>` | +| `UInt32` | `PrimitiveArray<u32>` | +| `UInt64` | `PrimitiveArray<u64>` | +| `Int8` | `PrimitiveArray<i8>` | +| `Int16` | `PrimitiveArray<i16>` | +| `Int32` | `PrimitiveArray<i32>` | +| `Int64` | `PrimitiveArray<i64>` | +| `Int128` | `PrimitiveArray<i128>` | +| `Float32` | `PrimitiveArray<f32>` | +| `Float64` | `PrimitiveArray<f64>` | +| `DaysMs` | `PrimitiveArray<days_ms>` | +| `Binary` | `BinaryArray<i32>` | +| `LargeBinary` | `BinaryArray<i64>` | +| `Utf8` | `Utf8Array<i32>` | +| `LargeUtf8` | `Utf8Array<i64>` | +| `List` | `ListArray<i32>` | +| `LargeList` | `ListArray<i64>` | +| `FixedSizeBinary` | `FixedSizeBinaryArray` | +| `FixedSizeList` | `FixedSizeListArray` | +| `Struct` | `StructArray` | +| `Union` | `UnionArray` | +| `Dictionary(UInt8)` | `DictionaryArray<u8>` | +| `Dictionary(UInt16)` | `DictionaryArray<u16>` | +| `Dictionary(UInt32)` | `DictionaryArray<u32>` | +| `Dictionary(UInt64)` | `DictionaryArray<u64>` | +| `Dictionary(Int8)` | `DictionaryArray<i8>` | +| `Dictionary(Int16)` | `DictionaryArray<i16>` | +| `Dictionary(Int32)` | `DictionaryArray<i32>` | +| `Dictionary(Int64)` | `DictionaryArray<i64>` | ## From Iterator From cd9600e589f3981968ed6a5a529c6540562c5c7d Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 31 Aug 2021 10:09:09 +0000 Subject: [PATCH 11/12] Migrated to new Enum. 
--- src/array/equal/mod.rs | 61 +-------- src/array/ffi.rs | 15 +-- src/array/growable/mod.rs | 15 +-- src/array/mod.rs | 81 +++++------- src/compute/aggregate/memory.rs | 31 ++--- src/compute/filter.rs | 81 ++---------- src/compute/take/mod.rs | 39 +----- src/datatypes/mod.rs | 28 ++-- src/datatypes/physical_type.rs | 22 ++-- src/ffi/array.rs | 15 +-- src/io/ipc/read/array/fixed_size_list.rs | 1 + src/io/ipc/read/array/list.rs | 1 + src/io/ipc/read/array/struct_.rs | 1 + src/io/ipc/read/array/union.rs | 1 + src/io/ipc/read/common.rs | 1 + src/io/ipc/read/deserialize.rs | 136 ++------------------ src/io/ipc/write/serialize.rs | 15 +-- src/io/parquet/read/binary/dictionary.rs | 1 + src/io/parquet/read/primitive/dictionary.rs | 1 + src/types/mod.rs | 26 ++-- 20 files changed, 137 insertions(+), 435 deletions(-) diff --git a/src/array/equal/mod.rs b/src/array/equal/mod.rs index aff33cfd23d..32612a1fe07 100644 --- a/src/array/equal/mod.rs +++ b/src/array/equal/mod.rs @@ -173,66 +173,11 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool { let rhs = rhs.as_any().downcast_ref().unwrap(); boolean::equal(lhs, rhs) } - UInt8 => { + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - UInt16 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - UInt32 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - UInt64 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Int8 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Int16 => { - let lhs = 
lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Int32 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Int64 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Int128 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - DaysMs => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Float32 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } - Float64 => { - let lhs = lhs.as_any().downcast_ref().unwrap(); - let rhs = rhs.as_any().downcast_ref().unwrap(); - primitive::equal::(lhs, rhs) - } + primitive::equal::<$T>(lhs, rhs) + }), Utf8 => { let lhs = lhs.as_any().downcast_ref().unwrap(); let rhs = rhs.as_any().downcast_ref().unwrap(); diff --git a/src/array/ffi.rs b/src/array/ffi.rs index a921550fb0d..7368f45420b 100644 --- a/src/array/ffi.rs +++ b/src/array/ffi.rs @@ -45,18 +45,9 @@ pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren { match array.data_type().to_physical_type() { Null => ffi_dyn!(array, NullArray), Boolean => ffi_dyn!(array, BooleanArray), - Int8 => ffi_dyn!(array, PrimitiveArray), - Int16 => ffi_dyn!(array, PrimitiveArray), - Int32 => ffi_dyn!(array, PrimitiveArray), - DaysMs => ffi_dyn!(array, PrimitiveArray), - Int64 => ffi_dyn!(array, PrimitiveArray), - Int128 => ffi_dyn!(array, PrimitiveArray), - UInt8 => ffi_dyn!(array, PrimitiveArray), - UInt16 => ffi_dyn!(array, PrimitiveArray), - UInt32 => ffi_dyn!(array, PrimitiveArray), - UInt64 => ffi_dyn!(array, PrimitiveArray), - Float32 => ffi_dyn!(array, 
PrimitiveArray), - Float64 => ffi_dyn!(array, PrimitiveArray), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + ffi_dyn!(array, PrimitiveArray<$T>) + }), Binary => ffi_dyn!(array, BinaryArray), LargeBinary => ffi_dyn!(array, BinaryArray), FixedSizeBinary => ffi_dyn!(array, FixedSizeBinaryArray), diff --git a/src/array/growable/mod.rs b/src/array/growable/mod.rs index 4a67b90a02f..7f57b87a6f7 100644 --- a/src/array/growable/mod.rs +++ b/src/array/growable/mod.rs @@ -110,18 +110,9 @@ pub fn make_growable<'a>( capacity, )) } - Int8 => dyn_growable!(i8, arrays, use_validity, capacity), - Int16 => dyn_growable!(i16, arrays, use_validity, capacity), - Int32 => dyn_growable!(i32, arrays, use_validity, capacity), - Int64 => dyn_growable!(i64, arrays, use_validity, capacity), - Int128 => dyn_growable!(i128, arrays, use_validity, capacity), - DaysMs => dyn_growable!(days_ms, arrays, use_validity, capacity), - UInt8 => dyn_growable!(u8, arrays, use_validity, capacity), - UInt16 => dyn_growable!(u16, arrays, use_validity, capacity), - UInt32 => dyn_growable!(u32, arrays, use_validity, capacity), - UInt64 => dyn_growable!(u64, arrays, use_validity, capacity), - Float32 => dyn_growable!(f32, arrays, use_validity, capacity), - Float64 => dyn_growable!(f64, arrays, use_validity, capacity), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + dyn_growable!($T, arrays, use_validity, capacity) + }), Utf8 => { let arrays = arrays .iter() diff --git a/src/array/mod.rs b/src/array/mod.rs index 1b054ceb75a..3b23a377e36 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -181,24 +181,36 @@ macro_rules! with_match_physical_dictionary_key_type {( } })} +macro_rules! with_match_primitive_type {( + $key_type:expr, | $_:tt $T:ident | $($body:tt)* +) => ({ + macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )} + use crate::datatypes::PrimitiveType::*; + match $key_type { + Int8 => __with_ty__! { i8 }, + Int16 => __with_ty__! 
{ i16 }, + Int32 => __with_ty__! { i32 }, + Int64 => __with_ty__! { i64 }, + Int128 => __with_ty__! { i128 }, + DaysMs => __with_ty__! { days_ms }, + UInt8 => __with_ty__! { u8 }, + UInt16 => __with_ty__! { u16 }, + UInt32 => __with_ty__! { u32 }, + UInt64 => __with_ty__! { u64 }, + Float32 => __with_ty__! { f32 }, + Float64 => __with_ty__! { f64 }, + } +})} + impl Display for dyn Array { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use crate::datatypes::PhysicalType::*; match self.data_type().to_physical_type() { Null => fmt_dyn!(self, NullArray, f), Boolean => fmt_dyn!(self, BooleanArray, f), - Int8 => fmt_dyn!(self, PrimitiveArray, f), - Int16 => fmt_dyn!(self, PrimitiveArray, f), - Int32 => fmt_dyn!(self, PrimitiveArray, f), - DaysMs => fmt_dyn!(self, PrimitiveArray, f), - Int64 => fmt_dyn!(self, PrimitiveArray, f), - Int128 => fmt_dyn!(self, PrimitiveArray, f), - UInt8 => fmt_dyn!(self, PrimitiveArray, f), - UInt16 => fmt_dyn!(self, PrimitiveArray, f), - UInt32 => fmt_dyn!(self, PrimitiveArray, f), - UInt64 => fmt_dyn!(self, PrimitiveArray, f), - Float32 => fmt_dyn!(self, PrimitiveArray, f), - Float64 => fmt_dyn!(self, PrimitiveArray, f), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + fmt_dyn!(self, PrimitiveArray<$T>, f) + }), Binary => fmt_dyn!(self, BinaryArray, f), LargeBinary => fmt_dyn!(self, BinaryArray, f), FixedSizeBinary => fmt_dyn!(self, FixedSizeBinaryArray, f), @@ -224,18 +236,9 @@ pub fn new_empty_array(data_type: DataType) -> Box { match data_type.to_physical_type() { Null => Box::new(NullArray::new_empty(data_type)), Boolean => Box::new(BooleanArray::new_empty(data_type)), - Int8 => Box::new(PrimitiveArray::::new_empty(data_type)), - Int16 => Box::new(PrimitiveArray::::new_empty(data_type)), - Int32 => Box::new(PrimitiveArray::::new_empty(data_type)), - DaysMs => Box::new(PrimitiveArray::::new_empty(data_type)), - Int64 => Box::new(PrimitiveArray::::new_empty(data_type)), - Int128 => 
Box::new(PrimitiveArray::::new_empty(data_type)), - UInt8 => Box::new(PrimitiveArray::::new_empty(data_type)), - UInt16 => Box::new(PrimitiveArray::::new_empty(data_type)), - UInt32 => Box::new(PrimitiveArray::::new_empty(data_type)), - UInt64 => Box::new(PrimitiveArray::::new_empty(data_type)), - Float32 => Box::new(PrimitiveArray::::new_empty(data_type)), - Float64 => Box::new(PrimitiveArray::::new_empty(data_type)), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + Box::new(PrimitiveArray::<$T>::new_empty(data_type)) + }), Binary => Box::new(BinaryArray::::new_empty(data_type)), LargeBinary => Box::new(BinaryArray::::new_empty(data_type)), FixedSizeBinary => Box::new(FixedSizeBinaryArray::new_empty(data_type)), @@ -262,18 +265,9 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box { match data_type.to_physical_type() { Null => Box::new(NullArray::new_null(data_type, length)), Boolean => Box::new(BooleanArray::new_null(data_type, length)), - Int8 => Box::new(PrimitiveArray::::new_null(data_type, length)), - Int16 => Box::new(PrimitiveArray::::new_null(data_type, length)), - Int32 => Box::new(PrimitiveArray::::new_null(data_type, length)), - DaysMs => Box::new(PrimitiveArray::::new_null(data_type, length)), - Int64 => Box::new(PrimitiveArray::::new_null(data_type, length)), - Int128 => Box::new(PrimitiveArray::::new_null(data_type, length)), - UInt8 => Box::new(PrimitiveArray::::new_null(data_type, length)), - UInt16 => Box::new(PrimitiveArray::::new_null(data_type, length)), - UInt32 => Box::new(PrimitiveArray::::new_null(data_type, length)), - UInt64 => Box::new(PrimitiveArray::::new_null(data_type, length)), - Float32 => Box::new(PrimitiveArray::::new_null(data_type, length)), - Float64 => Box::new(PrimitiveArray::::new_null(data_type, length)), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + Box::new(PrimitiveArray::<$T>::new_null(data_type, length)) + }), Binary => 
Box::new(BinaryArray::::new_null(data_type, length)), LargeBinary => Box::new(BinaryArray::::new_null(data_type, length)), FixedSizeBinary => Box::new(FixedSizeBinaryArray::new_null(data_type, length)), @@ -308,18 +302,9 @@ pub fn clone(array: &dyn Array) -> Box { match array.data_type().to_physical_type() { Null => clone_dyn!(array, NullArray), Boolean => clone_dyn!(array, BooleanArray), - Int8 => clone_dyn!(array, PrimitiveArray), - Int16 => clone_dyn!(array, PrimitiveArray), - Int32 => clone_dyn!(array, PrimitiveArray), - DaysMs => clone_dyn!(array, PrimitiveArray), - Int64 => clone_dyn!(array, PrimitiveArray), - Int128 => clone_dyn!(array, PrimitiveArray), - UInt8 => clone_dyn!(array, PrimitiveArray), - UInt16 => clone_dyn!(array, PrimitiveArray), - UInt32 => clone_dyn!(array, PrimitiveArray), - UInt64 => clone_dyn!(array, PrimitiveArray), - Float32 => clone_dyn!(array, PrimitiveArray), - Float64 => clone_dyn!(array, PrimitiveArray), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + clone_dyn!(array, PrimitiveArray<$T>) + }), Binary => clone_dyn!(array, BinaryArray), LargeBinary => clone_dyn!(array, BinaryArray), FixedSizeBinary => clone_dyn!(array, FixedSizeBinaryArray), diff --git a/src/compute/aggregate/memory.rs b/src/compute/aggregate/memory.rs index 1174a1d4967..1d1cfbe3cfe 100644 --- a/src/compute/aggregate/memory.rs +++ b/src/compute/aggregate/memory.rs @@ -7,17 +7,6 @@ fn validity_size(validity: &Option) -> usize { validity.as_ref().map(|b| b.as_slice().0.len()).unwrap_or(0) } -macro_rules! dyn_primitive { - ($array:expr, $ty:ty) => {{ - let array = $array - .as_any() - .downcast_ref::>() - .unwrap(); - - array.values().len() * std::mem::size_of::<$ty>() + validity_size(array.validity()) - }}; -} - macro_rules! 
dyn_binary { ($array:expr, $ty:ty, $o:ty) => {{ let array = $array.as_any().downcast_ref::<$ty>().unwrap(); @@ -57,18 +46,14 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize { let array = array.as_any().downcast_ref::().unwrap(); array.values().as_slice().0.len() + validity_size(array.validity()) } - Int8 => dyn_primitive!(array, i8), - Int16 => dyn_primitive!(array, i16), - Int32 => dyn_primitive!(array, i32), - Int64 => dyn_primitive!(array, i64), - Int128 => dyn_primitive!(array, i128), - DaysMs => dyn_primitive!(array, days_ms), - UInt8 => dyn_primitive!(array, u16), - UInt16 => dyn_primitive!(array, u16), - UInt32 => dyn_primitive!(array, u32), - UInt64 => dyn_primitive!(array, u64), - Float32 => dyn_primitive!(array, f32), - Float64 => dyn_primitive!(array, f64), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + array.values().len() * std::mem::size_of::<$T>() + validity_size(array.validity()) + }), Binary => dyn_binary!(array, BinaryArray, i32), FixedSizeBinary => { let array = array diff --git a/src/compute/filter.rs b/src/compute/filter.rs index 2e92ef0083e..54102d0e1e3 100644 --- a/src/compute/filter.rs +++ b/src/compute/filter.rs @@ -85,17 +85,6 @@ fn filter_growable<'a>(growable: &mut impl Growable<'a>, chunks: &[(usize, usize .for_each(|(start, len)| growable.extend(0, *start, *len)); } -macro_rules! dyn_filter { - ($ty:ty, $array:expr, $filter_count:expr, $chunks:expr) => {{ - let array = $array.as_any().downcast_ref().unwrap(); - let mut growable = - growable::GrowablePrimitive::<$ty>::new(vec![array], false, $filter_count); - filter_growable(&mut growable, &$chunks); - let array: PrimitiveArray<$ty> = growable.into(); - Box::new(array) - }}; -} - /// Returns a prepared function optimized to filter multiple arrays. 
/// Creating this function requires time, but using it is faster than [filter] when the /// same filter needs to be applied to multiple arrays (e.g. a multi-column `RecordBatch`). @@ -109,18 +98,14 @@ pub fn build_filter(filter: &BooleanArray) -> Result { use crate::datatypes::PhysicalType::*; Ok(Box::new(move |array: &dyn Array| { match array.data_type().to_physical_type() { - UInt8 => dyn_filter!(u8, array, filter_count, chunks), - UInt16 => dyn_filter!(u16, array, filter_count, chunks), - UInt32 => dyn_filter!(u32, array, filter_count, chunks), - UInt64 => dyn_filter!(u64, array, filter_count, chunks), - Int8 => dyn_filter!(i8, array, filter_count, chunks), - Int16 => dyn_filter!(i16, array, filter_count, chunks), - Int32 => dyn_filter!(i32, array, filter_count, chunks), - Int64 => dyn_filter!(i64, array, filter_count, chunks), - Int128 => dyn_filter!(i128, array, filter_count, chunks), - DaysMs => dyn_filter!(days_ms, array, filter_count, chunks), - Float32 => dyn_filter!(f32, array, filter_count, chunks), - Float64 => dyn_filter!(f64, array, filter_count, chunks), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + let array = array.as_any().downcast_ref().unwrap(); + let mut growable = + growable::GrowablePrimitive::<$T>::new(vec![array], false, filter_count); + filter_growable(&mut growable, &chunks); + let array: PrimitiveArray<$T> = growable.into(); + Box::new(array) + }), Utf8 => { let array = array.as_any().downcast_ref::>().unwrap(); let mut growable = growable::GrowableUtf8::new(vec![array], false, filter_count); @@ -166,54 +151,10 @@ pub fn build_filter(filter: &BooleanArray) -> Result { pub fn filter(array: &dyn Array, filter: &BooleanArray) -> Result> { use crate::datatypes::PhysicalType::*; match array.data_type().to_physical_type() { - UInt8 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - UInt16 => { - let array = array.as_any().downcast_ref().unwrap(); - 
Ok(Box::new(filter_primitive::(array, filter))) - } - UInt32 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - UInt64 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Int8 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Int16 => { + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Int32 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Int64 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Int128 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - DaysMs => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Float32 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } - Float64 => { - let array = array.as_any().downcast_ref().unwrap(); - Ok(Box::new(filter_primitive::(array, filter))) - } + Ok(Box::new(filter_primitive::<$T>(array, filter))) + }), _ => { let iter = SlicesIterator::new(filter.values()); let mut mutable = make_growable(&[array], false, iter.slots()); diff --git a/src/compute/take/mod.rs b/src/compute/take/mod.rs index 01296bc3df9..751ebfc9f34 100644 --- a/src/compute/take/mod.rs +++ b/src/compute/take/mod.rs @@ -33,26 +33,6 @@ mod primitive; mod structure; mod utf8; -macro_rules! 
downcast_take { - ($type: ty, $values: expr, $indices: expr) => {{ - let values = $values - .as_any() - .downcast_ref() - .expect("Unable to downcast to a primitive array"); - Ok(Box::new(primitive::take::<$type, _>(&values, $indices))) - }}; -} - -macro_rules! downcast_dict_take { - ($type: ty, $values: expr, $indices: expr) => {{ - let values = $values - .as_any() - .downcast_ref() - .expect("Unable to downcast to a primitive array"); - Ok(Box::new(dict::take::<$type, _>(&values, $indices))) - }}; -} - pub fn take(values: &dyn Array, indices: &PrimitiveArray) -> Result> { if indices.len() == 0 { return Ok(new_empty_array(values.data_type().clone())); @@ -68,18 +48,10 @@ pub fn take(values: &dyn Array, indices: &PrimitiveArray) -> Result let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(boolean::take::(values, indices))) } - Int8 => downcast_take!(i8, values, indices), - Int16 => downcast_take!(i16, values, indices), - Int32 => downcast_take!(i32, values, indices), - Int64 => downcast_take!(i64, values, indices), - Int128 => downcast_take!(i128, values, indices), - DaysMs => downcast_take!(days_ms, values, indices), - UInt8 => downcast_take!(u8, values, indices), - UInt16 => downcast_take!(u16, values, indices), - UInt32 => downcast_take!(u32, values, indices), - UInt64 => downcast_take!(u64, values, indices), - Float32 => downcast_take!(f32, values, indices), - Float64 => downcast_take!(f64, values, indices), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + let values = values.as_any().downcast_ref().unwrap(); + Ok(Box::new(primitive::take::<$T, _>(&values, indices))) + }), Utf8 => { let values = values.as_any().downcast_ref().unwrap(); Ok(Box::new(utf8::take::(values, indices))) @@ -98,7 +70,8 @@ pub fn take(values: &dyn Array, indices: &PrimitiveArray) -> Result } Dictionary(key_type) => { with_match_physical_dictionary_key_type!(key_type, |$T| { - downcast_dict_take!($T, values, indices) + let values = 
values.as_any().downcast_ref().unwrap(); + Ok(Box::new(dict::take::<$T, _>(&values, indices))) }) } Struct => { diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 01e5d03c2cb..fe39d1e03f2 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -178,19 +178,23 @@ impl DataType { match self { Null => PhysicalType::Null, Boolean => PhysicalType::Boolean, - Int8 => PhysicalType::Int8, - Int16 => PhysicalType::Int16, - Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => PhysicalType::Int32, - Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => PhysicalType::Int64, - Decimal(_, _) => PhysicalType::Int128, - UInt8 => PhysicalType::UInt8, - UInt16 => PhysicalType::UInt16, - UInt32 => PhysicalType::UInt32, - UInt64 => PhysicalType::UInt64, + Int8 => PhysicalType::Primitive(PrimitiveType::Int8), + Int16 => PhysicalType::Primitive(PrimitiveType::Int16), + Int32 | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => { + PhysicalType::Primitive(PrimitiveType::Int32) + } + Int64 | Date64 | Timestamp(_, _) | Time64(_) | Duration(_) => { + PhysicalType::Primitive(PrimitiveType::Int64) + } + Decimal(_, _) => PhysicalType::Primitive(PrimitiveType::Int128), + UInt8 => PhysicalType::Primitive(PrimitiveType::UInt8), + UInt16 => PhysicalType::Primitive(PrimitiveType::UInt16), + UInt32 => PhysicalType::Primitive(PrimitiveType::UInt32), + UInt64 => PhysicalType::Primitive(PrimitiveType::UInt64), Float16 => unreachable!(), - Float32 => PhysicalType::Float32, - Float64 => PhysicalType::Float64, - Interval(IntervalUnit::DayTime) => PhysicalType::DaysMs, + Float32 => PhysicalType::Primitive(PrimitiveType::Float32), + Float64 => PhysicalType::Primitive(PrimitiveType::Float64), + Interval(IntervalUnit::DayTime) => PhysicalType::Primitive(PrimitiveType::DaysMs), Binary => PhysicalType::Binary, FixedSizeBinary(_) => PhysicalType::FixedSizeBinary, LargeBinary => PhysicalType::LargeBinary, diff --git a/src/datatypes/physical_type.rs 
b/src/datatypes/physical_type.rs index 6c5f47fa7b8..aff17e906d1 100644 --- a/src/datatypes/physical_type.rs +++ b/src/datatypes/physical_type.rs @@ -19,15 +19,8 @@ pub enum DictionaryIndexType { UInt64, } -/// The set of physical types: unique in-memory representations of an Arrow array. -/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and -/// a one-to-one mapping with each struct in this crate that implements [`crate::array::Array`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum PhysicalType { - /// A Null with no allocation. - Null, - /// A boolean represented as a single bit. - Boolean, +pub enum PrimitiveType { /// A signed 8-bit integer. Int8, /// A signed 16-bit integer. @@ -52,6 +45,19 @@ pub enum PhysicalType { Float64, /// Two i32 representing days and ms DaysMs, +} + +/// The set of physical types: unique in-memory representations of an Arrow array. +/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and +/// a one-to-one mapping with each struct in this crate that implements [`crate::array::Array`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum PhysicalType { + /// A Null with no allocation. + Null, + /// A boolean represented as a single bit. + Boolean, + /// An array where each slot has a known compile-time size. + Primitive(PrimitiveType), /// Opaque binary data of variable length. Binary, /// Opaque binary data of fixed size. 
diff --git a/src/ffi/array.rs b/src/ffi/array.rs index 66702a336ef..60af0df0fe2 100644 --- a/src/ffi/array.rs +++ b/src/ffi/array.rs @@ -32,18 +32,9 @@ pub fn try_from(array: A) -> Result> { use PhysicalType::*; Ok(match array.field().data_type().to_physical_type() { Boolean => Box::new(BooleanArray::try_from_ffi(array)?), - Int8 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - Int16 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - Int32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - DaysMs => Box::new(PrimitiveArray::::try_from_ffi(array)?), - Int64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - Int128 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - UInt8 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - UInt16 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - UInt32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - UInt64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - Float32 => Box::new(PrimitiveArray::::try_from_ffi(array)?), - Float64 => Box::new(PrimitiveArray::::try_from_ffi(array)?), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + Box::new(PrimitiveArray::<$T>::try_from_ffi(array)?) 
+ }), Utf8 => Box::new(Utf8Array::::try_from_ffi(array)?), LargeUtf8 => Box::new(Utf8Array::::try_from_ffi(array)?), Binary => Box::new(BinaryArray::::try_from_ffi(array)?), diff --git a/src/io/ipc/read/array/fixed_size_list.rs b/src/io/ipc/read/array/fixed_size_list.rs index 3527665fcc0..8aca4b7786a 100644 --- a/src/io/ipc/read/array/fixed_size_list.rs +++ b/src/io/ipc/read/array/fixed_size_list.rs @@ -12,6 +12,7 @@ use super::super::super::gen; use super::super::deserialize::{read, skip, Node}; use super::super::read_basic::*; +#[allow(clippy::too_many_arguments)] pub fn read_fixed_size_list( field_nodes: &mut VecDeque, data_type: DataType, diff --git a/src/io/ipc/read/array/list.rs b/src/io/ipc/read/array/list.rs index 61ff8b8612b..0f082750b46 100644 --- a/src/io/ipc/read/array/list.rs +++ b/src/io/ipc/read/array/list.rs @@ -14,6 +14,7 @@ use super::super::super::gen; use super::super::deserialize::{read, skip, Node}; use super::super::read_basic::*; +#[allow(clippy::too_many_arguments)] pub fn read_list( field_nodes: &mut VecDeque, data_type: DataType, diff --git a/src/io/ipc/read/array/struct_.rs b/src/io/ipc/read/array/struct_.rs index aa14105dbba..fd7c3652d8a 100644 --- a/src/io/ipc/read/array/struct_.rs +++ b/src/io/ipc/read/array/struct_.rs @@ -12,6 +12,7 @@ use super::super::super::gen; use super::super::deserialize::{read, skip, Node}; use super::super::read_basic::*; +#[allow(clippy::too_many_arguments)] pub fn read_struct( field_nodes: &mut VecDeque, data_type: DataType, diff --git a/src/io/ipc/read/array/union.rs b/src/io/ipc/read/array/union.rs index adaac0f13cd..32b0d938fe2 100644 --- a/src/io/ipc/read/array/union.rs +++ b/src/io/ipc/read/array/union.rs @@ -12,6 +12,7 @@ use super::super::super::gen; use super::super::deserialize::{read, skip, Node}; use super::super::read_basic::*; +#[allow(clippy::too_many_arguments)] pub fn read_union( field_nodes: &mut VecDeque, data_type: DataType, diff --git a/src/io/ipc/read/common.rs 
b/src/io/ipc/read/common.rs index e0524c558cd..92975c8bd24 100644 --- a/src/io/ipc/read/common.rs +++ b/src/io/ipc/read/common.rs @@ -92,6 +92,7 @@ impl<'a, A, I: Iterator> Iterator for ProjectionIter<'a, A, I> { /// Creates a record batch from binary data using the `ipc::RecordBatch` indexes and the `Schema` /// # Panic /// Panics iff the projection is not in increasing order (e.g. `[1, 0]` nor `[0, 1, 1]` are valid) +#[allow(clippy::too_many_arguments)] pub fn read_record_batch( batch: gen::Message::RecordBatch, schema: Arc, diff --git a/src/io/ipc/read/deserialize.rs b/src/io/ipc/read/deserialize.rs index 386fd980fc5..663ec064280 100644 --- a/src/io/ipc/read/deserialize.rs +++ b/src/io/ipc/read/deserialize.rs @@ -21,6 +21,7 @@ use super::array::*; pub type Node<'a> = (&'a gen::Message::FieldNode, &'a Option>); +#[allow(clippy::too_many_arguments)] pub fn read( field_nodes: &mut VecDeque, data_type: DataType, @@ -46,126 +47,18 @@ pub fn read( is_little_endian, ) .map(|x| Arc::new(x) as Arc), - Int8 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - Int16 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - Int32 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - Int64 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - Int128 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - DaysMs => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), 
- UInt8 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - UInt16 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - UInt32 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - UInt64 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - Float32 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), - Float64 => read_primitive::( - field_nodes, - data_type, - buffers, - reader, - block_offset, - is_little_endian, - compression, - ) - .map(|x| Arc::new(x) as Arc), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + read_primitive::<$T, _>( + field_nodes, + data_type, + buffers, + reader, + block_offset, + is_little_endian, + compression, + ) + .map(|x| Arc::new(x) as Arc) + }), Binary => { let array = read_binary::( field_nodes, @@ -305,8 +198,7 @@ pub fn skip( match data_type.to_physical_type() { Null => skip_null(field_nodes), Boolean => skip_boolean(field_nodes, buffers), - Int8 | Int16 | Int32 | Int64 | Int128 | UInt8 | UInt16 | UInt32 | UInt64 | Float32 - | Float64 | DaysMs => skip_primitive(field_nodes, buffers), + Primitive(_) => skip_primitive(field_nodes, buffers), LargeBinary | Binary => skip_binary(field_nodes, buffers), LargeUtf8 | Utf8 => skip_utf8(field_nodes, buffers), FixedSizeBinary => skip_fixed_size_binary(field_nodes, buffers), diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs index ba121564b35..220f7b28fc3 100644 --- a/src/io/ipc/write/serialize.rs +++ 
b/src/io/ipc/write/serialize.rs @@ -350,18 +350,9 @@ pub fn write( match array.data_type().to_physical_type() { Null => (), Boolean => write_boolean(array, buffers, arrow_data, offset, is_little_endian), - Int8 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - Int16 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - Int32 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - Int64 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - Int128 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - DaysMs => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - UInt8 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - UInt16 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - UInt32 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - UInt64 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - Float32 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), - Float64 => write_primitive::(array, buffers, arrow_data, offset, is_little_endian), + Primitive(primitive) => with_match_primitive_type!(primitive, |$T| { + write_primitive::<$T>(array, buffers, arrow_data, offset, is_little_endian) + }), Binary => write_binary::(array, buffers, arrow_data, offset, is_little_endian), LargeBinary => write_binary::(array, buffers, arrow_data, offset, is_little_endian), FixedSizeBinary => { diff --git a/src/io/parquet/read/binary/dictionary.rs b/src/io/parquet/read/binary/dictionary.rs index b0aa38f3f1e..3609fa05a1f 100644 --- a/src/io/parquet/read/binary/dictionary.rs +++ b/src/io/parquet/read/binary/dictionary.rs @@ -16,6 +16,7 @@ use crate::{ error::{ArrowError, Result}, }; +#[allow(clippy::too_many_arguments)] fn read_dict_optional( validity_buffer: &[u8], indices_buffer: &[u8], diff --git 
a/src/io/parquet/read/primitive/dictionary.rs b/src/io/parquet/read/primitive/dictionary.rs index 8d7c7de4bca..6cf2a5245c2 100644 --- a/src/io/parquet/read/primitive/dictionary.rs +++ b/src/io/parquet/read/primitive/dictionary.rs @@ -18,6 +18,7 @@ use crate::{ types::NativeType as ArrowNativeType, }; +#[allow(clippy::too_many_arguments)] fn read_dict_optional( validity_buffer: &[u8], indices_buffer: &[u8], diff --git a/src/types/mod.rs b/src/types/mod.rs index fe030d73424..61125a6282d 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -15,7 +15,7 @@ mod index; pub mod simd; pub use index::*; -use crate::datatypes::{DataType, IntervalUnit, PhysicalType}; +use crate::datatypes::{DataType, IntervalUnit, PhysicalType, PrimitiveType}; /// Trait denoting anything that has a natural logical [`DataType`]. /// For example, [`DataType::Int32`] for `i32`. @@ -127,17 +127,17 @@ natural_type!(f64, DataType::Float64); natural_type!(days_ms, DataType::Interval(IntervalUnit::DayTime)); natural_type!(i128, DataType::Decimal(32, 32)); // users should set the decimal when creating an array -create_relation!(u8, PhysicalType::UInt8); -create_relation!(u16, PhysicalType::UInt16); -create_relation!(u32, PhysicalType::UInt32); -create_relation!(u64, PhysicalType::UInt64); -create_relation!(i8, PhysicalType::Int8); -create_relation!(i16, PhysicalType::Int16); -create_relation!(i32, PhysicalType::Int32); -create_relation!(i64, PhysicalType::Int64); -create_relation!(i128, PhysicalType::Int128); -create_relation!(f32, PhysicalType::Float32); -create_relation!(f64, PhysicalType::Float64); +create_relation!(u8, PhysicalType::Primitive(PrimitiveType::UInt8)); +create_relation!(u16, PhysicalType::Primitive(PrimitiveType::UInt16)); +create_relation!(u32, PhysicalType::Primitive(PrimitiveType::UInt32)); +create_relation!(u64, PhysicalType::Primitive(PrimitiveType::UInt64)); +create_relation!(i8, PhysicalType::Primitive(PrimitiveType::Int8)); +create_relation!(i16, 
PhysicalType::Primitive(PrimitiveType::Int16)); +create_relation!(i32, PhysicalType::Primitive(PrimitiveType::Int32)); +create_relation!(i64, PhysicalType::Primitive(PrimitiveType::Int64)); +create_relation!(i128, PhysicalType::Primitive(PrimitiveType::Int128)); +create_relation!(f32, PhysicalType::Primitive(PrimitiveType::Float32)); +create_relation!(f64, PhysicalType::Primitive(PrimitiveType::Float64)); /// The in-memory representation of the DayMillisecond variant of arrow's "Interval" logical type. #[derive(Debug, Copy, Clone, Default, PartialEq, Eq, Hash)] @@ -200,7 +200,7 @@ unsafe impl NativeType for days_ms { } } -create_relation!(days_ms, PhysicalType::DaysMs); +create_relation!(days_ms, PhysicalType::Primitive(PrimitiveType::DaysMs)); impl days_ms { /// A new [`days_ms`]. From 082e615658f5a27dccee2c815d7bb80c4a5df778 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 31 Aug 2021 10:26:25 +0000 Subject: [PATCH 12/12] Updated guide. --- guide/src/high_level.md | 102 +++++++++++++++++++++------------------- guide/src/metadata.md | 14 +++--- 2 files changed, 61 insertions(+), 55 deletions(-) diff --git a/guide/src/high_level.md b/guide/src/high_level.md index 9cc1c86f0e6..2a9fb52e19f 100644 --- a/guide/src/high_level.md +++ b/guide/src/high_level.md @@ -9,7 +9,7 @@ from a slice as follows: ```rust # use arrow2::array::{Array, PrimitiveArray}; # fn main() { -let array = PrimitiveArray::from([Some(1), None, Some(123)]); +let array = PrimitiveArray::::from([Some(1), None, Some(123)]); assert_eq!(array.len(), 3) # } ``` @@ -19,7 +19,7 @@ from a slice of values, ```rust # use arrow2::array::{Array, PrimitiveArray}; # fn main() { -let array = PrimitiveArray::from_slice([1, 0, 123]); +let array = PrimitiveArray::::from_slice([1.0, 0.0, 123.0]); assert_eq!(array.len(), 3) # } ``` @@ -29,7 +29,7 @@ or from an iterator ```rust # use arrow2::array::{Array, PrimitiveArray}; # fn main() { -let array: PrimitiveArray = [Some(1), None, Some(123)].iter().collect(); 
+let array: PrimitiveArray = [Some(1), None, Some(123)].iter().collect(); assert_eq!(array.len(), 3) # } ``` @@ -68,7 +68,7 @@ which is assigned when allocating arrays from iterators, slices, etc. # use arrow2::array::{Array, Int32Array, PrimitiveArray}; # use arrow2::datatypes::DataType; # fn main() { -let array = PrimitiveArray::from_slice([1, 0, 123]); +let array = PrimitiveArray::::from_slice([1, 0, 123]); assert_eq!(array.data_type(), &DataType::Int32); # } ``` @@ -105,62 +105,65 @@ let a: &dyn Array = &a; ### Downcast and `as_any` Given a trait object `array: &dyn Array`, we know its physical type via -`array.data_type().to_physical_type()`, which we use to downcast the array +`PhysicalType: array.data_type().to_physical_type()`, which we use to downcast the array to its concrete type: ```rust # use arrow2::array::{Array, PrimitiveArray}; # use arrow2::datatypes::PhysicalType; # fn main() { -let a = PrimitiveArray::::from(&[Some(1), None]); +let array = PrimitiveArray::::from(&[Some(1), None]); let array = &array as &dyn Array; +// ... 
+let physical_type: PhysicalType = array.data_type().to_physical_type(); +# } +``` + +There is a one-to-one relationship between each variant of `PhysicalType` (an enum) and +each implementation of `Array` (a struct): + +| `PhysicalType` | `Array` | +|-------------------|------------------------| +| `Primitive(_)` | `PrimitiveArray<_>` | +| `Binary` | `BinaryArray` | +| `LargeBinary` | `BinaryArray` | +| `Utf8` | `Utf8Array` | +| `LargeUtf8` | `Utf8Array` | +| `List` | `ListArray` | +| `LargeList` | `ListArray` | +| `FixedSizeBinary` | `FixedSizeBinaryArray` | +| `FixedSizeList` | `FixedSizeListArray` | +| `Struct` | `StructArray` | +| `Union` | `UnionArray` | +| `Dictionary(_)` | `DictionaryArray<_>` | -match array.data_type().to_physical_type() { - PhysicalType::Int32 => { - let array = array.as_any().downcast_ref::>().unwrap(); - let values: &[i32] = array.values(); - assert_eq!(values, &[1, 0]); +where `_` represents each of the variants (e.g. `PrimitiveType::Int32 <-> i32`). + +In this context, a common idiom in using `Array` as a trait object is as follows: + +```rust +use arrow2::datatypes::{PhysicalType, PrimitiveType}; +use arrow2::array::{Array, PrimitiveArray}; + +fn float_operator(array: &dyn Array) -> Result, String> { + match array.data_type().to_physical_type() { + PhysicalType::Primitive(PrimitiveType::Float32) => { + let array = array.as_any().downcast_ref::>().unwrap(); + // let array = f32-specific operator + let array = array.clone(); + Ok(Box::new(array)) + } + PhysicalType::Primitive(PrimitiveType::Float64) => { + let array = array.as_any().downcast_ref::>().unwrap(); + // let array = f64-specific operator + let array = array.clone(); + Ok(Box::new(array)) + } + _ => Err("This operator is only valid for float point arrays".to_string()), } - _ => todo!() } -# } ``` -There is a many-to-one relationship between `DataType` and an Array (i.e. a physical representation).
The relationship is the following: - -| `PhysicalType` | `PhysicalType` | -|----------------------|---------------------------| -| `UInt8` | `PrimitiveArray` | -| `UInt16` | `PrimitiveArray` | -| `UInt32` | `PrimitiveArray` | -| `UInt64` | `PrimitiveArray` | -| `Int8` | `PrimitiveArray` | -| `Int16` | `PrimitiveArray` | -| `Int32` | `PrimitiveArray` | -| `Int64` | `PrimitiveArray` | -| `Int128` | `PrimitiveArray` | -| `Float32` | `PrimitiveArray` | -| `Float64` | `PrimitiveArray` | -| `DaysMs` | `PrimitiveArray` | -| `Binary` | `BinaryArray` | -| `LargeBinary` | `BinaryArray` | -| `Utf8` | `Utf8Array` | -| `LargeUtf8` | `Utf8Array` | -| `List` | `ListArray` | -| `LargeList` | `ListArray` | -| `FixedSizeBinary` | `FixedSizeBinaryArray` | -| `FixedSizeList` | `FixedSizeListArray` | -| `Struct` | `StructArray` | -| `Union` | `UnionArray` | -| `Dictionary(UInt8)` | `DictionaryArray` | -| `Dictionary(UInt16)` | `DictionaryArray` | -| `Dictionary(UInt32)` | `DictionaryArray` | -| `Dictionary(UInt64)` | `DictionaryArray` | -| `Dictionary(Int8)` | `DictionaryArray` | -| `Dictionary(Int16)` | `DictionaryArray` | -| `Dictionary(Int32)` | `DictionaryArray` | -| `Dictionary(Int64)` | `DictionaryArray` | - ## From Iterator In the examples above, we've introduced how to create an array from an iterator. @@ -218,7 +221,8 @@ bitwise operations, it is often more performant to operate on chunks of bits ins ## Vectorized operations One of the main advantages of the arrow format and its memory layout is that -it often enables SIMD. For example, an unary operation `op` on a `PrimitiveArray` is likely auto-vectorized on the following code: +it often enables SIMD. 
For example, a unary operation `op` on a `PrimitiveArray` +likely emits SIMD instructions for the following code: ```rust # use arrow2::buffer::Buffer; diff --git a/guide/src/metadata.md b/guide/src/metadata.md index c9009c67673..9ced2cdfeb1 100644 --- a/guide/src/metadata.md +++ b/guide/src/metadata.md @@ -12,16 +12,18 @@ semantical types defined in Arrow. In Arrow2, logical types are declared as variants of the `enum` `arrow2::datatypes::DataType`. For example, `DataType::Int32` represents a signed integer of 32 bits. -Each logical type has an associated in-memory physical representation and is associated to specific -semantics. For example, `Date32` has the same in-memory representation as `Int32`, but the value -represents the number of days since UNIX epoch. +Each `DataType` has an associated `enum PhysicalType` (many-to-one) representing the +particular in-memory representation, and is associated with specific semantics. +For example, both `DataType::Date32` and `DataType::Int32` have the same `PhysicalType` +(`PhysicalType::Primitive(PrimitiveType::Int32)`) but `Date32` represents the number of +days since UNIX epoch. -Logical types are metadata: they annotate arrays with extra information about in-memory data. +Logical types are metadata: they annotate physical types with extra information about data. ## `Field` (column metadata) -Besides logical types, the arrow format supports other relevant metadata to the format. All this -information is stored in `arrow2::datatypes::Field`. +Besides logical types, the arrow format supports other relevant metadata to the format. +All this information is stored in `arrow2::datatypes::Field`. A `Field` is arrow's metadata associated to a column in the context of a columnar format. It has a name, a logical type `DataType`, whether the column is nullable, etc.