From 346c431c4ab9a94b95fe3f683c5a767b1caab80c Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Sun, 25 Apr 2021 08:36:54 +0000
Subject: [PATCH] Added Scalar values.

---
 src/lib.rs           |   1 +
 src/scalar/README.md |  28 +++++
 src/scalar/mod.rs    | 245 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 274 insertions(+)
 create mode 100644 src/scalar/README.md
 create mode 100644 src/scalar/mod.rs

diff --git a/src/lib.rs b/src/lib.rs
index 59c97b73b74..8f1f989a62b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,6 +4,7 @@ pub mod bitmap;
 pub mod buffer;
 mod endianess;
 pub mod error;
+pub mod scalar;
 pub mod trusted_len;
 pub mod types;
 
diff --git a/src/scalar/README.md b/src/scalar/README.md
new file mode 100644
index 00000000000..0948317004b
--- /dev/null
+++ b/src/scalar/README.md
@@ -0,0 +1,28 @@
+# Scalar API
+
+Design choices:
+
+### `Scalar` is a trait object
+
+There are three reasons:
+
+* a scalar should have a small memory footprint, which an enum would not ensure given the different physical types available.
+* forward-compatibility: a new entry on an `enum` is backward-incompatible
+* do not expose implementation details to users (reduce the surface of the public API)
+
+### `Scalar` should contain nullability information
+
+This is to be aligned with the general notion of arrow's `Array`.
+
+This API is a companion to the `Array`, and follows the same design as `Array`.
+Specifically, a `Scalar` is a trait object that can be downcast to concrete implementations.
+
+Like `Array`, `Scalar` implements
+
+* `data_type`, which is used to perform the correct downcast
+* `is_valid`, to tell whether the scalar is null or not
+
+### There is one implementation per arrow's physical type
+
+* Reduces the number of `match` that users need to write
+* Allows casting of logical types without changing the underlying physical type
diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs
new file mode 100644
index 00000000000..b91be027d2a
--- /dev/null
+++ b/src/scalar/mod.rs
@@ -0,0 +1,292 @@
+use std::any::Any;
+
+use crate::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, types::NativeType};
+
+/// A single, possibly-null value that can be downcast to its concrete type
+/// through [`Scalar::as_any`].
+pub trait Scalar: std::fmt::Debug {
+    /// Returns `self` as [`Any`], to downcast it to a concrete scalar type.
+    fn as_any(&self) -> &dyn Any;
+
+    /// Returns `true` if this scalar holds a value and `false` if it is null.
+    fn is_valid(&self) -> bool;
+
+    /// Returns the [`DataType`] of this scalar.
+    fn data_type(&self) -> &DataType;
+
+    /// Returns a boxed array with `length` slots, each equal to this scalar.
+    fn to_boxed_array(&self, length: usize) -> Box<dyn Array>;
+}
+
+/// A [`Scalar`] of a native type `T`, stored together with its [`DataType`].
+#[derive(Debug, Clone)]
+pub struct PrimitiveScalar<T: NativeType> {
+    // Not Option<T> because this offers a stabler pointer offset on the struct
+    value: T,
+    is_valid: bool,
+    data_type: DataType,
+}
+
+impl<T: NativeType> PrimitiveScalar<T> {
+    /// Creates a scalar of type `data_type`; `None` results in a null scalar.
+    #[inline]
+    pub fn new(data_type: DataType, v: Option<T>) -> Self {
+        let is_valid = v.is_some();
+        Self {
+            value: v.unwrap_or_default(),
+            is_valid,
+            data_type,
+        }
+    }
+
+    /// Returns the stored value, which is `T::default()` for a null scalar.
+    #[inline]
+    pub fn value(&self) -> T {
+        self.value
+    }
+}
+
+impl<T: NativeType> Scalar for PrimitiveScalar<T> {
+    #[inline]
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    #[inline]
+    fn is_valid(&self) -> bool {
+        self.is_valid
+    }
+
+    #[inline]
+    fn data_type(&self) -> &DataType {
+        &self.data_type
+    }
+
+    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
+        if self.is_valid {
+            let values = Buffer::from_trusted_len_iter(std::iter::repeat(self.value).take(length));
+            Box::new(PrimitiveArray::from_data(
+                self.data_type.clone(),
+                values,
+                None,
+            ))
+        } else {
+            Box::new(PrimitiveArray::<T>::new_null(
+                self.data_type.clone(),
+                length,
+            ))
+        }
+    }
+}
+
+/// A [`Scalar`] holding a boolean.
+#[derive(Debug, Clone)]
+pub struct BooleanScalar {
+    value: bool,
+    is_valid: bool,
+}
+
+impl BooleanScalar {
+    /// Creates a boolean scalar; `None` results in a null scalar.
+    #[inline]
+    pub fn new(v: Option<bool>) -> Self {
+        let is_valid = v.is_some();
+        Self {
+            value: v.unwrap_or_default(),
+            is_valid,
+        }
+    }
+
+    /// Returns the stored value, which is `false` for a null scalar.
+    #[inline]
+    pub fn value(&self) -> bool {
+        self.value
+    }
+}
+
+impl Scalar for BooleanScalar {
+    #[inline]
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    #[inline]
+    fn is_valid(&self) -> bool {
+        self.is_valid
+    }
+
+    #[inline]
+    fn data_type(&self) -> &DataType {
+        &DataType::Boolean
+    }
+
+    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
+        if self.is_valid {
+            let values = Bitmap::from_trusted_len_iter(std::iter::repeat(self.value).take(length));
+            Box::new(BooleanArray::from_data(values, None))
+        } else {
+            Box::new(BooleanArray::new_null(length))
+        }
+    }
+}
+
+/// Returns the `length + 1` offsets `0, item_length, 2 * item_length, ...` of an
+/// array whose `length` items are all `item_length` bytes long.
+///
+/// # Panics
+/// Panics if `length * item_length` overflows `usize` or does not fit in `O`.
+fn repeated_offsets<O: Offset>(item_length: usize, length: usize) -> Buffer<O> {
+    // The last offset is the largest one: validating it once, with a checked
+    // multiplication, guarantees that no offset below can wrap around or fail.
+    let total = length.checked_mul(item_length).expect("Too large");
+    O::from_usize(total).expect("Too large");
+    let offsets = (0..=length).map(|i| O::from_usize(i * item_length).expect("Too large"));
+    // NOTE(review): presumably sound because a mapped range reports its exact
+    // length; confirm against the contract of `from_trusted_len_iter_unchecked`.
+    unsafe { Buffer::from_trusted_len_iter_unchecked(offsets) }
+}
+
+/// A [`Scalar`] holding a UTF-8 string, for arrays with offsets of type `O`.
+#[derive(Debug, Clone)]
+pub struct Utf8Scalar<O: Offset> {
+    value: Buffer<u8>,
+    is_valid: bool,
+    phantom: std::marker::PhantomData<O>,
+}
+
+impl<O: Offset> Utf8Scalar<O> {
+    /// Creates a string scalar; `None` results in a null scalar.
+    ///
+    /// # Panics
+    /// Panics if the length of the string, in bytes, does not fit in `O`.
+    #[inline]
+    pub fn new(v: Option<&str>) -> Self {
+        let is_valid = v.is_some();
+        O::from_usize(v.map(|x| x.len()).unwrap_or_default()).expect("Too large");
+        let value = Buffer::from(v.map(|x| x.as_bytes()).unwrap_or(&[]));
+        Self {
+            value,
+            is_valid,
+            phantom: std::marker::PhantomData,
+        }
+    }
+
+    /// Returns the stored string, which is empty for a null scalar.
+    #[inline]
+    pub fn value(&self) -> &str {
+        // SAFETY: `value` is only written by `new`, from the bytes of a `&str`
+        // (or from an empty slice), so it always holds valid UTF-8.
+        unsafe { std::str::from_utf8_unchecked(self.value.as_slice()) }
+    }
+}
+
+impl<O: Offset> Scalar for Utf8Scalar<O> {
+    #[inline]
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    #[inline]
+    fn is_valid(&self) -> bool {
+        self.is_valid
+    }
+
+    #[inline]
+    fn data_type(&self) -> &DataType {
+        if O::is_large() {
+            &DataType::LargeUtf8
+        } else {
+            &DataType::Utf8
+        }
+    }
+
+    /// # Panics
+    /// Panics if the total number of bytes (`length` times the length of the
+    /// value) does not fit in `O`.
+    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
+        if self.is_valid {
+            let offsets = repeated_offsets::<O>(self.value.len(), length);
+            let values = std::iter::repeat(self.value.as_slice())
+                .take(length)
+                .flatten()
+                .copied()
+                .collect();
+            Box::new(Utf8Array::<O>::from_data(offsets, values, None))
+        } else {
+            Box::new(Utf8Array::<O>::new_null(length))
+        }
+    }
+}
+
+/// A [`Scalar`] holding binary data, for arrays with offsets of type `O`.
+#[derive(Debug, Clone)]
+pub struct BinaryScalar<O: Offset> {
+    value: Buffer<u8>,
+    is_valid: bool,
+    phantom: std::marker::PhantomData<O>,
+}
+
+impl<O: Offset> BinaryScalar<O> {
+    /// Creates a binary scalar from the bytes of `v`; `None` results in a null scalar.
+    ///
+    /// NOTE(review): `v` is a `&str`, so values that are not valid UTF-8 cannot be
+    /// built through this constructor; consider also accepting `&[u8]`.
+    ///
+    /// # Panics
+    /// Panics if the length of the value, in bytes, does not fit in `O`.
+    #[inline]
+    pub fn new(v: Option<&str>) -> Self {
+        let is_valid = v.is_some();
+        O::from_usize(v.map(|x| x.len()).unwrap_or_default()).expect("Too large");
+        let value = Buffer::from(v.map(|x| x.as_bytes()).unwrap_or(&[]));
+        Self {
+            value,
+            is_valid,
+            phantom: std::marker::PhantomData,
+        }
+    }
+
+    /// Returns the stored bytes, which are empty for a null scalar.
+    #[inline]
+    pub fn value(&self) -> &[u8] {
+        self.value.as_slice()
+    }
+}
+
+impl<O: Offset> Scalar for BinaryScalar<O> {
+    #[inline]
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    #[inline]
+    fn is_valid(&self) -> bool {
+        self.is_valid
+    }
+
+    #[inline]
+    fn data_type(&self) -> &DataType {
+        if O::is_large() {
+            &DataType::LargeBinary
+        } else {
+            &DataType::Binary
+        }
+    }
+
+    /// # Panics
+    /// Panics if the total number of bytes (`length` times the length of the
+    /// value) does not fit in `O`.
+    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
+        if self.is_valid {
+            let offsets = repeated_offsets::<O>(self.value.len(), length);
+            let values = std::iter::repeat(self.value.as_slice())
+                .take(length)
+                .flatten()
+                .copied()
+                .collect();
+            Box::new(BinaryArray::<O>::from_data(offsets, values, None))
+        } else {
+            Box::new(BinaryArray::<O>::new_null(length))
+        }
+    }
+}