This repository has been archived by the owner on Feb 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 224
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
590fb9f
commit 346c431
Showing
3 changed files
with
274 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# Scalar API | ||
|
||
Design choices: | ||
|
||
### `Scalar` is a trait object | ||
|
||
There are three reasons: | ||
|
||
* a scalar should have a small memory footprint, which an enum would not ensure given the different physical types available. | ||
* forward-compatibility: adding a new variant to an `enum` is a backward-incompatible change | ||
* do not expose implementation details to users (reduce the surface of the public API) | ||
|
||
### `Scalar` should contain nullability information | ||
|
||
This is to be aligned with the general notion of arrow's `Array`. | ||
|
||
This API is a companion to the `Array`, and follows the same design as `Array`. | ||
Specifically, a `Scalar` is a trait object that can be downcast to concrete implementations. | ||
|
||
Like `Array`, `Scalar` implements | ||
|
||
* `data_type`, which is used to perform the correct downcast | ||
* `is_valid`, to tell whether the scalar is null or not | ||
|
||
### There is one implementation per Arrow physical type | ||
|
||
* Reduces the number of `match` that users need to write | ||
* Allows casting of logical types without changing the underlying physical type |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
use std::any::Any; | ||
|
||
use crate::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, types::NativeType}; | ||
|
||
pub trait Scalar: std::fmt::Debug { | ||
fn as_any(&self) -> &dyn Any; | ||
|
||
fn is_valid(&self) -> bool; | ||
|
||
fn data_type(&self) -> &DataType; | ||
|
||
fn to_boxed_array(&self, length: usize) -> Box<dyn Array>; | ||
} | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct PrimitiveScalar<T: NativeType> { | ||
// Not Option<T> because this offers a stabler pointer offset on the struct | ||
value: T, | ||
is_valid: bool, | ||
data_type: DataType, | ||
} | ||
|
||
impl<T: NativeType> PrimitiveScalar<T> { | ||
#[inline] | ||
pub fn new(data_type: DataType, v: Option<T>) -> Self { | ||
let is_valid = v.is_some(); | ||
Self { | ||
value: v.unwrap_or_default(), | ||
is_valid, | ||
data_type, | ||
} | ||
} | ||
|
||
#[inline] | ||
pub fn value(&self) -> T { | ||
self.value | ||
} | ||
} | ||
|
||
impl<T: NativeType> Scalar for PrimitiveScalar<T> { | ||
#[inline] | ||
fn as_any(&self) -> &dyn std::any::Any { | ||
self | ||
} | ||
|
||
#[inline] | ||
fn is_valid(&self) -> bool { | ||
self.is_valid | ||
} | ||
|
||
#[inline] | ||
fn data_type(&self) -> &DataType { | ||
&self.data_type | ||
} | ||
|
||
fn to_boxed_array(&self, length: usize) -> Box<dyn Array> { | ||
if self.is_valid { | ||
let values = Buffer::from_trusted_len_iter(std::iter::repeat(self.value).take(length)); | ||
Box::new(PrimitiveArray::from_data( | ||
self.data_type.clone(), | ||
values, | ||
None, | ||
)) | ||
} else { | ||
Box::new(PrimitiveArray::<T>::new_null( | ||
self.data_type.clone(), | ||
length, | ||
)) | ||
} | ||
} | ||
} | ||
|
||
/// A scalar holding an optional `bool`.
#[derive(Debug, Clone)]
pub struct BooleanScalar {
    // The payload; `false` when the scalar is null.
    value: bool,
    // `true` when the scalar was built from `Some(_)`.
    is_valid: bool,
}

impl BooleanScalar {
    /// Creates a new scalar.
    ///
    /// `Some(b)` yields a valid scalar holding `b`; `None` yields a null scalar
    /// whose stored value is `false`.
    #[inline]
    pub fn new(v: Option<bool>) -> Self {
        match v {
            Some(value) => Self {
                value,
                is_valid: true,
            },
            None => Self {
                value: bool::default(),
                is_valid: false,
            },
        }
    }

    /// Returns the stored value (`false` for a null scalar).
    #[inline]
    pub fn value(&self) -> bool {
        self.value
    }
}
|
||
impl Scalar for BooleanScalar { | ||
#[inline] | ||
fn as_any(&self) -> &dyn std::any::Any { | ||
self | ||
} | ||
|
||
#[inline] | ||
fn is_valid(&self) -> bool { | ||
self.is_valid | ||
} | ||
|
||
#[inline] | ||
fn data_type(&self) -> &DataType { | ||
&DataType::Boolean | ||
} | ||
|
||
fn to_boxed_array(&self, length: usize) -> Box<dyn Array> { | ||
if self.is_valid { | ||
let values = Bitmap::from_trusted_len_iter(std::iter::repeat(self.value).take(length)); | ||
Box::new(BooleanArray::from_data(values, None)) | ||
} else { | ||
Box::new(BooleanArray::new_null(length)) | ||
} | ||
} | ||
} | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct Utf8Scalar<O: Offset> { | ||
value: Buffer<u8>, | ||
is_valid: bool, | ||
phantom: std::marker::PhantomData<O>, | ||
} | ||
|
||
impl<O: Offset> Utf8Scalar<O> { | ||
#[inline] | ||
pub fn new(v: Option<&str>) -> Self { | ||
let is_valid = v.is_some(); | ||
O::from_usize(v.map(|x| x.len()).unwrap_or_default()).expect("Too large"); | ||
let value = Buffer::from(v.map(|x| x.as_bytes()).unwrap_or(&[])); | ||
Self { | ||
value, | ||
is_valid, | ||
phantom: std::marker::PhantomData, | ||
} | ||
} | ||
|
||
#[inline] | ||
pub fn value(&self) -> &str { | ||
unsafe { std::str::from_utf8_unchecked(self.value.as_slice()) } | ||
} | ||
} | ||
|
||
impl<O: Offset> Scalar for Utf8Scalar<O> { | ||
#[inline] | ||
fn as_any(&self) -> &dyn std::any::Any { | ||
self | ||
} | ||
|
||
#[inline] | ||
fn is_valid(&self) -> bool { | ||
self.is_valid | ||
} | ||
|
||
#[inline] | ||
fn data_type(&self) -> &DataType { | ||
if O::is_large() { | ||
&DataType::LargeUtf8 | ||
} else { | ||
&DataType::Utf8 | ||
} | ||
} | ||
|
||
fn to_boxed_array(&self, length: usize) -> Box<dyn Array> { | ||
if self.is_valid { | ||
let item_length = O::from_usize(self.value.len()).unwrap(); // verified at `new` | ||
let offsets = (0..=length).map(|i| O::from_usize(i).unwrap() * item_length); | ||
let offsets = unsafe { Buffer::from_trusted_len_iter_unchecked(offsets) }; | ||
let values = std::iter::repeat(self.value.as_slice()) | ||
.take(length) | ||
.flatten() | ||
.copied() | ||
.collect(); | ||
Box::new(Utf8Array::<O>::from_data(offsets, values, None)) | ||
} else { | ||
Box::new(Utf8Array::<O>::new_null(length)) | ||
} | ||
} | ||
} | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct BinaryScalar<O: Offset> { | ||
value: Buffer<u8>, | ||
is_valid: bool, | ||
phantom: std::marker::PhantomData<O>, | ||
} | ||
|
||
impl<O: Offset> BinaryScalar<O> { | ||
#[inline] | ||
pub fn new(v: Option<&str>) -> Self { | ||
let is_valid = v.is_some(); | ||
O::from_usize(v.map(|x| x.len()).unwrap_or_default()).expect("Too large"); | ||
let value = Buffer::from(v.map(|x| x.as_bytes()).unwrap_or(&[])); | ||
Self { | ||
value, | ||
is_valid, | ||
phantom: std::marker::PhantomData, | ||
} | ||
} | ||
|
||
#[inline] | ||
pub fn value(&self) -> &[u8] { | ||
self.value.as_slice() | ||
} | ||
} | ||
|
||
impl<O: Offset> Scalar for BinaryScalar<O> { | ||
#[inline] | ||
fn as_any(&self) -> &dyn std::any::Any { | ||
self | ||
} | ||
|
||
#[inline] | ||
fn is_valid(&self) -> bool { | ||
self.is_valid | ||
} | ||
|
||
#[inline] | ||
fn data_type(&self) -> &DataType { | ||
if O::is_large() { | ||
&DataType::LargeBinary | ||
} else { | ||
&DataType::Binary | ||
} | ||
} | ||
|
||
fn to_boxed_array(&self, length: usize) -> Box<dyn Array> { | ||
if self.is_valid { | ||
let item_length = O::from_usize(self.value.len()).unwrap(); // verified at `new` | ||
let offsets = (0..=length).map(|i| O::from_usize(i).unwrap() * item_length); | ||
let offsets = unsafe { Buffer::from_trusted_len_iter_unchecked(offsets) }; | ||
let values = std::iter::repeat(self.value.as_slice()) | ||
.take(length) | ||
.flatten() | ||
.copied() | ||
.collect(); | ||
Box::new(BinaryArray::<O>::from_data(offsets, values, None)) | ||
} else { | ||
Box::new(BinaryArray::<O>::new_null(length)) | ||
} | ||
} | ||
} |