Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added Scalar values.
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Jul 21, 2021
1 parent 20072a5 commit b146d20
Show file tree
Hide file tree
Showing 3 changed files with 274 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub mod bitmap;
pub mod buffer;
mod endianess;
pub mod error;
pub mod scalar;
pub mod trusted_len;
pub mod types;

Expand Down
28 changes: 28 additions & 0 deletions src/scalar/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Scalar API

Design choices:

### `Scalar` is a trait object

There are three reasons:

* a scalar should have a small memory footprint, which an enum would not ensure given the different physical types available.
* forward-compatibility: adding a new variant to an `enum` is a backward-incompatible change
* do not expose implementation details to users (reduce the surface of the public API)

### `Scalar` should contain nullability information

This aligns `Scalar` with the general notion of Arrow's `Array`.

This API is a companion to the `Array`, and follows the same design as `Array`.
Specifically, a `Scalar` is a trait object that can be downcast to concrete implementations.

Like `Array`, `Scalar` implements

* `data_type`, which is used to perform the correct downcast
* `is_valid`, to tell whether the scalar is null or not

### There is one implementation per Arrow physical type

* Reduces the number of `match` that users need to write
* Allows casting of logical types without changing the underlying physical type
245 changes: 245 additions & 0 deletions src/scalar/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
use std::any::Any;

use crate::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, types::NativeType};

/// A single, possibly-null value: the scalar companion of `Array`.
///
/// Implementations are meant to be used as trait objects and downcast to their
/// concrete type via `as_any`, with `data_type` indicating which concrete type
/// to downcast to.
pub trait Scalar: std::fmt::Debug {
    /// Returns the `DataType` of this scalar, used to perform the correct downcast.
    fn data_type(&self) -> &DataType;

    /// Returns `true` when the scalar holds a value and `false` when it is null.
    fn is_valid(&self) -> bool;

    /// Returns `self` as `&dyn Any` so that it can be downcast to a concrete scalar.
    fn as_any(&self) -> &dyn Any;

    /// Builds a boxed `Array` with `length` slots out of this scalar.
    ///
    /// The implementations in this module repeat the value `length` times when
    /// the scalar is valid and return an all-null array otherwise.
    fn to_boxed_array(&self, length: usize) -> Box<dyn Array>;
}

/// Scalar of a native (primitive) physical type `T`, tagged with its logical `DataType`.
#[derive(Debug, Clone)]
pub struct PrimitiveScalar<T: NativeType> {
    // Stored as a plain `T` plus a validity flag rather than `Option<T>`,
    // because this offers a stabler pointer offset on the struct.
    value: T,
    is_valid: bool,
    data_type: DataType,
}

impl<T: NativeType> PrimitiveScalar<T> {
    /// Creates a new scalar of logical type `data_type`.
    ///
    /// `None` produces a null scalar whose stored value is `T::default()`.
    #[inline]
    pub fn new(data_type: DataType, v: Option<T>) -> Self {
        match v {
            Some(value) => Self {
                value,
                is_valid: true,
                data_type,
            },
            None => Self {
                value: T::default(),
                is_valid: false,
                data_type,
            },
        }
    }

    /// Returns the stored value.
    ///
    /// For a null scalar this is the default of `T`; check `is_valid` first.
    #[inline]
    pub fn value(&self) -> T {
        self.value
    }
}

impl<T: NativeType> Scalar for PrimitiveScalar<T> {
    /// Returns `self` as `&dyn Any` for downcasting.
    #[inline]
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Returns whether this scalar is non-null.
    #[inline]
    fn is_valid(&self) -> bool {
        self.is_valid
    }

    /// Returns the logical type given at construction.
    #[inline]
    fn data_type(&self) -> &DataType {
        &self.data_type
    }

    /// Builds a `PrimitiveArray` of `length` slots with this scalar's data type.
    ///
    /// A valid scalar yields `length` copies of its value and no validity
    /// bitmap; a null scalar yields an all-null array.
    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
        if !self.is_valid {
            return Box::new(PrimitiveArray::<T>::new_null(
                self.data_type.clone(),
                length,
            ));
        }

        let repeated = std::iter::repeat(self.value).take(length);
        let values = Buffer::from_trusted_len_iter(repeated);
        Box::new(PrimitiveArray::from_data(
            self.data_type.clone(),
            values,
            None,
        ))
    }
}

/// Scalar holding a single, possibly-null boolean.
#[derive(Debug, Clone)]
pub struct BooleanScalar {
    // Placeholder `false` when the scalar is null; `is_valid` carries nullability.
    value: bool,
    is_valid: bool,
}

impl BooleanScalar {
    /// Creates a new boolean scalar; `None` produces a null scalar.
    #[inline]
    pub fn new(v: Option<bool>) -> Self {
        match v {
            Some(value) => Self {
                value,
                is_valid: true,
            },
            None => Self {
                value: false,
                is_valid: false,
            },
        }
    }

    /// Returns the stored boolean.
    ///
    /// For a null scalar this is `false`; check `is_valid` first.
    #[inline]
    pub fn value(&self) -> bool {
        self.value
    }
}

impl Scalar for BooleanScalar {
    /// Returns `self` as `&dyn Any` for downcasting.
    #[inline]
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Returns whether this scalar is non-null.
    #[inline]
    fn is_valid(&self) -> bool {
        self.is_valid
    }

    /// Always `DataType::Boolean`.
    #[inline]
    fn data_type(&self) -> &DataType {
        &DataType::Boolean
    }

    /// Builds a `BooleanArray` of `length` slots.
    ///
    /// A valid scalar yields `length` copies of its value and no validity
    /// bitmap; a null scalar yields an all-null array.
    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
        if !self.is_valid {
            return Box::new(BooleanArray::new_null(length));
        }

        let repeated = std::iter::repeat(self.value).take(length);
        let values = Bitmap::from_trusted_len_iter(repeated);
        Box::new(BooleanArray::from_data(values, None))
    }
}

/// Scalar holding a single, possibly-null UTF-8 string.
///
/// `O` is the offset type of the array this scalar expands to.
#[derive(Debug, Clone)]
pub struct Utf8Scalar<O: Offset> {
    // Bytes of the string; empty when the scalar is null.
    value: Buffer<u8>,
    is_valid: bool,
    phantom: std::marker::PhantomData<O>,
}

impl<O: Offset> Utf8Scalar<O> {
    /// Creates a new UTF-8 scalar; `None` produces a null scalar with no bytes.
    ///
    /// # Panics
    ///
    /// Panics with "Too large" when the byte length of the string cannot be
    /// represented by the offset type `O`.
    #[inline]
    pub fn new(v: Option<&str>) -> Self {
        let bytes: &[u8] = match v {
            Some(text) => text.as_bytes(),
            None => &[],
        };
        // Only the check matters here; the converted offset itself is discarded.
        O::from_usize(bytes.len()).expect("Too large");
        Self {
            value: Buffer::from(bytes),
            is_valid: v.is_some(),
            phantom: std::marker::PhantomData,
        }
    }

    /// Returns the stored string, which is empty for a null scalar.
    #[inline]
    pub fn value(&self) -> &str {
        let bytes = self.value.as_slice();
        // SAFETY: in this module `self.value` is only ever populated by `new`,
        // from the bytes of a `&str` (or left empty), so it is valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}

impl<O: Offset> Scalar for Utf8Scalar<O> {
    /// Returns `self` as `&dyn Any` for downcasting.
    #[inline]
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Returns whether this scalar is non-null.
    #[inline]
    fn is_valid(&self) -> bool {
        self.is_valid
    }

    /// `DataType::LargeUtf8` when `O::is_large()`, `DataType::Utf8` otherwise.
    #[inline]
    fn data_type(&self) -> &DataType {
        if O::is_large() {
            &DataType::LargeUtf8
        } else {
            &DataType::Utf8
        }
    }

    /// Builds a `Utf8Array<O>` of `length` slots.
    ///
    /// A valid scalar yields `length` copies of its string and no validity
    /// bitmap; a null scalar yields an all-null array.
    ///
    /// # Panics
    ///
    /// Panics when the total number of bytes (`length` times the string's
    /// byte length) cannot be represented by the offset type `O`.
    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
        if !self.is_valid {
            return Box::new(Utf8Array::<O>::new_null(length));
        }

        let item_length = self.value.len();
        // `new` only verified that a single item fits in `O`. The last offset
        // is `length * item_length`, so verify that product once, up front,
        // instead of multiplying in `O` where it could silently overflow.
        length
            .checked_mul(item_length)
            .and_then(|total| O::from_usize(total))
            .expect("The total length of the repeated values does not fit in the offset type");

        // Every `i * item_length` is bounded by the total verified above, so
        // neither the multiplication nor the conversion can fail.
        let offsets = (0..=length).map(|i| O::from_usize(i * item_length).unwrap());
        // SAFETY: presumably this requires the iterator to report its exact
        // length; a `map` over `0..=length` reports exactly `length + 1` items.
        let offsets = unsafe { Buffer::from_trusted_len_iter_unchecked(offsets) };
        let values = std::iter::repeat(self.value.as_slice())
            .take(length)
            .flatten()
            .copied()
            .collect();
        Box::new(Utf8Array::<O>::from_data(offsets, values, None))
    }
}

/// Scalar holding a single, possibly-null binary value.
///
/// `O` is the offset type of the array this scalar expands to.
#[derive(Debug, Clone)]
pub struct BinaryScalar<O: Offset> {
    // Raw bytes of the value; empty when the scalar is null.
    value: Buffer<u8>,
    is_valid: bool,
    phantom: std::marker::PhantomData<O>,
}

impl<O: Offset> BinaryScalar<O> {
    /// Creates a new binary scalar from the bytes of `v`; `None` produces a
    /// null scalar with no bytes.
    ///
    /// NOTE(review): the parameter is `Option<&str>`, so only UTF-8 payloads
    /// can be passed in; a byte-slice input looks like the intended API.
    /// TODO confirm before changing, as it would affect callers.
    ///
    /// # Panics
    ///
    /// Panics with "Too large" when the byte length of `v` cannot be
    /// represented by the offset type `O`.
    #[inline]
    pub fn new(v: Option<&str>) -> Self {
        let bytes: &[u8] = match v {
            Some(text) => text.as_bytes(),
            None => &[],
        };
        // Only the check matters here; the converted offset itself is discarded.
        O::from_usize(bytes.len()).expect("Too large");
        Self {
            value: Buffer::from(bytes),
            is_valid: v.is_some(),
            phantom: std::marker::PhantomData,
        }
    }

    /// Returns the stored bytes, which are empty for a null scalar.
    #[inline]
    pub fn value(&self) -> &[u8] {
        self.value.as_slice()
    }
}

impl<O: Offset> Scalar for BinaryScalar<O> {
    /// Returns `self` as `&dyn Any` for downcasting.
    #[inline]
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }

    /// Returns whether this scalar is non-null.
    #[inline]
    fn is_valid(&self) -> bool {
        self.is_valid
    }

    /// `DataType::LargeBinary` when `O::is_large()`, `DataType::Binary` otherwise.
    #[inline]
    fn data_type(&self) -> &DataType {
        if O::is_large() {
            &DataType::LargeBinary
        } else {
            &DataType::Binary
        }
    }

    /// Builds a `BinaryArray<O>` of `length` slots.
    ///
    /// A valid scalar yields `length` copies of its bytes and no validity
    /// bitmap; a null scalar yields an all-null array.
    ///
    /// # Panics
    ///
    /// Panics when the total number of bytes (`length` times the value's
    /// byte length) cannot be represented by the offset type `O`.
    fn to_boxed_array(&self, length: usize) -> Box<dyn Array> {
        if !self.is_valid {
            return Box::new(BinaryArray::<O>::new_null(length));
        }

        let item_length = self.value.len();
        // `new` only verified that a single item fits in `O`. The last offset
        // is `length * item_length`, so verify that product once, up front,
        // instead of multiplying in `O` where it could silently overflow.
        length
            .checked_mul(item_length)
            .and_then(|total| O::from_usize(total))
            .expect("The total length of the repeated values does not fit in the offset type");

        // Every `i * item_length` is bounded by the total verified above, so
        // neither the multiplication nor the conversion can fail.
        let offsets = (0..=length).map(|i| O::from_usize(i * item_length).unwrap());
        // SAFETY: presumably this requires the iterator to report its exact
        // length; a `map` over `0..=length` reports exactly `length + 1` items.
        let offsets = unsafe { Buffer::from_trusted_len_iter_unchecked(offsets) };
        let values = std::iter::repeat(self.value.as_slice())
            .take(length)
            .flatten()
            .copied()
            .collect();
        Box::new(BinaryArray::<O>::from_data(offsets, values, None))
    }
}

0 comments on commit b146d20

Please sign in to comment.