Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Rebase to polars-arrow for BinaryView/StringView support #1607

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
5350a23
perf: don't needlessly allocate validity in concat/rechunk (#13288)
ritchie46 Dec 28, 2023
dcbeabe
feat: implement `BinaryView` and `Utf8View` in `polars-arrow` (#13243)
ritchie46 Jan 5, 2024
df6bc34
feat(rust): `BinaryView`/`Utf8View` IPC support (#13464)
ritchie46 Jan 6, 2024
aabbad6
feat(rust): add `BinaryView` to `parquet` writer/reader. (#13489)
ritchie46 Jan 10, 2024
fec11f0
feat: implement binview comparison kernels (#13715)
ritchie46 Jan 14, 2024
733e1cd
perf: directly embed data ptr in Buffer (#13744)
orlp Jan 15, 2024
0f5b93a
feat: new implementation for `String/Binary` type. (#13748)
ritchie46 Jan 19, 2024
6f688e3
perf: lazy cache binview bytes len (#13830)
ritchie46 Jan 19, 2024
a597eed
feat: fix binview ipc format (#13842)
ritchie46 Jan 19, 2024
568043d
perf: apply string view GC more conservatively (#13850)
orlp Jan 19, 2024
ca6afa8
feat: implement ffi for `binview` (#13871)
ritchie46 Jan 20, 2024
09b247f
feat: support mmap for binview in OOC (#13872)
ritchie46 Jan 20, 2024
d9d5260
feat: fix parquet for binview (#13873)
ritchie46 Jan 20, 2024
9bd13a2
fix: ensure binview doesn't OOB (#13876)
ritchie46 Jan 20, 2024
07eb3fb
make BinaryView/Utf8View compatible with arrow2
urvishdesai Feb 15, 2024
021b062
perf: improve binview filter (#13878)
ritchie46 Jan 21, 2024
02e876a
feat: move Enum/Categorical categories to binview (#13882)
ritchie46 Jan 22, 2024
b5d9fa4
perf: speedup binview filter (#13902)
ritchie46 Jan 22, 2024
7974932
chore(rust): update rustc (#13947)
ritchie46 Jan 24, 2024
7664e8b
feat: gc binview when writing ipc (#14035)
ritchie46 Jan 27, 2024
11078a8
fix: json_encode should respect to logical type (#14063)
reswqa Jan 29, 2024
91c0c60
chore: hoist boolean -> string cast (#14122)
ritchie46 Jan 31, 2024
69e55c3
replace get_unchecked_release with get_unchecked
urvishdesai Feb 15, 2024
3ca785f
fix: don't gc after variadic buffers are written (#14473)
ritchie46 Feb 13, 2024
d404d07
resolve more conflicts with polars-arrow
urvishdesai Feb 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ num-traits = "0.2"
dyn-clone = "1"
bytemuck = { version = "1", features = ["derive"] }
chrono = { version = "0.4.31", default_features = false, features = ["std"] }
atoi_simd = "0.15.5"
itoa = "1.0.6"
ryu = "1.0.13"
fast-float = { version = "0.2" }

# for decimal i256
ethnum = "1"
Expand Down Expand Up @@ -57,7 +61,7 @@ indexmap = { version = "^1.6", optional = true }
# used to print columns in a nice columnar format
comfy-table = { version = "6.0", optional = true, default-features = false }

arrow-format = { version = "0.8", optional = true, features = ["ipc"] }
arrow-format = { package = "polars-arrow-format", version = "0.1.0", optional = true, features = ["ipc"] }

hex = { version = "^0.4", optional = true }

Expand Down Expand Up @@ -184,7 +188,7 @@ io_json_write = ["streaming-iterator", "fallible-streaming-iterator", "lexical-c
io_ipc = ["arrow-format"]
io_ipc_write_async = ["io_ipc", "futures"]
io_ipc_read_async = ["io_ipc", "futures", "async-stream"]
io_ipc_compression = ["lz4", "zstd"]
io_ipc_compression = ["lz4", "zstd", "io_ipc"]
io_flight = ["io_ipc", "arrow-format/flight-data"]

# base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
Expand Down
2 changes: 1 addition & 1 deletion examples/ipc_file_mmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ fn write(
let options = arrow2::io::ipc::write::WriteOptions { compression };
let mut writer = arrow2::io::ipc::write::FileWriter::try_new(
result,
schema.clone(),
schema.clone().into(),
ipc_fields.clone(),
options,
)?;
Expand Down
6 changes: 3 additions & 3 deletions src/array/binary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ unsafe impl<O: Offset> ToFfi for BinaryArray<O> {
fn buffers(&self) -> Vec<Option<*const u8>> {
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
Some(self.offsets.buffer().as_ptr().cast::<u8>()),
Some(self.values.as_ptr().cast::<u8>()),
Some(self.offsets.buffer().storage_ptr().cast::<u8>()),
Some(self.values.storage_ptr().cast::<u8>()),
]
}

Expand Down Expand Up @@ -62,6 +62,6 @@ impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
// assumption that data from FFI is well constructed
let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets) };

Ok(Self::new(data_type, offsets, values, validity))
Self::try_new(data_type, offsets, values, validity)
}
}
101 changes: 101 additions & 0 deletions src/array/binview/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

use super::BinaryViewArrayGeneric;

use crate::error::Result;
use crate::array::binview::{View, ViewType};
use crate::array::{FromFfi, ToFfi};
use crate::bitmap::align;
use crate::ffi;

unsafe impl<T: ViewType + ?Sized> ToFfi for BinaryViewArrayGeneric<T> {
fn buffers(&self) -> Vec<Option<*const u8>> {
let mut buffers = Vec::with_capacity(self.buffers.len() + 2);
buffers.push(self.validity.as_ref().map(|x| x.as_ptr()));
buffers.push(Some(self.views.storage_ptr().cast::<u8>()));
buffers.extend(self.buffers.iter().map(|b| Some(b.storage_ptr())));
buffers
}

fn offset(&self) -> Option<usize> {
let offset = self.views.offset();
if let Some(bitmap) = self.validity.as_ref() {
if bitmap.offset() == offset {
Some(offset)
} else {
None
}
} else {
Some(offset)
}
}

fn to_ffi_aligned(&self) -> Self {
let offset = self.views.offset();

let validity = self.validity.as_ref().map(|bitmap| {
if bitmap.offset() == offset {
bitmap.clone()
} else {
align(bitmap, offset)
}
});

Self {
data_type: self.data_type.clone(),
validity,
views: self.views.clone(),
buffers: self.buffers.clone(),
raw_buffers: self.raw_buffers.clone(),
phantom: Default::default(),
total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)),
total_buffer_len: self.total_buffer_len,
}
}
}

impl<T: ViewType + ?Sized, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryViewArrayGeneric<T> {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type().clone();

let validity = unsafe { array.validity() }?;
let views = unsafe { array.buffer::<View>(1) }?;

// 2 - validity + views
let n_buffers = array.n_buffers();
let mut remaining_buffers = n_buffers - 2;
if remaining_buffers <= 1 {
return Ok(Self::new_unchecked_unknown_md(
data_type,
views,
Arc::from([]),
validity,
None,
));
}

let n_variadic_buffers = remaining_buffers - 1;
let variadic_buffer_offset = n_buffers - 1;

let variadic_buffer_sizes =
array.buffer_known_len::<i64>(variadic_buffer_offset, n_variadic_buffers)?;
remaining_buffers -= 1;

let mut variadic_buffers = Vec::with_capacity(remaining_buffers);

let offset = 2;
for (i, &size) in (offset..remaining_buffers + offset).zip(variadic_buffer_sizes.iter()) {
let values = unsafe { array.buffer_known_len::<u8>(i, size as usize) }?;
variadic_buffers.push(values);
}

Ok(Self::new_unchecked_unknown_md(
data_type,
views,
Arc::from(variadic_buffers),
validity,
None,
))
}
}
36 changes: 36 additions & 0 deletions src/array/binview/fmt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use std::fmt::{Debug, Formatter, Result, Write};

use super::super::fmt::write_vec;
use super::BinaryViewArrayGeneric;
use crate::array::binview::ViewType;
use crate::array::{Array, BinaryViewArray, Utf8ViewArray};

pub fn write_value<'a, T: ViewType + ?Sized, W: Write>(
array: &'a BinaryViewArrayGeneric<T>,
index: usize,
f: &mut W,
) -> Result
where
&'a T: Debug,
{
let bytes = array.value(index).to_bytes();
let writer = |f: &mut W, index| write!(f, "{}", bytes[index]);

write_vec(f, writer, None, bytes.len(), "None", false)
}

impl Debug for BinaryViewArray {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let writer = |f: &mut Formatter, index| write_value(self, index, f);
write!(f, "BinaryViewArray")?;
write_vec(f, writer, self.validity(), self.len(), "None", false)
}
}

impl Debug for Utf8ViewArray {
fn fmt(&self, f: &mut Formatter<'_>) -> Result {
let writer = |f: &mut Formatter, index| write!(f, "{}", self.value(index));
write!(f, "Utf8ViewArray")?;
write_vec(f, writer, self.validity(), self.len(), "None", false)
}
}
47 changes: 47 additions & 0 deletions src/array/binview/iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
use super::BinaryViewArrayGeneric;
use crate::array::binview::ViewType;
use crate::array::{ArrayAccessor, ArrayValuesIter, MutableBinaryViewArray};
use crate::bitmap::utils::{BitmapIter, ZipValidity};

unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for BinaryViewArrayGeneric<T> {
type Item = &'a T;

#[inline]
unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item {
self.value_unchecked(index)
}

#[inline]
fn len(&self) -> usize {
self.views.len()
}
}

/// Iterator of values of an [`BinaryArray`].
pub type BinaryViewValueIter<'a, T> = ArrayValuesIter<'a, BinaryViewArrayGeneric<T>>;

impl<'a, T: ViewType + ?Sized> IntoIterator for &'a BinaryViewArrayGeneric<T> {
type Item = Option<&'a T>;
type IntoIter = ZipValidity<&'a T, BinaryViewValueIter<'a, T>, BitmapIter<'a>>;

fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}

unsafe impl<'a, T: ViewType + ?Sized> ArrayAccessor<'a> for MutableBinaryViewArray<T> {
type Item = &'a T;

#[inline]
unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item {
self.value_unchecked(index)
}

#[inline]
fn len(&self) -> usize {
self.views().len()
}
}

/// Iterator of values of an [`MutableBinaryViewArray`].
pub type MutableBinaryViewValueIter<'a, T> = ArrayValuesIter<'a, MutableBinaryViewArray<T>>;
Loading