diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index bc5391ee..668cc794 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -31,7 +31,7 @@ jobs: ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}-bench- ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}- ${{ runner.os }}-cargo- - - run: cargo bench --bench narrow -- --output-format=bencher | tee output.txt + - run: cargo bench --bench narrow --all-features -- --output-format=bencher | tee output.txt - uses: actions/upload-artifact@v3 with: name: benchmark-results diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c6148bfb..59a97649 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,13 +7,44 @@ permissions: contents: read jobs: + minimal: + name: Minimal + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + id: rust-toolchain + - uses: dtolnay/install@master + with: + crate: cargo-expand + - uses: actions/cache@v3 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}-minimal-${{ hashFiles('**/Cargo.toml') }} + restore-keys: | + ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}-minimal- + ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}- + ${{ runner.os }}-cargo- + - run: cargo check --workspace --all-targets --no-default-features + - run: cargo test --workspace --all-targets --no-default-features + - run: cargo test --workspace --doc --no-default-features + - run: cargo clippy --workspace --all-targets --no-default-features -- -Dwarnings + msrv: name: Minimum supported Rust version runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: dtolnay/rust-toolchain@1.67.0 + - uses: dtolnay/rust-toolchain@1.70.0 id: rust-toolchain + - uses: dtolnay/install@master + with: + crate: cargo-expand - uses: actions/cache@v3 with: path: | @@ -27,8 +58,9 @@ jobs: ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}-msrv- ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}- ${{ runner.os }}-cargo- - - run: cargo check --all --all-features - + - run: cargo check --workspace --all-targets --all-features + - run: cargo test --all --all-targets --all-features + - run: cargo test --all --doc --all-features check: name: Check runs-on: ubuntu-latest @@ -75,8 +107,8 @@ jobs: - uses: dtolnay/install@master with: crate: cargo-expand - - run: cargo test --all --all-targets --all-features - - run: cargo test --all --doc --all-features + - run: cargo test --workspace --all-targets --all-features + - run: cargo test --workspace --doc --all-features rustfmt: name: Rustfmt @@ -110,7 +142,7 @@ jobs: ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}-clippy- ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}- ${{ runner.os }}-cargo- - - run: cargo clippy --all --all-targets --all-features -- -Dwarnings + - run: cargo clippy --workspace --all-targets --all-features -- -Dwarnings miri: name: Miri @@ -133,6 +165,7 @@ jobs: ${{ runner.os }}-cargo-${{ steps.rust-toolchain.outputs.cachekey }}- ${{ runner.os }}-cargo- - run: cargo miri setup + - run: cargo miri test --no-default-features - run: cargo miri test --all-features coverage: @@ -163,8 +196,8 @@ jobs: - uses: dtolnay/install@master with: crate: cargo-expand - - run: cargo build --all --all-targets --all-features - - run: cargo test --all --all-targets --all-features + - run: cargo build --workspace --all-targets --all-features + - run: cargo test --workspace --all-targets --all-features env: LLVM_PROFILE_FILE: "narrow-%p-%m.profraw" - name: Install grcov diff --git a/Cargo.toml b/Cargo.toml index 234ed304..45ab1b77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace.package] authors = ["Matthijs Brobbel "] edition = "2021" -rust-version = "1.67.0" +rust-version = "1.70.0" description = "An implementation of Apache Arrow" readme = "README.md" repository = "https://github.com/mbrobbel/narrow" @@ -29,14 +29,21 @@ categories.workspace = true [features] default = [] +arrow-rs = ["dep:arrow-array", "dep:arrow-buffer", "dep:arrow-schema", "narrow-derive?/arrow-rs"] derive = ["dep:narrow-derive"] [dependencies] +arrow-array = { git = "https://github.com/apache/arrow-rs", rev = "7fd2d42", optional = true } +arrow-buffer = { git = "https://github.com/apache/arrow-rs", rev = "7fd2d42", optional = true } +arrow-schema = { git = "https://github.com/apache/arrow-rs", rev = "7fd2d42", optional = true } narrow-derive = { path = "narrow-derive", version = "^0.3.4", optional = true } [dev-dependencies] +arrow-cast = { git = "https://github.com/apache/arrow-rs", rev = "7fd2d42", default-features = false, features = ["prettyprint"] } +bytes = "1.5.0" criterion = { version = "0.5.1", default-features = false } rand = { version = "0.8.5", default-features = false, features = ["small_rng"] } +parquet = { git = "https://github.com/apache/arrow-rs", rev = "7fd2d42", features = ["arrow"] } [profile.bench] lto = true @@ -45,3 +52,7 @@ codegen-units = 1 [[bench]] name = "narrow" harness = false + +[[example]] +name = "parquet" +required-features = ["arrow-rs", "derive"] diff --git a/README.md b/README.md index 04086bd4..d8c79f21 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ This crate provides types to support reading and writing instances of abstract d ## Minimum supported Rust version -The minimum supported Rust version for this crate is Rust 1.67.0. +The minimum supported Rust version for this crate is Rust 1.70.0. ## License diff --git a/examples/parquet.rs b/examples/parquet.rs new file mode 100644 index 00000000..4b093026 --- /dev/null +++ b/examples/parquet.rs @@ -0,0 +1,55 @@ +fn main() { + use arrow_array::RecordBatch; + use arrow_cast::pretty; + use bytes::Bytes; + use narrow::{array::StructArray, arrow::buffer_builder::ArrowBufferBuilder, ArrayType}; + use parquet::arrow::{arrow_reader::ParquetRecordBatchReader, ArrowWriter}; + + #[derive(ArrayType, Default)] + struct Bar(Option); + + #[derive(ArrayType, Default)] + struct Foo { + a: u32, + b: Option, + c: bool, + d: String, + e: Option>>, + f: Bar, + } + let input = [ + Foo { + a: 1, + b: Some(2), + c: true, + d: "hello world!".to_string(), + e: Some(vec![Some(true), None]), + f: Bar(Some(true)), + }, + Foo { + a: 42, + b: None, + c: false, + d: "narrow".to_string(), + e: None, + f: Bar(None), + }, + ]; + + let narrow_array = input + .into_iter() + .collect::>(); + + let record_batch = RecordBatch::from(narrow_array); + pretty::print_batches(&[record_batch.clone()]).unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, record_batch.schema(), None).unwrap(); + writer.write(&record_batch).unwrap(); + writer.close().unwrap(); + + let mut reader = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 1024).unwrap(); + let read = reader.next().unwrap().unwrap(); + pretty::print_batches(&[read.clone()]).unwrap(); + assert_eq!(record_batch, read); +} diff --git a/narrow-derive/Cargo.toml b/narrow-derive/Cargo.toml index 78c2f749..253c9b64 100644 --- a/narrow-derive/Cargo.toml +++ b/narrow-derive/Cargo.toml @@ -12,6 +12,10 @@ license.workspace = true keywords.workspace = true categories.workspace = true +[features] +default = [] +arrow-rs = [] + [lib] proc-macro = true diff --git a/narrow-derive/src/lib.rs b/narrow-derive/src/lib.rs index 94679edc..fbed0840 100644 --- a/narrow-derive/src/lib.rs +++ b/narrow-derive/src/lib.rs @@ -12,7 +12,8 @@ const CRATE: &str = "narrow"; static NARROW: Lazy = Lazy::new(|| match proc_macro_crate::crate_name(CRATE) { Ok(found) => match found { - FoundCrate::Itself => "crate".to_string(), + // Requires `extern crate self as narrow` + FoundCrate::Itself => CRATE.to_string(), FoundCrate::Name(name) => name, }, _ => CRATE.to_string(), diff --git a/narrow-derive/src/struct.rs b/narrow-derive/src/struct.rs index c611a5d3..96ea38cc 100644 --- a/narrow-derive/src/struct.rs +++ b/narrow-derive/src/struct.rs @@ -4,8 +4,7 @@ use quote::{format_ident, quote, ToTokens, TokenStreamExt}; use std::iter::{Enumerate, Map}; use syn::{ parse2, parse_quote, punctuated, token::Paren, visit_mut::VisitMut, DeriveInput, Field, Fields, - File, Generics, Ident, Index, ItemImpl, ItemStruct, Type, TypeParamBound, Visibility, - WherePredicate, + Generics, Ident, Index, ItemImpl, ItemStruct, Type, TypeParamBound, Visibility, WherePredicate, }; pub(super) fn derive(input: &DeriveInput, fields: &Fields) -> TokenStream { @@ -23,6 +22,12 @@ pub(super) fn derive(input: &DeriveInput, fields: &Fields) -> TokenStream { // Generate the StructArrayType impl. let struct_array_type_impl = input.struct_array_type_impl(); + // Optionally generate the StructArrayTypeFields impl. + let struct_array_type_fields_impl = input.struct_array_type_fields_impl(); + + // Optionally generates the conversion to vec of array refs + let struct_array_into_array_refs = input.struct_array_into_array_refs(); + // Generate the array wrapper struct definition. let array_struct_def = input.array_struct_def(); @@ -45,6 +50,10 @@ pub(super) fn derive(input: &DeriveInput, fields: &Fields) -> TokenStream { #struct_array_type_impl + #struct_array_type_fields_impl + + #struct_array_into_array_refs + #array_struct_def #array_default_impl @@ -164,7 +173,7 @@ impl Struct<'_> { } /// Add an `StructArrayType` implementation for the derive input. - fn struct_array_type_impl(&self) -> File { + fn struct_array_type_impl(&self) -> ItemImpl { let narrow = util::narrow(); // Generics @@ -188,6 +197,113 @@ impl Struct<'_> { parse2(tokens).expect("struct_array_type_impl") } + /// Add an `StructArrayTypeFields` implementation for the derive input. + fn struct_array_type_fields_impl(&self) -> ItemImpl { + let narrow = util::narrow(); + + // Generics + let mut generics = self.generics.clone(); + SelfReplace::new(self.ident, &generics).visit_generics_mut(&mut generics); + AddTypeParamBound(Self::array_type_bound()).visit_generics_mut(&mut generics); + AddTypeParam(parse_quote!(Buffer: #narrow::buffer::BufferType)) + .visit_generics_mut(&mut generics); + generics + .make_where_clause() + .predicates + .extend(self.where_predicate_fields(parse_quote!(#narrow::arrow::ArrowArray))); + let (impl_generics, ty_generics, where_clause) = generics.split_for_impl(); + + // Fields + let field_ident = self.field_idents().map(|ident| ident.to_string()); + let field_ty = self.field_types(); + let fields = quote!( + #( + ::std::sync::Arc::new(<<#field_ty as ::narrow::array::ArrayType>::Array as #narrow::arrow::ArrowArray>::as_field(#field_ident)), + )* + ); + + let ident = self.array_struct_ident(); + let tokens = quote! { + #[cfg(feature = "arrow-rs")] + impl #impl_generics #narrow::arrow::StructArrayTypeFields for #ident #ty_generics #where_clause { + fn fields() -> ::arrow_schema::Fields { + ::arrow_schema::Fields::from([ + #fields + ]) + } + } + }; + parse2(tokens).expect("struct_array_type_fields_impl") + } + + /// Add an `Into` implementation for the array to convert to a vec of array refs + fn struct_array_into_array_refs(&self) -> ItemImpl { + let narrow = util::narrow(); + + // Generics + let mut generics = self.generics.clone(); + SelfReplace::new(self.ident, &generics).visit_generics_mut(&mut generics); + AddTypeParamBound(Self::array_type_bound()).visit_generics_mut(&mut generics); + AddTypeParam(parse_quote!(Buffer: #narrow::buffer::BufferType)) + .visit_generics_mut(&mut generics); + generics + .make_where_clause() + .predicates + .extend(self.where_predicate_fields_arrow_array_into()); + let (impl_generics, ty_generics, where_clause) = generics.split_for_impl(); + + // Fields + let field_ty = self.field_types(); + let field_arrays = match self.fields { + Fields::Named(_) => { + let field_ident = self.field_idents(); + quote!( + #( + ::std::sync::Arc::< + <<#field_ty as #narrow::array::ArrayType>::Array as #narrow::arrow::ArrowArray>::Array + >::new(value.#field_ident.into()), + )* + ) + } + Fields::Unnamed(_) => { + let field_idx = self + .fields + .iter() + .enumerate() + .map(|(idx, _)| Index::from(idx)); + quote!( + #( + ::std::sync::Arc::< + <<#field_ty as #narrow::array::ArrayType>::Array as #narrow::arrow::ArrowArray>::Array + >::new(value.#field_idx.into()), + )* + ) + } + Fields::Unit => { + quote!( + #( + ::std::sync::Arc::< + <<#field_ty as #narrow::array::ArrayType>::Array as #narrow::arrow::ArrowArray>::Array + >::new(value.0.into()) + )* + ) + } + }; + + let ident = self.array_struct_ident(); + let tokens = quote! { + #[cfg(feature = "arrow-rs")] + impl #impl_generics ::std::convert::From<#ident #ty_generics> for ::std::vec::Vec<::std::sync::Arc> #where_clause { + fn from(value: #ident #ty_generics) -> Self { + vec![ + #field_arrays + ] + } + } + }; + parse2(tokens).expect("struct_array_into_array_refs") + } + /// Returns the struct definition of the Array wrapper struct. fn array_struct_def(&self) -> ItemStruct { let narrow = util::narrow(); @@ -467,6 +583,18 @@ impl Struct<'_> { self.field_types() .map(move |ty| parse_quote!(<#ty as #narrow::array::ArrayType>::Array: #bound)) } + + fn where_predicate_fields_arrow_array_into(&self) -> impl Iterator + '_ { + let narrow = util::narrow(); + self.field_types() + .map(move |ty| parse_quote!( + <#ty as #narrow::array::ArrayType>::Array: + ::std::convert::Into< + <<#ty as #narrow::array::ArrayType>::Array + as #narrow::arrow::ArrowArray>::Array + > + )) + } } #[cfg(test)] diff --git a/src/array/boolean.rs b/src/array/boolean.rs index 3db0bb88..9ec0eadc 100644 --- a/src/array/boolean.rs +++ b/src/array/boolean.rs @@ -13,7 +13,7 @@ use crate::{ /// /// Values are stored using single bits in a [Bitmap]. pub struct BooleanArray( - as Validity>::Storage, + pub(crate) as Validity>::Storage, ) where Bitmap: Validity; diff --git a/src/array/fixed_size_primitive.rs b/src/array/fixed_size_primitive.rs index ff560242..a2d81eb9 100644 --- a/src/array/fixed_size_primitive.rs +++ b/src/array/fixed_size_primitive.rs @@ -3,7 +3,7 @@ use super::Array; use crate::{ bitmap::{Bitmap, BitmapRef, BitmapRefMut, ValidityBitmap}, - buffer::{BufferType, VecBuffer}, + buffer::{Buffer, BufferType, VecBuffer}, nullable::Nullable, validity::Validity, FixedSize, Index, Length, @@ -34,10 +34,14 @@ type_def!(Int8Array, i8); type_def!(Int16Array, i16); type_def!(Int32Array, i32); type_def!(Int64Array, i64); +#[cfg(not(feature = "arrow-rs"))] +type_def!(Int128Array, i128); type_def!(Uint8Array, u8); type_def!(Uint16Array, u16); type_def!(Uint32Array, u32); type_def!(Uint64Array, u64); +#[cfg(not(feature = "arrow-rs"))] +type_def!(Uint128Array, u128); type_def!(IsizeArray, isize); type_def!(UsizeArray, usize); @@ -52,6 +56,13 @@ where { } +// todo(mbrobbel): buffer_ref traits? +impl AsRef<[T]> for FixedSizePrimitiveArray { + fn as_ref(&self) -> &[T] { + self.0.as_slice() + } +} + impl Default for FixedSizePrimitiveArray where @@ -195,12 +206,15 @@ mod tests { assert_eq!(array.0.as_slice(), &[1, 2, 3, 4]); assert_eq!(array.0.as_slice(), array.0.as_bytes()); - let input_array = [[1_u8, 2], [3, 4]]; - let array_array = input_array - .into_iter() - .collect::>(); - assert_eq!(array_array.0.as_slice(), &[[1, 2], [3, 4]]); - assert_eq!(<_ as Buffer>::as_bytes(&array_array.0), &[1, 2, 3, 4]); + #[cfg(not(feature = "arrow-rs"))] + { + let input_array = [[1_u8, 2], [3, 4]]; + let array_array = input_array + .into_iter() + .collect::>(); + assert_eq!(array_array.0.as_slice(), &[[1, 2], [3, 4]]); + assert_eq!(<_ as Buffer>::as_bytes(&array_array.0), &[1, 2, 3, 4]); + }; } #[test] @@ -221,11 +235,14 @@ mod tests { let array = input.into_iter().collect::>(); assert_eq!(array.into_iter().collect::>(), input); - let input_array = [[1_u8, 2], [3, 4]]; - let array_array = input_array - .into_iter() - .collect::>(); - assert_eq!(array_array.into_iter().collect::>(), input_array); + #[cfg(not(feature = "arrow-rs"))] + { + let input_array = [[1_u8, 2], [3, 4]]; + let array_array = input_array + .into_iter() + .collect::>(); + assert_eq!(array_array.into_iter().collect::>(), input_array); + }; } #[test] @@ -241,11 +258,14 @@ mod tests { let array = input.into_iter().collect::>(); assert_eq!(array.len(), input.as_slice().len()); - let input_array = [[1_u8, 2], [3, 4]]; - let array_array = input_array - .into_iter() - .collect::>(); - assert_eq!(array_array.len(), input_array.as_slice().len()); + #[cfg(not(feature = "arrow-rs"))] + { + let input_array = [[1_u8, 2], [3, 4]]; + let array_array = input_array + .into_iter() + .collect::>(); + assert_eq!(array_array.len(), input_array.as_slice().len()); + }; let input_nullable = [Some(1_u64), None, Some(3), Some(4)]; let array_nullable = input_nullable diff --git a/src/array/mod.rs b/src/array/mod.rs index 77eb4ffb..5933813a 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -94,14 +94,20 @@ impl_array_type!(u64, FixedSizePrimitiveArray); impl_array_type!(Option, FixedSizePrimitiveArray); impl_array_type!(i64, FixedSizePrimitiveArray); impl_array_type!(Option, FixedSizePrimitiveArray); +#[cfg(not(feature = "arrow-rs"))] impl_array_type!(u128, FixedSizePrimitiveArray); +#[cfg(not(feature = "arrow-rs"))] impl_array_type!(Option, FixedSizePrimitiveArray); impl_array_type!(i128, FixedSizePrimitiveArray); impl_array_type!(Option, FixedSizePrimitiveArray); +#[cfg(not(feature = "arrow-rs"))] impl_array_type!(usize, FixedSizePrimitiveArray); +#[cfg(not(feature = "arrow-rs"))] impl_array_type!(Option, FixedSizePrimitiveArray); +#[cfg(not(feature = "arrow-rs"))] impl_array_type!(isize, FixedSizePrimitiveArray); +#[cfg(not(feature = "arrow-rs"))] impl_array_type!(Option, FixedSizePrimitiveArray); impl_array_type!(f32, FixedSizePrimitiveArray); diff --git a/src/array/string.rs b/src/array/string.rs index 7a30ec26..65ed4207 100644 --- a/src/array/string.rs +++ b/src/array/string.rs @@ -46,24 +46,29 @@ where } } -impl<'a, OffsetItem: OffsetElement, Buffer: BufferType> Extend<&'a str> +impl<'a, T: ?Sized, OffsetItem: OffsetElement, Buffer: BufferType> Extend<&'a T> for StringArray where + T: AsRef, VariableSizeBinaryArray: Extend<&'a [u8]>, { - fn extend>(&mut self, iter: I) { - self.0.extend(iter.into_iter().map(str::as_bytes)); + fn extend>(&mut self, iter: I) { + self.0 + .extend(iter.into_iter().map(|item| item.as_ref().as_bytes())); } } -impl<'a, OffsetItem: OffsetElement, Buffer: BufferType> Extend> +impl<'a, T: ?Sized, OffsetItem: OffsetElement, Buffer: BufferType> Extend> for StringArray where + T: AsRef, VariableSizeBinaryArray: Extend>, { - fn extend>>(&mut self, iter: I) { - self.0 - .extend(iter.into_iter().map(|opt| opt.map(str::as_bytes))); + fn extend>>(&mut self, iter: I) { + self.0.extend( + iter.into_iter() + .map(|opt| opt.map(|item| item.as_ref().as_bytes())), + ); } } @@ -99,23 +104,33 @@ where } } -impl<'a, OffsetItem: OffsetElement, Buffer: BufferType> FromIterator<&'a str> +impl<'a, T: ?Sized, OffsetItem: OffsetElement, Buffer: BufferType> FromIterator<&'a T> for StringArray where + T: AsRef, VariableSizeBinaryArray: FromIterator<&'a [u8]>, { - fn from_iter>(iter: I) -> Self { - Self(iter.into_iter().map(str::as_bytes).collect()) + fn from_iter>(iter: I) -> Self { + Self( + iter.into_iter() + .map(|item| item.as_ref().as_bytes()) + .collect(), + ) } } -impl<'a, OffsetItem: OffsetElement, Buffer: BufferType> FromIterator> +impl<'a, T: ?Sized, OffsetItem: OffsetElement, Buffer: BufferType> FromIterator> for StringArray where + T: AsRef, VariableSizeBinaryArray: FromIterator>, { - fn from_iter>>(iter: I) -> Self { - Self(iter.into_iter().map(|x| x.map(str::as_bytes)).collect()) + fn from_iter>>(iter: I) -> Self { + Self( + iter.into_iter() + .map(|x| x.map(|item| item.as_ref().as_bytes())) + .collect(), + ) } } diff --git a/src/array/struct.rs b/src/array/struct.rs index 76ffe229..be41ebbf 100644 --- a/src/array/struct.rs +++ b/src/array/struct.rs @@ -14,7 +14,7 @@ pub trait StructArrayType: ArrayType { /// The array type that stores items of this struct. Note this differs from /// the [`ArrayType`] array because that wraps this array. Also note that this /// has no [`Array`] bound. - type Array; + type Array; // into this then requires all arraytype impls to provide a field } /// Array for product types. @@ -119,7 +119,7 @@ mod tests { a: u32, b: Option<()>, c: (), - d: Option<[u128; 2]>, + d: Option<[u64; 2]>, e: bool, f: &'a [u8], g: String, @@ -138,7 +138,7 @@ mod tests { a: ::Array, b: as ArrayType>::Array, c: <() as ArrayType>::Array, - d: as ArrayType>::Array, + d: as ArrayType>::Array, e: ::Array, f: <&'a [u8] as ArrayType>::Array, g: ::Array, @@ -149,7 +149,7 @@ mod tests { ::Array: Default, as ArrayType>::Array: Default, <() as ArrayType>::Array: Default, - as ArrayType>::Array: Default, + as ArrayType>::Array: Default, ::Array: Default, <&'a [u8] as ArrayType>::Array: Default, ::Array: Default, @@ -159,7 +159,7 @@ mod tests { a: ::Array::::default(), b: as ArrayType>::Array::::default(), c: <() as ArrayType>::Array::::default(), - d: as ArrayType>::Array::::default( + d: as ArrayType>::Array::::default( ), e: ::Array::::default(), f: <&'a [u8] as ArrayType>::Array::::default(), @@ -173,8 +173,8 @@ mod tests { ::Array: Extend, as ArrayType>::Array: Extend>, <() as ArrayType>::Array: Extend<()>, - as ArrayType>::Array: - Extend>, + as ArrayType>::Array: + Extend>, ::Array: Extend, <&'a [u8] as ArrayType>::Array: Extend<&'a [u8]>, ::Array: Extend, @@ -208,8 +208,8 @@ mod tests { as ArrayType>::Array: Default + Extend>, <() as ArrayType>::Array: Default + Extend<()>, - as ArrayType>::Array: - Default + Extend>, + as ArrayType>::Array: + Default + Extend>, ::Array: Default + Extend, <&'a [u8] as ArrayType>::Array: Default + Extend<&'a [u8]>, ::Array: Default + Extend, diff --git a/src/arrow/array/boolean.rs b/src/arrow/array/boolean.rs new file mode 100644 index 00000000..354b06ea --- /dev/null +++ b/src/arrow/array/boolean.rs @@ -0,0 +1,166 @@ +//! Interop with `arrow-rs` boolean array. + +use std::sync::Arc; + +use crate::{ + array::BooleanArray, arrow::ArrowArray, bitmap::Bitmap, buffer::BufferType, nullable::Nullable, + validity::Validity, +}; +use arrow_buffer::{BooleanBuffer, NullBuffer}; +use arrow_schema::{DataType, Field}; + +impl ArrowArray for BooleanArray +where + Bitmap: Validity, +{ + type Array = arrow_array::BooleanArray; + + fn as_field(name: &str) -> arrow_schema::Field { + Field::new(name, DataType::Boolean, NULLABLE) + } +} + +impl From> + for BooleanArray +where + Bitmap: Validity, + Self: From, +{ + fn from(value: Arc) -> Self { + Self::from(arrow_array::BooleanArray::from(value.to_data())) + } +} + +impl From> for arrow_array::BooleanArray +where + Bitmap: Into, +{ + fn from(value: BooleanArray) -> Self { + arrow_array::BooleanArray::new(value.0.into(), None) + } +} + +impl From> for arrow_array::BooleanArray +where + Bitmap: Into + Into, +{ + fn from(value: BooleanArray) -> Self { + arrow_array::BooleanArray::new(value.0.data.into(), Some(value.0.validity.into())) + } +} + +/// Panics when there are nulls +impl From for BooleanArray +where + Bitmap: From, +{ + fn from(value: arrow_array::BooleanArray) -> Self { + let (boolean_buffer, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(_) => panic!("expected array without a null buffer"), + None => BooleanArray(boolean_buffer.into()), + } + } +} + +/// Panics when there are no nulls +// OR allocate one instead and use `TryFrom` conversion? +impl From for BooleanArray +where + Bitmap: From + From, +{ + fn from(value: arrow_array::BooleanArray) -> Self { + let (boolean_buffer, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(null_buffer) => BooleanArray(Nullable { + data: boolean_buffer.into(), + validity: null_buffer.into(), + }), + None => panic!("expected array with a null buffer"), + } + } +} + +#[cfg(test)] +mod tests { + use crate::{array::BooleanArray, buffer::ArcBuffer}; + + const INPUT: [bool; 4] = [true, true, false, true]; + const INPUT_NULLABLE: [Option; 4] = [Some(true), None, Some(false), Some(true)]; + + #[test] + fn from() { + let boolean_array = INPUT.into_iter().collect::(); + assert_eq!( + arrow_array::BooleanArray::from(boolean_array) + .into_iter() + .flatten() + .collect::>(), + INPUT + ); + + let boolean_array_arc = INPUT + .into_iter() + .collect::>(); + assert_eq!( + arrow_array::BooleanArray::from(boolean_array_arc) + .into_iter() + .flatten() + .collect::>(), + INPUT + ); + + let boolean_array_nullable = INPUT_NULLABLE.into_iter().collect::>(); + assert_eq!( + arrow_array::BooleanArray::from(boolean_array_nullable) + .into_iter() + .collect::>(), + INPUT_NULLABLE + ); + } + + #[test] + #[should_panic(expected = "expected array with a null buffer")] + fn into_nullable() { + let boolean_array = arrow_array::BooleanArray::from(INPUT.into_iter().collect::>()); + let _ = BooleanArray::::from( + boolean_array, + ); + } + + #[test] + #[should_panic(expected = "expected array without a null buffer")] + fn into_non_nullable() { + let boolean_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::(); + let _ = BooleanArray::::from( + boolean_array_nullable, + ); + } + + #[test] + fn into() { + let boolean_array = arrow_array::BooleanArray::from(INPUT.into_iter().collect::>()); + assert_eq!( + BooleanArray::::from( + boolean_array + ) + .into_iter() + .collect::>(), + INPUT + ); + + let boolean_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::(); + assert_eq!( + BooleanArray::::from( + boolean_array_nullable + ) + .into_iter() + .collect::>(), + INPUT_NULLABLE + ); + } +} diff --git a/src/arrow/array/fixed_size_primitive.rs b/src/arrow/array/fixed_size_primitive.rs new file mode 100644 index 00000000..d26e225e --- /dev/null +++ b/src/arrow/array/fixed_size_primitive.rs @@ -0,0 +1,214 @@ +//! Interop with `arrow-rs` fixed-sized primitive array. + +use std::sync::Arc; + +use arrow_array::types::{ + ArrowPrimitiveType, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, + UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use arrow_buffer::{NullBuffer, ScalarBuffer}; +use arrow_schema::{DataType, Field}; + +use crate::{ + array::FixedSizePrimitiveArray, arrow::ArrowArray, bitmap::Bitmap, buffer::BufferType, + nullable::Nullable, validity::Validity, FixedSize, +}; + +/// Create the `ArrowArray` impl and required conversions. +macro_rules! arrow_array_convert { + ($ty:ty, $primitive_type:ident, $data_type:ident) => { + impl ArrowArray + for FixedSizePrimitiveArray<$ty, NULLABLE, Buffer> + where + ::Buffer<$ty>: Validity, + { + type Array = arrow_array::PrimitiveArray<$primitive_type>; + + fn as_field(name: &str) -> arrow_schema::Field { + Field::new(name, DataType::$data_type, NULLABLE) + } + } + + impl From> + for FixedSizePrimitiveArray<$ty, NULLABLE, Buffer> + where + ::Buffer<$ty>: Validity, + Self: From>, + { + fn from(value: Arc) -> Self { + Self::from(arrow_array::PrimitiveArray::<$primitive_type>::from( + value.to_data(), + )) + } + } + }; +} + +arrow_array_convert!(u8, UInt8Type, UInt8); +arrow_array_convert!(u16, UInt16Type, UInt16); +arrow_array_convert!(u32, UInt32Type, UInt32); +arrow_array_convert!(u64, UInt64Type, UInt64); + +arrow_array_convert!(i8, Int8Type, Int8); +arrow_array_convert!(i16, Int16Type, Int16); +arrow_array_convert!(i32, Int32Type, Int32); +arrow_array_convert!(i64, Int64Type, Int64); + +arrow_array_convert!(f32, Float32Type, Float32); +arrow_array_convert!(f64, Float64Type, Float64); + +impl, Buffer: BufferType> + From> for arrow_array::PrimitiveArray +where + ::Buffer: Into>, +{ + fn from(value: FixedSizePrimitiveArray) -> Self { + arrow_array::PrimitiveArray::new(value.0.into(), None) + } +} + +impl, Buffer: BufferType> + From> for arrow_array::PrimitiveArray +where + ::Buffer: Into>, + Bitmap: Into, +{ + fn from(value: FixedSizePrimitiveArray) -> Self { + arrow_array::PrimitiveArray::new(value.0.data.into(), Some(value.0.validity.into())) + } +} + +/// Panics when there are nulls +impl, Buffer: BufferType> + From> for FixedSizePrimitiveArray +where + ::Buffer: From>, +{ + fn from(value: arrow_array::PrimitiveArray) -> Self { + let (_data_type, values, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(_) => panic!("expected array without a null buffer"), + None => FixedSizePrimitiveArray(values.into()), + } + } +} + +/// Panics when there are no nulls +impl, Buffer: BufferType> + From> for FixedSizePrimitiveArray +where + ::Buffer: From>, + Bitmap: From, +{ + fn from(value: arrow_array::PrimitiveArray) -> Self { + let (_data_type, values, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(null_buffer) => FixedSizePrimitiveArray(Nullable { + data: values.into(), + validity: null_buffer.into(), + }), + None => panic!("expected array with a null buffer"), + } + } +} + +impl From for FixedSizePrimitiveArray +where + ::Buffer: From, +{ + fn from(value: arrow_buffer::Buffer) -> Self { + Self(value.into()) + } +} + +#[cfg(test)] +mod tests { + use arrow_array::types::{UInt16Type, UInt32Type}; + + use crate::array::FixedSizePrimitiveArray; + + const INPUT: [u32; 4] = [1, 2, 3, 4]; + const INPUT_NULLABLE: [Option; 4] = [Some(1), None, Some(3), Some(4)]; + + #[test] + fn from() { + let fixed_size_primitive_array = INPUT.into_iter().collect::>(); + assert_eq!( + arrow_array::PrimitiveArray::::from(fixed_size_primitive_array) + .into_iter() + .flatten() + .collect::>(), + INPUT + ); + + let fixed_size_primitive_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::>(); + assert_eq!( + arrow_array::PrimitiveArray::::from(fixed_size_primitive_array_nullable) + .into_iter() + .collect::>(), + INPUT_NULLABLE + ); + } + + #[test] + #[should_panic(expected = "expected array with a null buffer")] + fn into_nullable() { + let primitive_array = INPUT + .into_iter() + .collect::>(); + let _ = FixedSizePrimitiveArray::< + u32, + true, + crate::arrow::buffer::scalar_buffer::ArrowScalarBuffer, + >::from(primitive_array); + } + + #[test] + #[should_panic(expected = "expected array without a null buffer")] + fn into_non_nullable() { + let primitive_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::>(); + let _ = FixedSizePrimitiveArray::< + u16, + false, + crate::arrow::buffer::scalar_buffer::ArrowScalarBuffer, + >::from(primitive_array_nullable); + } + + #[test] + #[allow(clippy::redundant_closure_for_method_calls)] + fn into() { + let primitive_array = INPUT + .into_iter() + .collect::>(); + assert_eq!( + FixedSizePrimitiveArray::< + u32, + false, + crate::arrow::buffer::scalar_buffer::ArrowScalarBuffer, + >::from(primitive_array) + .into_iter() + .copied() + .collect::>(), + INPUT + ); + + let primitive_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::>(); + assert_eq!( + FixedSizePrimitiveArray::< + u16, + true, + crate::arrow::buffer::scalar_buffer::ArrowScalarBuffer, + >::from(primitive_array_nullable) + .into_iter() + .map(|opt| opt.copied()) + .collect::>(), + INPUT_NULLABLE + ); + } +} diff --git a/src/arrow/array/mod.rs b/src/arrow/array/mod.rs new file mode 100644 index 00000000..419fd90c --- /dev/null +++ b/src/arrow/array/mod.rs @@ -0,0 +1,8 @@ +//! Interop with [`arrow-array`]. + +mod boolean; +mod fixed_size_primitive; +mod string; +mod r#struct; +pub use r#struct::StructArrayTypeFields; +mod variable_size_list; diff --git a/src/arrow/array/string.rs b/src/arrow/array/string.rs new file mode 100644 index 00000000..7f7cb867 --- /dev/null +++ b/src/arrow/array/string.rs @@ -0,0 +1,188 @@ +//! Interop with [`arrow-rs`] string array. + +use std::sync::Arc; + +use arrow_array::OffsetSizeTrait; +use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_schema::{DataType, Field}; + +use crate::{ + array::{FixedSizePrimitiveArray, StringArray, VariableSizeBinaryArray}, + arrow::ArrowArray, + bitmap::Bitmap, + buffer::BufferType, + nullable::Nullable, + offset::{Offset, OffsetElement}, + validity::Validity, +}; + +impl + ArrowArray for StringArray +where + ::Buffer: Validity, +{ + type Array = arrow_array::GenericStringArray; + + fn as_field(name: &str) -> arrow_schema::Field { + Field::new(name, DataType::Utf8, NULLABLE) + } +} + +impl + From> for StringArray +where + ::Buffer: Validity, + Self: From>, +{ + fn from(value: Arc) -> Self { + Self::from(arrow_array::GenericStringArray::::from( + value.to_data(), + )) + } +} + +impl + From> for arrow_array::GenericStringArray +where + ::Buffer: Into>, + FixedSizePrimitiveArray: Into, +{ + fn from(value: StringArray) -> Self { + arrow_array::GenericStringArray::new( + // Safety: + // - The narrow offfset buffer contains valid offset data + unsafe { OffsetBuffer::new_unchecked(value.0 .0.offsets.into()) }, + value.0 .0.data.into(), + None, + ) + } +} + +impl + From> for arrow_array::GenericStringArray +where + ::Buffer: Into>, + FixedSizePrimitiveArray: Into, + Bitmap: Into, +{ + fn from(value: StringArray) -> Self { + arrow_array::GenericStringArray::new( + // Safety: + // - The narrow offfset buffer contains valid offset data + unsafe { OffsetBuffer::new_unchecked(value.0 .0.offsets.data.into()) }, + value.0 .0.data.into(), + Some(value.0 .0.offsets.validity.into()), + ) + } +} + +/// Panics when there are nulls +impl + From> for StringArray +where + FixedSizePrimitiveArray: From, + ::Buffer: From>, +{ + fn from(value: arrow_array::GenericStringArray) -> Self { + let (offsets, values, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(_) => panic!("expected array without a null buffer"), + None => StringArray(VariableSizeBinaryArray(Offset { + data: values.into(), + offsets: offsets.into_inner().into(), + })), + } + } +} + +/// Panics when there are no nulls +impl + From> for StringArray +where + FixedSizePrimitiveArray: From, + ::Buffer: From>, + Bitmap: From, +{ + fn from(value: arrow_array::GenericStringArray) -> Self { + let (offsets, values, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(null_buffer) => StringArray(VariableSizeBinaryArray(Offset { + data: values.into(), + offsets: Nullable { + data: offsets.into_inner().into(), + validity: null_buffer.into(), + }, + })), + None => panic!("expected array with a null buffer"), + } + } +} + +#[cfg(test)] +mod tests { + use std::i64; + + use crate::{array::StringArray, arrow::scalar_buffer::ArrowScalarBuffer}; + + const INPUT: [&str; 3] = ["hello", "world", "!"]; + const INPUT_NULLABLE: [Option<&str>; 3] = [Some("hello"), None, Some("!")]; + + #[test] + fn from() { + let string_array = INPUT.into_iter().collect::(); + assert_eq!( + arrow_array::StringArray::from(string_array) + .into_iter() + .flatten() + .collect::>(), + INPUT + ); + + let string_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::>(); + assert_eq!( + arrow_array::GenericStringArray::::from(string_array_nullable) + .into_iter() + .collect::>(), + INPUT_NULLABLE + ); + } + + #[test] + #[should_panic(expected = "expected array with a null buffer")] + fn into_nullable() { + let string_array = INPUT + .into_iter() + .map(ToOwned::to_owned) + .map(Option::Some) + .collect::(); + let _: StringArray = string_array.into(); + } + + #[test] + #[should_panic(expected = "expected array without a null buffer")] + fn into_non_nullable() { + let string_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::(); + let _: StringArray = string_array_nullable.into(); + } + + #[test] + fn into() { + let string_array = INPUT + .into_iter() + .map(ToOwned::to_owned) + .map(Option::Some) + .collect::(); + let _: StringArray = string_array.into(); + // todo(mbrobbel): intoiterator for stringarray + + let string_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::(); + let _: StringArray = string_array_nullable.into(); + // todo(mbrobbel): intoiterator for stringarray + } +} diff --git a/src/arrow/array/struct.rs b/src/arrow/array/struct.rs new file mode 100644 index 00000000..e3bb2dbb --- /dev/null +++ b/src/arrow/array/struct.rs @@ -0,0 +1,240 @@ +//! Interop with [`arrow-rs`] struct arrays. + +use std::sync::Arc; + +use arrow_buffer::NullBuffer; +use arrow_schema::{DataType, Field, Fields}; + +use crate::{ + array::{StructArray, StructArrayType}, + arrow::ArrowArray, + bitmap::Bitmap, + buffer::BufferType, + nullable::Nullable, + validity::Validity, +}; + +/// Arrow schema interop trait for the fields of a struct array type. +pub trait StructArrayTypeFields { + /// Returns the fields of this struct array. + fn fields() -> Fields; +} + +impl ArrowArray + for StructArray +where + ::Array: Validity + StructArrayTypeFields, +{ + type Array = arrow_array::StructArray; + + fn as_field(name: &str) -> arrow_schema::Field { + Field::new( + name, + DataType::Struct( + <::Array as StructArrayTypeFields>::fields(), + ), + NULLABLE, + ) + } +} + +impl From> + for StructArray +where + ::Array: Validity, + Self: From, +{ + fn from(value: Arc) -> Self { + Self::from(arrow_array::StructArray::from(value.to_data())) + } +} + +impl From> + for arrow_array::StructArray +where + ::Array: + StructArrayTypeFields + Into>>, +{ + fn from(value: StructArray) -> Self { + // Safety: + // - struct arrays are valid by construction + unsafe { + arrow_array::StructArray::new_unchecked( + <::Array as StructArrayTypeFields>::fields(), + // value.0.into_arrays(), + value.0.into(), + None, + ) + } + } +} + +impl From> + for arrow_array::StructArray +where + ::Array: + StructArrayTypeFields + Into>>, + Bitmap: Into, +{ + fn from(value: StructArray) -> Self { + // Safety: + // - struct arrays are valid by construction + unsafe { + arrow_array::StructArray::new_unchecked( + <::Array as StructArrayTypeFields>::fields(), + value.0.data.into(), + Some(value.0.validity.into()), + ) + } + } +} + +impl From + for StructArray +where + ::Array: From>>, +{ + fn from(value: arrow_array::StructArray) -> Self { + let (_fields, arrays, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(_) => panic!("expected array without a null buffer"), + None => StructArray(arrays.into()), + } + } +} + +impl From + for StructArray +where + ::Array: From>>, + Bitmap: From, +{ + fn from(value: arrow_array::StructArray) -> Self { + let (_fields, arrays, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(null_buffer) => StructArray(Nullable { + data: arrays.into(), + validity: null_buffer.into(), + }), + None => panic!("expected array with a null buffer"), + } + } +} + +impl + From> for arrow_array::RecordBatch +where + ::Array: Validity, + arrow_array::StructArray: From>, +{ + fn from(value: StructArray) -> Self { + Self::from(arrow_array::StructArray::from(value)) + } +} + +impl From + for StructArray +where + ::Array: Validity, + Self: From, +{ + fn from(value: arrow_array::RecordBatch) -> Self { + Self::from(arrow_array::StructArray::from(value)) + } +} + +#[cfg(test)] +mod tests { + + use arrow_array::Array as _; + + use crate::{ + array::union::{self, UnionType}, + array::ArrayType, + arrow::buffer_builder::ArrowBufferBuilder, + offset::{self, OffsetElement}, + }; + + use super::*; + + #[derive(Default)] + struct Foo { + a: u32, + } + struct FooArray { + a: ::Array, + } + impl ArrayType for Foo { + type Array = + StructArray; + } + impl ArrayType for Option { + type Array = + StructArray; + } + impl Default for FooArray + where + ::Array: Default, + { + fn default() -> Self { + Self { + a: ::Array::::default(), + } + } + } + impl Extend for FooArray + where + ::Array: Extend, + { + fn extend>(&mut self, iter: I) { + iter.into_iter().for_each(|Foo { a }| { + self.a.extend(std::iter::once(a)); + }); + } + } + impl FromIterator for FooArray + where + ::Array: Default + Extend, + { + fn from_iter>(iter: T) -> Self { + let (a, _): (_, Vec<_>) = iter.into_iter().map(|Foo { a }| (a, ())).unzip(); + Self { a } + } + } + impl StructArrayType for Foo { + type Array = FooArray; + } + impl StructArrayTypeFields for FooArray { + fn fields() -> Fields { + Fields::from(vec![Field::new("a", DataType::UInt32, false)]) + } + } + impl From> for Vec> + where + ::Array: + Into<<::Array as ArrowArray>::Array>, + { + fn from(value: FooArray) -> Self { + vec![Arc::< + <::Array as ArrowArray>::Array, + >::new(value.a.into())] + } + } + + #[test] + fn from() { + let struct_array = [Foo { a: 1 }, Foo { a: 2 }] + .into_iter() + .collect::>(); + let struct_array_arrow = arrow_array::StructArray::from(struct_array); + assert_eq!(struct_array_arrow.len(), 2); + + let struct_array_nullable = [Some(Foo { a: 1234 }), None] + .into_iter() + .collect::>(); + let struct_array_arrow_nullable = arrow_array::StructArray::from(struct_array_nullable); + assert_eq!(struct_array_arrow_nullable.len(), 2); + assert!(struct_array_arrow_nullable.is_valid(0)); + assert!(struct_array_arrow_nullable.is_null(1)); + } +} diff --git a/src/arrow/array/variable_size_list.rs b/src/arrow/array/variable_size_list.rs new file mode 100644 index 00000000..b4865ec0 --- /dev/null +++ b/src/arrow/array/variable_size_list.rs @@ -0,0 +1,250 @@ +//! Interop with [`arrow-rs`] string array. + +use std::sync::Arc; + +use arrow_array::OffsetSizeTrait; +use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_schema::{DataType, Field}; + +use crate::{ + array::{Array, VariableSizeListArray}, + arrow::ArrowArray, + bitmap::Bitmap, + buffer::BufferType, + nullable::Nullable, + offset::{Offset, OffsetElement}, + validity::Validity, +}; + +impl< + T: ArrowArray, + const NULLABLE: bool, + OffsetItem: OffsetElement + OffsetSizeTrait, + Buffer: BufferType, + > ArrowArray for VariableSizeListArray +where + ::Buffer: Validity, +{ + type Array = arrow_array::GenericListArray; + + fn as_field(name: &str) -> arrow_schema::Field { + Field::new( + name, + DataType::List(Arc::new(T::as_field("item"))), + NULLABLE, + ) + } +} + +impl< + T: Array, + const NULLABLE: bool, + OffsetItem: OffsetElement + OffsetSizeTrait, + Buffer: BufferType, + > From> for VariableSizeListArray +where + ::Buffer: Validity, + Self: From>, +{ + fn from(value: Arc) -> Self { + Self::from(arrow_array::GenericListArray::::from( + value.to_data(), + )) + } +} + +impl + From> + for arrow_array::GenericListArray +where + ::Buffer: Into>, + ::Array: From + 'static, +{ + fn from(value: VariableSizeListArray) -> Self { + arrow_array::GenericListArray::new( + Arc::new(T::as_field("item")), + // Safety: + // - The narrow offfset buffer contains valid offset data + unsafe { OffsetBuffer::new_unchecked(value.0.offsets.into()) }, + Arc::<::Array>::new(value.0.data.into()), + None, + ) + } +} + +impl + From> + for arrow_array::GenericListArray +where + ::Buffer: Into>, + Bitmap: Into, + ::Array: From + 'static, +{ + fn from(value: VariableSizeListArray) -> Self { + arrow_array::GenericListArray::new( + Arc::new(T::as_field("item")), + // Safety: + // - The narrow offfset buffer contains valid offset data + unsafe { OffsetBuffer::new_unchecked(value.0.offsets.data.into()) }, + Arc::<::Array>::new(value.0.data.into()), + Some(value.0.offsets.validity.into()), + ) + } +} + +/// Panics when there are nulls +impl + From> + for VariableSizeListArray +where + T: From>, + ::Buffer: From>, +{ + fn from(value: arrow_array::GenericListArray) -> Self { + let (_field, offsets, values, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(_) => panic!("expected array without a null buffer"), + None => VariableSizeListArray(Offset { + data: values.into(), + offsets: offsets.into_inner().into(), + }), + } + } +} + +/// Panics when there are no nulls +impl + From> + for VariableSizeListArray +where + T: From>, + ::Buffer: From>, + Bitmap: From, +{ + fn from(value: arrow_array::GenericListArray) -> Self { + let (_field, offsets, values, nulls_opt) = value.into_parts(); + match nulls_opt { + Some(null_buffer) => VariableSizeListArray(Offset { + data: values.into(), + offsets: Nullable { + data: offsets.into_inner().into(), + validity: null_buffer.into(), + }, + }), + None => panic!("expected array with a null buffer"), + } + } +} + +#[cfg(test)] +mod tests { + use arrow_array::{ + builder::{ListBuilder, StringBuilder}, + types::UInt16Type, + Array as _, + }; + + use crate::{ + array::{StringArray, Uint16Array, VariableSizeListArray}, + arrow::scalar_buffer::ArrowScalarBuffer, + Length, + }; + + const INPUT: [&[u16]; 3] = [&[1, 2], &[3], &[4]]; + const INPUT_NULLABLE: [Option<&[&str]>; 3] = + [Some(&["hello", " "]), None, Some(&["world", "!"])]; + + #[test] + fn from() { + let variable_size_list_array = INPUT + .into_iter() + .collect::>(); + let list_array = arrow_array::ListArray::from(variable_size_list_array); + assert_eq!(list_array.len(), INPUT.len()); + + let variable_size_list_array_nullable = INPUT_NULLABLE + .into_iter() + .collect::>(); + let list_array_nullable = arrow_array::ListArray::from(variable_size_list_array_nullable); + assert_eq!(list_array_nullable.len(), INPUT_NULLABLE.len()); + } + + #[test] + #[should_panic(expected = "expected array with a null buffer")] + fn into_nullable() { + let list_array = arrow_array::ListArray::from_iter_primitive::( + INPUT + .into_iter() + .map(|opt| opt.iter().copied().map(Option::Some)) + .map(Option::Some), + ); + let _: VariableSizeListArray< + Uint16Array, + true, + i32, + ArrowScalarBuffer, + > = list_array.into(); + } + + #[test] + #[should_panic(expected = "expected array without a null buffer")] + fn into_non_nullable() { + let mut list_builder = + ListBuilder::with_capacity(StringBuilder::new(), INPUT_NULLABLE.len()); + INPUT_NULLABLE.into_iter().for_each(|opt| match opt { + Some(items) => { + for item in items { + list_builder.values().append_value(item); + } + list_builder.append(true); + } + None => { + list_builder.append(false); + } + }); + let list_array_nullable = list_builder.finish(); + let _: VariableSizeListArray< + StringArray, + false, + i32, + ArrowScalarBuffer, + > = list_array_nullable.into(); + } + + #[test] + fn into() { + let list_array = arrow_array::ListArray::from_iter_primitive::( + INPUT + .into_iter() + .map(|opt| opt.iter().copied().map(Option::Some)) + .map(Option::Some), + ); + let _: VariableSizeListArray< + Uint16Array, + false, + i32, + ArrowScalarBuffer, + > = list_array.into(); + + let mut list_builder = + ListBuilder::with_capacity(StringBuilder::new(), INPUT_NULLABLE.len()); + INPUT_NULLABLE.into_iter().for_each(|opt| match opt { + Some(items) => { + for item in items { + list_builder.values().append_value(item); + } + list_builder.append(true); + } + None => { + list_builder.append(false); + } + }); + let list_array_nullable = list_builder.finish(); + let _: VariableSizeListArray< + StringArray, + true, + i32, + ArrowScalarBuffer, + > = list_array_nullable.into(); + } +} diff --git a/src/arrow/buffer/boolean_buffer.rs b/src/arrow/buffer/boolean_buffer.rs new file mode 100644 index 00000000..9ee17bc9 --- /dev/null +++ b/src/arrow/buffer/boolean_buffer.rs @@ -0,0 +1,78 @@ +//! Interop with [`arrow-rs`] boolean buffer. + +use arrow_buffer::BooleanBuffer; + +use crate::{bitmap::Bitmap, buffer::BufferType, Length}; + +impl Length for BooleanBuffer { + fn len(&self) -> usize { + BooleanBuffer::len(self) + } +} + +impl From> for BooleanBuffer +where + ::Buffer: Into, +{ + fn from(value: Bitmap) -> Self { + Self::new(value.buffer.into(), value.offset, value.bits) + } +} + +impl From for Bitmap +where + ::Buffer: From, +{ + fn from(value: BooleanBuffer) -> Self { + let bits = value.len(); + let offset = value.offset(); + Bitmap { + buffer: value.into_inner().into(), + bits, + offset, + } + } +} + +#[cfg(test)] +mod tests { + use crate::{arrow::buffer::scalar_buffer::ArrowScalarBuffer, buffer::ArcBuffer}; + + use super::*; + + const INPUT: [bool; 4] = [true, true, false, true]; + + #[test] + fn length() { + let boolean_buffer = INPUT.into_iter().collect::(); + assert_eq!(Length::len(&boolean_buffer), INPUT.len()); + } + + #[test] + fn from() { + let bitmap = INPUT.into_iter().collect::(); + assert_eq!( + BooleanBuffer::from(bitmap).into_iter().collect::>(), + INPUT + ); + + let bitmap_arc = INPUT.into_iter().collect::>(); + assert_eq!( + BooleanBuffer::from(bitmap_arc) + .into_iter() + .collect::>(), + INPUT + ); + } + + #[test] + fn into() { + let boolean_buffer = INPUT.into_iter().collect::(); + assert_eq!( + Bitmap::::from(boolean_buffer) + .into_iter() + .collect::>(), + INPUT + ); + } +} diff --git a/src/arrow/buffer/buffer_builder.rs b/src/arrow/buffer/buffer_builder.rs new file mode 100644 index 00000000..2270f242 --- /dev/null +++ b/src/arrow/buffer/buffer_builder.rs @@ -0,0 +1,110 @@ +//! Interop with [`arrow-rs`] buffer builder. + +use arrow_buffer::BufferBuilder; + +use crate::{ + array::FixedSizePrimitiveArray, + buffer::{Buffer, BufferMut, BufferType}, + FixedSize, Index, Length, +}; + +/// A [`BufferType`] implementation for [`BufferBuilder`]. +#[derive(Clone, Copy)] +pub struct ArrowBufferBuilder; + +impl BufferType for ArrowBufferBuilder { + type Buffer = BufferBuilder; +} + +impl Buffer for BufferBuilder { + fn as_slice(&self) -> &[T] { + BufferBuilder::as_slice(self) + } +} + +impl BufferMut for BufferBuilder { + fn as_mut_slice(&mut self) -> &mut [T] { + BufferBuilder::as_slice_mut(self) + } +} + +impl Index for BufferBuilder { + type Item<'a> = &'a T + where + Self: 'a; + + unsafe fn index_unchecked(&self, index: usize) -> Self::Item<'_> { + self.as_slice().get_unchecked(index) + } +} + +impl Length for BufferBuilder { + fn len(&self) -> usize { + BufferBuilder::len(self) + } +} + +impl From> + for BufferBuilder +{ + fn from(value: FixedSizePrimitiveArray) -> Self { + // Note: this makes a copy + let buffer = arrow_buffer::MutableBuffer::from(value.0.as_slice().to_vec()); + BufferBuilder::new_from_buffer(buffer) + } +} + +impl From> + for FixedSizePrimitiveArray +where + ::Buffer: From, +{ + fn from(mut value: BufferBuilder) -> Self { + FixedSizePrimitiveArray(value.finish().into()) + } +} + +#[cfg(test)] +mod tests { + use crate::{arrow::scalar_buffer::ArrowScalarBuffer, buffer::ArcBuffer}; + + use super::*; + + const INPUT: [u32; 4] = [1, 2, 3, 4]; + + #[test] + fn length() { + let buffer_builder = INPUT.into_iter().collect::>(); + assert_eq!(Length::len(&buffer_builder), INPUT.len()); + } + + #[test] + fn from() { + let fixed_size_primitive_array = INPUT.into_iter().collect::>(); + assert_eq!( + BufferBuilder::from(fixed_size_primitive_array).as_slice(), + INPUT + ); + + let fixed_size_primitive_array_arc = + INPUT + .into_iter() + .collect::>(); + assert_eq!( + BufferBuilder::from(fixed_size_primitive_array_arc).as_slice(), + INPUT + ); + } + + #[test] + fn into() { + let buffer_builder = INPUT.into_iter().collect::>(); + assert_eq!( + FixedSizePrimitiveArray::<_, false, ArrowScalarBuffer>::from(buffer_builder) + .into_iter() + .copied() + .collect::>(), + INPUT + ); + } +} diff --git a/src/arrow/buffer/mod.rs b/src/arrow/buffer/mod.rs new file mode 100644 index 00000000..e532be23 --- /dev/null +++ b/src/arrow/buffer/mod.rs @@ -0,0 +1,7 @@ +//! Interop with [`arrow-rs`] buffer types. + +pub mod boolean_buffer; +pub mod buffer_builder; +pub mod null_buffer; +pub mod offset_buffer; +pub mod scalar_buffer; diff --git a/src/arrow/buffer/null_buffer.rs b/src/arrow/buffer/null_buffer.rs new file mode 100644 index 00000000..69a25404 --- /dev/null +++ b/src/arrow/buffer/null_buffer.rs @@ -0,0 +1,70 @@ +//! Interop with [`arrow-rs`] null buffer. + +use arrow_buffer::{BooleanBuffer, NullBuffer}; + +use crate::{bitmap::Bitmap, buffer::BufferType, Length}; + +impl Length for NullBuffer { + fn len(&self) -> usize { + NullBuffer::len(self) + } +} + +impl From> for NullBuffer +where + Bitmap: Into, +{ + fn from(value: Bitmap) -> Self { + Self::new(value.into()) + } +} + +impl From for Bitmap +where + Bitmap: From, +{ + fn from(value: NullBuffer) -> Self { + Bitmap::from(value.into_inner()) + } +} + +#[cfg(test)] +mod tests { + use crate::{arrow::buffer::scalar_buffer::ArrowScalarBuffer, buffer::ArcBuffer}; + + use super::*; + + const INPUT: [bool; 4] = [true, true, false, true]; + + #[test] + fn length() { + let null_buffer = INPUT.into_iter().collect::(); + assert_eq!(Length::len(&null_buffer), INPUT.len()); + } + + #[test] + fn from() { + let bitmap = INPUT.into_iter().collect::(); + assert_eq!( + NullBuffer::from(bitmap).into_iter().collect::>(), + INPUT + ); + + let bitmap_arc = INPUT.into_iter().collect::>(); + assert_eq!( + NullBuffer::from(bitmap_arc).into_iter().collect::>(), + INPUT + ); + } + + #[test] + fn into() { + let null_buffer = INPUT.into_iter().collect::(); + assert_eq!( + Bitmap::::from(null_buffer) + .into_iter() + .collect::>(), + INPUT + ); + } +} diff --git a/src/arrow/buffer/offset_buffer.rs b/src/arrow/buffer/offset_buffer.rs new file mode 100644 index 00000000..7cc80b33 --- /dev/null +++ b/src/arrow/buffer/offset_buffer.rs @@ -0,0 +1,85 @@ +//! Interop with [`arrow-rs`] offset buffer. + +//! Interop with [`arrow-rs`] null buffer. + +use arrow_buffer::{OffsetBuffer, ScalarBuffer}; + +use crate::{array::FixedSizePrimitiveArray, buffer::BufferType, offset::OffsetElement, Length}; + +impl Length for OffsetBuffer { + fn len(&self) -> usize { + self.as_ref().len() + } +} + +impl + From> for OffsetBuffer +where + FixedSizePrimitiveArray: Into>, +{ + fn from(value: FixedSizePrimitiveArray) -> Self { + Self::new(value.into()) + } +} + +impl From> + for FixedSizePrimitiveArray +where + ::Buffer: From>, +{ + fn from(value: OffsetBuffer) -> Self { + Self(value.into_inner().into()) + } +} + +#[cfg(test)] +mod tests { + + use crate::{ + array::FixedSizePrimitiveArray, arrow::buffer::scalar_buffer::ArrowScalarBuffer, + buffer::ArcBuffer, + }; + + use super::*; + + const INPUT: [usize; 4] = [1, 1, 2, 2]; + + #[test] + fn length() { + let offset_buffer = OffsetBuffer::::from_lengths(INPUT); + assert_eq!(Length::len(&offset_buffer), INPUT.len() + 1); + } + + #[test] + fn from() { + let fixed_size_primitive_array = INPUT + .into_iter() + .map(|x| x.try_into().expect("")) + .collect::>(); + assert_eq!( + OffsetBuffer::::from(fixed_size_primitive_array).as_ref(), + [1, 1, 2, 2] + ); + + let fixed_size_primitive_array_arc = INPUT + .into_iter() + .map(|x| x.try_into().expect("")) + .collect::>( + ); + assert_eq!( + OffsetBuffer::::from(fixed_size_primitive_array_arc).as_ref(), + [1, 1, 2, 2] + ); + } + + #[test] + fn into() { + let offset_buffer = OffsetBuffer::::from_lengths(INPUT); + assert_eq!( + FixedSizePrimitiveArray::::from(offset_buffer.clone()) + .into_iter() + .collect::>(), + offset_buffer.iter().collect::>() + ); + } +} diff --git a/src/arrow/buffer/scalar_buffer.rs b/src/arrow/buffer/scalar_buffer.rs new file mode 100644 index 00000000..8ea3e194 --- /dev/null +++ b/src/arrow/buffer/scalar_buffer.rs @@ -0,0 +1,113 @@ +//! Interop with [`arrow-rs`] scalar buffer. + +use arrow_buffer::ScalarBuffer; + +use crate::{ + array::FixedSizePrimitiveArray, + buffer::{Buffer, BufferType}, + FixedSize, Index, Length, +}; + +/// A [`BufferType`] implementation for [`ScalarBuffer`]. +#[derive(Clone, Copy)] +pub struct ArrowScalarBuffer; + +impl BufferType for ArrowScalarBuffer { + type Buffer = ScalarBuffer; +} + +impl Buffer for ScalarBuffer { + fn as_slice(&self) -> &[T] { + self + } +} + +impl Index for ScalarBuffer { + type Item<'a> = &'a T + where + Self: 'a; + + unsafe fn index_unchecked(&self, index: usize) -> Self::Item<'_> { + self.get_unchecked(index) + } +} + +impl Length for ScalarBuffer { + fn len(&self) -> usize { + self.as_ref().len() + } +} + +impl From> + for ScalarBuffer +where + ::Buffer: AsRef<[T]>, +{ + fn from(value: FixedSizePrimitiveArray) -> Self { + let len = value.len(); + // Note: this makes a copy + let buffer = arrow_buffer::Buffer::from_slice_ref(value.0.as_ref()); + ScalarBuffer::new(buffer, 0, len) + } +} + +impl From> + for FixedSizePrimitiveArray +where + ::Buffer: From, +{ + fn from(value: ScalarBuffer) -> Self { + FixedSizePrimitiveArray(value.into_inner().into()) + } +} + +#[cfg(test)] +mod tests { + use crate::buffer::ArcBuffer; + + use super::*; + + const INPUT: [u32; 4] = [1, 2, 3, 4]; + + #[test] + fn length() { + let scalar_buffer = INPUT.into_iter().collect::>(); + assert_eq!(Length::len(&scalar_buffer), INPUT.len()); + } + + #[test] + fn from() { + let fixed_size_primitive_array = INPUT.into_iter().collect::>(); + assert_eq!( + ScalarBuffer::from(fixed_size_primitive_array) + .into_iter() + .copied() + .collect::>(), + INPUT + ); + + let fixed_size_primitive_array_arc = + INPUT + .into_iter() + .collect::>(); + assert_eq!( + ScalarBuffer::from(fixed_size_primitive_array_arc) + .into_iter() + .copied() + .collect::>(), + INPUT + ); + } + + #[test] + fn into() { + let scalar_buffer = INPUT.into_iter().collect::>(); + assert_eq!( + FixedSizePrimitiveArray::<_, false, ArrowScalarBuffer>::from(scalar_buffer) + .into_iter() + .copied() + .collect::>(), + INPUT + ); + } +} diff --git a/src/arrow/mod.rs b/src/arrow/mod.rs new file mode 100644 index 00000000..5919eb65 --- /dev/null +++ b/src/arrow/mod.rs @@ -0,0 +1,21 @@ +//! Interop with the [`arrow-rs`] crate. +//! +//! [`arrow-rs`]: https://crates.io/crates/arrow + +mod array; +pub use array::StructArrayTypeFields; + +mod buffer; +pub use buffer::*; + +use crate::array::Array; +use arrow_schema::Field; + +/// Extension trait of [`Array`] for [`arrow-rs`] interop. +pub trait ArrowArray: Array + Sized { + /// The corresponding arrow array + type Array: arrow_array::Array; + + /// Returns the field of this array. + fn as_field(name: &str) -> Field; +} diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs index 9473c303..a100c6e9 100644 --- a/src/bitmap/mod.rs +++ b/src/bitmap/mod.rs @@ -42,14 +42,14 @@ pub trait BitmapRefMut: BitmapRef { // todo(mb): implement ops pub struct Bitmap { /// The bits are stored in this buffer of bytes. - buffer: ::Buffer, + pub(crate) buffer: ::Buffer, /// The number of bits stored in the bitmap. - bits: usize, + pub(crate) bits: usize, /// An offset (in number of bits) in the buffer. This enables zero-copy /// slicing of the bitmap on non-byte boundaries. - offset: usize, + pub(crate) offset: usize, } impl BitmapRef for Bitmap { diff --git a/src/fixed_size.rs b/src/fixed_size.rs index bde5af03..b8e5e229 100644 --- a/src/fixed_size.rs +++ b/src/fixed_size.rs @@ -3,6 +3,21 @@ use crate::array::ArrayType; use std::{fmt::Debug, mem}; +#[cfg(feature = "arrow-rs")] +/// Module that re-exports the [`arrow_buffer::ArrowNativeType`] trait. +mod arrow_rs { + pub use arrow_buffer::ArrowNativeType as _arrow_rs_trait; +} +#[cfg(not(feature = "arrow-rs"))] +/// Module with empty trait to work around [RFC-3399](https://rust-lang.github.io/rfcs/3399-cfg-attribute-in-where.html). +mod arrow_rs { + /// Empty trait. + pub trait Type {} + impl Type for T {} + pub use Type as _arrow_rs_trait; +} +use arrow_rs::_arrow_rs_trait; + /// Subtrait for fixed-size types. /// /// This exists to be used as trait bound where one or more of the supertraits @@ -10,11 +25,22 @@ use std::{fmt::Debug, mem}; /// fixed-size types. /// /// This trait is sealed to prevent downstream implementations. -pub trait FixedSize: ArrayType + Copy + Debug + Sized + sealed::Sealed + 'static { +pub trait FixedSize: + ArrayType + Copy + Debug + Sized + sealed::Sealed + 'static + _arrow_rs_trait +{ /// The fixed-size of this type in bytes. const SIZE: usize = mem::size_of::(); } +/// Private module for [`sealed::Sealed`] trait. +mod sealed { + /// Used to seal [`super::FixedSize`]. + pub trait Sealed {} + + // Prevent downstream implementation of [super::FixedSize]. + impl Sealed for T where T: super::FixedSize {} +} + impl FixedSize for i8 {} impl FixedSize for i16 {} impl FixedSize for i32 {} @@ -24,36 +50,30 @@ impl FixedSize for u8 {} impl FixedSize for u16 {} impl FixedSize for u32 {} impl FixedSize for u64 {} +#[cfg(not(feature = "arrow-rs"))] impl FixedSize for u128 {} +#[cfg(not(feature = "arrow-rs"))] impl FixedSize for isize {} +#[cfg(not(feature = "arrow-rs"))] impl FixedSize for usize {} impl FixedSize for f32 {} impl FixedSize for f64 {} -impl FixedSize for () {} - -impl FixedSize for [T; N] {} - -/// Private module for [`sealed::Sealed`] trait. -mod sealed { - /// Used to seal [`super::FixedSize`]. - pub trait Sealed {} - - /// Prevent downstream implementation of [`super::FixedSize`]. - impl Sealed for T where T: super::FixedSize {} -} +#[cfg(not(feature = "arrow-rs"))] +impl FixedSize for [T; N] {} #[cfg(test)] mod tests { - use super::*; + use super::FixedSize; #[test] fn size() { - assert_eq!(<()>::SIZE, 0); assert_eq!(u8::SIZE, 1); + #[cfg(not(feature = "arrow-rs"))] assert_eq!(<[u16; 21]>::SIZE, 42); + #[cfg(not(feature = "arrow-rs"))] assert_eq!(<[u8; 1234]>::SIZE, 1234); } } diff --git a/src/lib.rs b/src/lib.rs index 9a3ef6c8..937a1dc6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ //! //! An experimental (work-in-progress) implementation of [Apache Arrow](https://arrow.apache.org). +#![cfg_attr(docsrs, feature(doc_auto_cfg, doc_cfg))] #![doc( html_logo_url = "https://raw.githubusercontent.com/mbrobbel/narrow/main/narrow.svg", html_favicon_url = "https://raw.githubusercontent.com/mbrobbel/narrow/main/narrow.svg" @@ -77,6 +78,9 @@ pub(crate) mod validity; pub mod array; +#[cfg(feature = "arrow-rs")] +pub mod arrow; + // Re-export `narrow_derive` macros when the `derive` feature is enabled. #[cfg(feature = "derive")] pub use narrow_derive::ArrayType; diff --git a/tests/derive.rs b/tests/derive.rs index d1f49cbe..1653ab5a 100644 --- a/tests/derive.rs +++ b/tests/derive.rs @@ -2,6 +2,7 @@ mod tests { mod derive { mod r#struct { + #[cfg(not(feature = "arrow-rs"))] mod unit { use narrow::{ array::{StructArray, VariableSizeListArray}, @@ -64,6 +65,7 @@ mod tests { } } + #[cfg(not(feature = "arrow-rs"))] mod unnamed { use narrow::{ array::{StructArray, VariableSizeListArray}, @@ -141,6 +143,7 @@ mod tests { } } + #[cfg(not(feature = "arrow-rs"))] mod named { use narrow::{ array::{StructArray, VariableSizeListArray},