From ba08aa52ed9a1f4354ef4657f8e37f3f4a5bb136 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 6 Aug 2021 07:43:39 -0400 Subject: [PATCH 1/9] Tiny tweaks to release readme (#670) --- dev/release/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/release/README.md b/dev/release/README.md index 4c3f1ef8fc6b..c18dae8f68e0 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -260,7 +260,7 @@ For example, to backport `b2de5446cc1e45a0559fb39039d0545df1ac0d26` to active_re ```shell git clone git@github.com:apache/arrow-rs.git /tmp/arrow-rs -ARROW_GITHUB_API_TOKEN=$ARROW_GITHUB_API_TOKEN CHECKOUT_ROOT=/tmp/arrow-rs CHERRY_PICK_SHA=b2de5446cc1e45a0559fb39039d0545df1ac0d26 python3 dev/release/cherry-pick-pr.py +CHERRY_PICK_SHA=b2de5446cc1e45a0559fb39039d0545df1ac0d26 ARROW_GITHUB_API_TOKEN=$ARROW_GITHUB_API_TOKEN CHECKOUT_ROOT=/tmp/arrow-rs python3 dev/release/cherry-pick-pr.py ``` ## Labels @@ -268,7 +268,7 @@ ARROW_GITHUB_API_TOKEN=$ARROW_GITHUB_API_TOKEN CHECKOUT_ROOT=/tmp/arrow-rs CHERR There are two labels that help keep track of backporting: 1. [`cherry-picked`](https://github.com/apache/arrow-rs/labels/cherry-picked) for PRs that have been cherry-picked/backported to `active_release` -2. [`release-cherry-pick`](https://github.com/apache/arrow-rs/labels/release-cherry-pick) for the PRs that are the cherry pick +2. [`release-cherry-pick`](https://github.com/apache/arrow-rs/labels/release-cherry-pick) for the PRs that are the cherry pick to `active_release` You can find candidates to cherry pick using [this filter](https://github.com/apache/arrow-rs/pulls?q=is%3Apr+is%3Aclosed+-label%3Arelease-cherry-pick+-label%3Acherry-picked) From b682ef5f058433ea90af47fc016eccf7f356a622 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 6 Aug 2021 20:38:01 -0400 Subject: [PATCH 2/9] Add a note about arrow crate security / safety (#628) * Add note about safety to arrow README.md * Prettier * Remove note about making modules private --- arrow/README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/arrow/README.md b/arrow/README.md index 5f974df4532d..c5fdb11a279d 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -23,9 +23,13 @@ This crate contains the official Native Rust implementation of [Apache Arrow](https://arrow.apache.org/) in memory format. Please see the API documents for additional details. +## Versioning / Releases + +Unlike many other crates in the Rust ecosystem which spend extended time in "pre 1.0.0" state, releasing versions 0.x, the arrow-rs crate follows the versioning scheme of the overall [Apache Arrow](https://arrow.apache.org/) project in an effort to signal which language implementations have been integration tested with each other. + ## Features -The arrow crate provides the following optional features: +The arrow crate provides the following features which may be enabled: - `csv` (default) - support for reading and writing Arrow arrays to/from csv files - `ipc` (default) - support for the [arrow-flight]((https://crates.io/crates/arrow-flight) IPC and wire format @@ -35,6 +39,24 @@ The arrow crate provides the following optional features: implementations of some [compute](https://github.com/apache/arrow/tree/master/rust/arrow/src/compute) kernels using explicit SIMD processor intrinsics. +## Safety + +TLDR: You should avoid using the `alloc` and `buffer` and `bitmap` modules if at all possible. These modules contain `unsafe` code and are easy to misuse. 
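+
+For most uses the safe, strongly typed constructors are sufficient. As a rough sketch (using only the public `arrow::array` API; exact signatures may differ between releases), a `Float64Array` can be built without touching raw buffers at all:
+
+```rust
+use arrow::array::{Array, Float64Array};
+
+// The typed constructor computes the length and null bitmap for you.
+let array = Float64Array::from(vec![Some(1.0), None, Some(3.0)]);
+
+assert_eq!(array.len(), 3);
+assert!(array.is_null(1));
+assert_eq!(array.value(2), 3.0);
+```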
+
+As with all open source code, you should carefully evaluate the suitability of `arrow` for your project, taking into consideration your needs and risk tolerance prior to use.
+
+_Background_: There are various parts of the `arrow` crate which use `unsafe` and `transmute` code internally. We are actively working as a community to minimize undefined behavior and remove `unsafe` usage to align more with Rust's core principles of safety (e.g. the arrow2 project).
+
+As `arrow` exists today, it is fairly easy to misuse the APIs, leading to undefined behavior, and it is especially easy to misuse the code in the modules named above. For example, as described in [the arrow2 crate](https://github.com/jorgecarleitao/arrow2#why), the following code compiles, does not panic, but results in undefined behavior:
+
+```rust
+let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
+let data = ArrayData::new(DataType::Int64, 10, 0, None, 0, vec![buffer], vec![]);
+let array = Float64Array::from(Arc::new(data));
+
+println!("{:?}", array.value(1));
+```
+
 ## Building for WASM
 
 In order to compile Arrow for Web Assembly (the `wasm32-unknown-unknown` WASM target), you will likely need to turn off this crate's default features and use the `js` feature.

From 4618ef539521a09a1e46246a29ea31807e98bb7c Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sun, 8 Aug 2021 03:46:14 -0400
Subject: [PATCH 3/9] Fix parquet string statistics generation (#643)

* Fix string statistics generation, add tests

* fix Int96 stats test

* Add notes for additional tickets
---
 parquet/src/column/writer.rs | 122 +++++++++++++++++++++++++++++++++++
 parquet/src/data_type.rs     |  29 ++++-----
 2 files changed, 134 insertions(+), 17 deletions(-)

diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs
index d5b845756da2..3cb17e17f7f6 100644
--- a/parquet/src/column/writer.rs
+++ b/parquet/src/column/writer.rs
@@ -1687,6 +1687,128 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_bool_statistics() {
+        let stats = statistics_roundtrip::<BoolType>(&[true, false, false, true]);
+        assert!(stats.has_min_max_set());
+        // should it be BooleanStatistics??
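+        // (at this point boolean columns still reuse the Int32 statistics path, so
+        // min/max round-trip as 0/1 rather than false/true; tracked in:)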
+        // https://github.com/apache/arrow-rs/issues/659
+        if let Statistics::Int32(stats) = stats {
+            assert_eq!(stats.min(), &0);
+            assert_eq!(stats.max(), &1);
+        } else {
+            panic!("expecting Statistics::Int32, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int32_statistics() {
+        let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int32(stats) = stats {
+            assert_eq!(stats.min(), &-2);
+            assert_eq!(stats.max(), &3);
+        } else {
+            panic!("expecting Statistics::Int32, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int64_statistics() {
+        let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int64(stats) = stats {
+            assert_eq!(stats.min(), &-2);
+            assert_eq!(stats.max(), &3);
+        } else {
+            panic!("expecting Statistics::Int64, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int96_statistics() {
+        let input = vec![
+            Int96::from(vec![1, 20, 30]),
+            Int96::from(vec![3, 20, 10]),
+            Int96::from(vec![0, 20, 30]),
+            Int96::from(vec![2, 20, 30]),
+        ]
+        .into_iter()
+        .collect::<Vec<Int96>>();
+
+        let stats = statistics_roundtrip::<Int96Type>(&input);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int96(stats) = stats {
+            assert_eq!(stats.min(), &Int96::from(vec![0, 20, 30]));
+            assert_eq!(stats.max(), &Int96::from(vec![3, 20, 10]));
+        } else {
+            panic!("expecting Statistics::Int96, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_float_statistics() {
+        let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Float(stats) = stats {
+            assert_eq!(stats.min(), &-2.0);
+            assert_eq!(stats.max(), &3.0);
+        } else {
+            panic!("expecting Statistics::Float, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_double_statistics() {
+        let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0, 2.0]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Double(stats) = stats {
+            assert_eq!(stats.min(), &-2.0);
+            assert_eq!(stats.max(), &3.0);
+        } else {
+            panic!("expecting Statistics::Double, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_byte_array_statistics() {
+        let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
+            .iter()
+            .map(|&s| s.into())
+            .collect::<Vec<ByteArray>>();
+
+        let stats = statistics_roundtrip::<ByteArrayType>(&input);
+        assert!(stats.has_min_max_set());
+        if let Statistics::ByteArray(stats) = stats {
+            assert_eq!(stats.min(), &ByteArray::from("aaw"));
+            assert_eq!(stats.max(), &ByteArray::from("zz"));
+        } else {
+            panic!("expecting Statistics::ByteArray, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_fixed_len_byte_array_statistics() {
+        let input = vec!["aawaa", "zz   ", "aaw  ", "m    ", "qrs  "]
+            .iter()
+            .map(|&s| {
+                let b: ByteArray = s.into();
+                b.into()
+            })
+            .collect::<Vec<FixedLenByteArray>>();
+
+        let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
+        assert!(stats.has_min_max_set());
+        // should it be FixedLenByteArray?
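+        // (fixed-length byte array columns currently fall back to the generic
+        // ByteArray statistics; tracked in:)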
+ // https://github.com/apache/arrow-rs/issues/660 + if let Statistics::ByteArray(stats) = stats { + assert_eq!(stats.min(), &ByteArray::from("aaw ")); + assert_eq!(stats.max(), &ByteArray::from("zz ")); + } else { + panic!("expecting Statistics::ByteArray, got {:?}", stats); + } + } + #[test] fn test_float_statistics_nan_middle() { let stats = statistics_roundtrip::(&[1.0, f32::NAN, 2.0]); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 797324e421e1..127ba95387e3 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -128,23 +128,18 @@ impl std::fmt::Debug for ByteArray { impl PartialOrd for ByteArray { fn partial_cmp(&self, other: &ByteArray) -> Option { - if self.data.is_some() && other.data.is_some() { - match self.len().cmp(&other.len()) { - Ordering::Greater => Some(Ordering::Greater), - Ordering::Less => Some(Ordering::Less), - Ordering::Equal => { - for (v1, v2) in self.data().iter().zip(other.data().iter()) { - match v1.cmp(v2) { - Ordering::Greater => return Some(Ordering::Greater), - Ordering::Less => return Some(Ordering::Less), - _ => {} - } - } - Some(Ordering::Equal) - } + // sort nulls first (consistent with PartialCmp on Option) + // + // Since ByteBuffer doesn't implement PartialOrd, so can't + // derive an implementation + match (&self.data, &other.data) { + (None, None) => Some(Ordering::Equal), + (None, Some(_)) => Some(Ordering::Less), + (Some(_), None) => Some(Ordering::Greater), + (Some(self_data), Some(other_data)) => { + // compare slices directly + self_data.data().partial_cmp(other_data.data()) } - } else { - None } } } @@ -1368,7 +1363,7 @@ mod tests { let ba4 = ByteArray::from(vec![]); let ba5 = ByteArray::from(vec![2, 2, 3]); - assert!(ba1 > ba2); + assert!(ba1 < ba2); assert!(ba3 > ba1); assert!(ba1 > ba4); assert_eq!(ba1, ba11); From 75432edb05ff001481df728607fc5b9be969c266 Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Sun, 8 Aug 2021 00:57:17 -0700 Subject: [PATCH 4/9] allocate enough bytes when writing booleans (#658) * allocate enough bytes when writing booleans * round up to nearest multiple of 256 --- parquet/src/arrow/arrow_writer.rs | 28 +++++++++++++++++++++++++++- parquet/src/data_type.rs | 8 +++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index 4726734475ba..7728cd4cb2f2 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -227,7 +227,7 @@ fn write_leaves( ArrowDataType::FixedSizeList(_, _) | ArrowDataType::Union(_) => { Err(ParquetError::NYI( format!( - "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", + "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", array.data_type() ) )) @@ -1199,6 +1199,32 @@ mod tests { ); } + #[test] + fn bool_large_single_column() { + let values = Arc::new( + [None, Some(true), Some(false)] + .iter() + .cycle() + .copied() + .take(200_000) + .collect::(), + ); + let schema = + Schema::new(vec![Field::new("col", values.data_type().clone(), true)]); + let expected_batch = + RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap(); + let file = get_temp_file("bool_large_single_column", &[]); + + let mut writer = ArrowWriter::try_new( + file.try_clone().unwrap(), + expected_batch.schema(), + None, + ) + .expect("Unable to write file"); + writer.write(&expected_batch).unwrap(); + writer.close().unwrap(); + } + #[test] fn i8_single_column() { 
required_and_optional::(0..SMALL_SIZE as i8, "i8_single_column"); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 127ba95387e3..3573362744fe 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -588,6 +588,7 @@ pub(crate) mod private { use crate::util::bit_util::{BitReader, BitWriter}; use crate::util::memory::ByteBufferPtr; + use arrow::util::bit_util::round_upto_power_of_2; use byteorder::ByteOrder; use std::convert::TryInto; @@ -669,7 +670,12 @@ pub(crate) mod private { bit_writer: &mut BitWriter, ) -> Result<()> { if bit_writer.bytes_written() + values.len() / 8 >= bit_writer.capacity() { - bit_writer.extend(256); + let bits_available = + (bit_writer.capacity() - bit_writer.bytes_written()) * 8; + let bits_needed = values.len() - bits_available; + let bytes_needed = (bits_needed + 7) / 8; + let bytes_needed = round_upto_power_of_2(bytes_needed, 256); + bit_writer.extend(bytes_needed); } for value in values { if !bit_writer.put_value(*value as u64, 1) { From 7b2e26d88aa788da76db4bce8df7b9d57c30cd78 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 8 Aug 2021 06:36:24 -0400 Subject: [PATCH 5/9] Add some do comments to parquet bit_util (#663) --- parquet/src/util/bit_util.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index dafd339d5684..4b34df478065 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -223,7 +223,7 @@ impl BitWriter { } } - /// Extend buffer size + /// Extend buffer size by `increment` bytes #[inline] pub fn extend(&mut self, increment: usize) { self.max_bytes += increment; @@ -231,7 +231,7 @@ impl BitWriter { self.buffer.extend(extra); } - /// Report buffer size + /// Report buffer size, in bytes #[inline] pub fn capacity(&mut self) -> usize { self.max_bytes @@ -332,6 +332,7 @@ impl BitWriter { self.max_bytes } + /// Writes the entire byte `value` at the byte `offset` pub fn write_at(&mut self, offset: usize, value: u8) { self.buffer[offset] = value; } From bea4cb84f44751e67d53b300554b52eb98f6fb3a Mon Sep 17 00:00:00 2001 From: Navin Date: Sun, 8 Aug 2021 20:40:42 +1000 Subject: [PATCH 6/9] Doctests for DictionaryArray::from_iter, PrimitiveDictionaryBuilder and DecimalBuilder. (#673) * Doctest for PrimitiveDictionaryBuilder. * Doctests for DictionaryArray::from_iter. * Documentation for DecimalBuilder. --- arrow/src/array/array_dictionary.rs | 30 ++++++++++++++++++++++ arrow/src/array/builder.rs | 39 +++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index cf847ef6fa5e..de9873ccee5c 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -153,6 +153,22 @@ impl From for DictionaryArray { } /// Constructs a `DictionaryArray` from an iterator of optional strings. 
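+///
+/// `None` values become null slots in the keys array, and only the distinct
+/// non-null strings are stored in the dictionary values, as the example below
+/// shows.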
+///
+/// # Example:
+/// ```
+/// use arrow::array::{DictionaryArray, PrimitiveArray, StringArray};
+/// use arrow::datatypes::Int8Type;
+///
+/// let test = vec!["a", "a", "b", "c"];
+/// let array: DictionaryArray<Int8Type> = test
+///     .iter()
+///     .map(|&x| if x == "b" { None } else { Some(x) })
+///     .collect();
+/// assert_eq!(
+///     "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  null,\n  1,\n] values: StringArray\n[\n  \"a\",\n  \"c\",\n]}\n",
+///     format!("{:?}", array)
+/// );
+/// ```
 impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<Option<&'a str>>
     for DictionaryArray<T>
 {
@@ -181,6 +197,20 @@ impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator
+///
+/// # Example:
+///
+/// ```
+/// use arrow::array::{DictionaryArray, PrimitiveArray, StringArray};
+/// use arrow::datatypes::Int8Type;
+///
+/// let test = vec!["a", "a", "b", "c"];
+/// let array: DictionaryArray<Int8Type> = test.into_iter().collect();
+/// assert_eq!(
+///     "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n  0,\n  0,\n  1,\n  2,\n] values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
+///     format!("{:?}", array)
+/// );
+/// ```
 impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<&'a str>
     for DictionaryArray<T>
 {
diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs
index b4ebc875a90e..8a4ebbff0541 100644
--- a/arrow/src/array/builder.rs
+++ b/arrow/src/array/builder.rs
@@ -1062,6 +1062,11 @@ pub struct FixedSizeBinaryBuilder {
     builder: FixedSizeListBuilder<UInt8Builder>,
 }
 
+///
+/// Array Builder for [`DecimalArray`]
+///
+/// See [`DecimalArray`] for example.
+///
 #[derive(Debug)]
 pub struct DecimalBuilder {
     builder: FixedSizeListBuilder<UInt8Builder>,
@@ -2095,6 +2100,40 @@ impl UnionBuilder {
 /// Array builder for `DictionaryArray`. For example to map a set of byte indices
 /// to f32 values. Note that the use of a `HashMap` here will not scale to very large
 /// arrays or result in an ordered dictionary.
+///
+/// # Example:
+///
+/// ```
+/// use arrow::array::{
+///     Array, PrimitiveBuilder, PrimitiveDictionaryBuilder,
+///     UInt8Array, UInt32Array,
+/// };
+/// use arrow::datatypes::{UInt8Type, UInt32Type};
+///
+/// let key_builder = PrimitiveBuilder::<UInt8Type>::new(3);
+/// let value_builder = PrimitiveBuilder::<UInt32Type>::new(2);
+/// let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder);
+/// builder.append(12345678).unwrap();
+/// builder.append_null().unwrap();
+/// builder.append(22345678).unwrap();
+/// let array = builder.finish();
+///
+/// assert_eq!(
+///     array.keys(),
+///     &UInt8Array::from(vec![Some(0), None, Some(1)])
+/// );
+///
+/// // Values are polymorphic and so require a downcast.
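+/// // (`values()` hands back a dynamically typed array, so we go through
+/// // `as_any()` to recover the concrete `UInt32Array` and read the values out.)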
+/// let av = array.values(); +/// let ava: &UInt32Array = av.as_any().downcast_ref::().unwrap(); +/// let avs: &[u32] = ava.values(); +/// +/// assert!(!array.is_null(0)); +/// assert!(array.is_null(1)); +/// assert!(!array.is_null(2)); +/// +/// assert_eq!(avs, &[12345678, 22345678]); +/// ``` #[derive(Debug)] pub struct PrimitiveDictionaryBuilder where From 857dbafcbaa721f22ac485f38ccaff3faf8d2ab9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 8 Aug 2021 08:32:47 -0400 Subject: [PATCH 7/9] Write boolean stats for boolean columns (not i32 stats) (#661) --- parquet/src/column/writer.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs index 3cb17e17f7f6..af76c84c6a03 100644 --- a/parquet/src/column/writer.rs +++ b/parquet/src/column/writer.rs @@ -919,7 +919,7 @@ impl ColumnWriterImpl { }; match self.descr.physical_type() { Type::INT32 => gen_stats_section!(i32, int32, min, max, distinct, nulls), - Type::BOOLEAN => gen_stats_section!(i32, int32, min, max, distinct, nulls), + Type::BOOLEAN => gen_stats_section!(bool, boolean, min, max, distinct, nulls), Type::INT64 => gen_stats_section!(i64, int64, min, max, distinct, nulls), Type::INT96 => gen_stats_section!(Int96, int96, min, max, distinct, nulls), Type::FLOAT => gen_stats_section!(f32, float, min, max, distinct, nulls), @@ -1691,13 +1691,11 @@ mod tests { fn test_bool_statistics() { let stats = statistics_roundtrip::(&[true, false, false, true]); assert!(stats.has_min_max_set()); - // should it be BooleanStatistics?? - // https://github.com/apache/arrow-rs/issues/659 - if let Statistics::Int32(stats) = stats { - assert_eq!(stats.min(), &0); - assert_eq!(stats.max(), &1); + if let Statistics::Boolean(stats) = stats { + assert_eq!(stats.min(), &false); + assert_eq!(stats.max(), &true); } else { - panic!("expecting Statistics::Int32, got {:?}", stats); + panic!("expecting Statistics::Boolean, got {:?}", stats); } } From fc0493198194265eed16fa5d41d6d70760756363 Mon Sep 17 00:00:00 2001 From: Roee Shlomo Date: Mon, 9 Aug 2021 14:31:07 +0300 Subject: [PATCH 8/9] Make rand an optional dependency (#674) Closes #671 Signed-off-by: roee88 --- .github/workflows/rust.yml | 4 +++- arrow/Cargo.toml | 13 ++++++------- arrow/README.md | 6 ++++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 5579072effb4..d76192c689cb 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -327,12 +327,14 @@ jobs: rustup override set ${{ matrix.rust }} rustup component add rustfmt rustup target add wasm32-unknown-unknown + rustup target add wasm32-wasi - name: Build arrow crate run: | export CARGO_HOME="/github/home/.cargo" export CARGO_TARGET_DIR="/github/home/target" cd arrow - cargo build --features=js --target wasm32-unknown-unknown + cargo build --no-default-features --features=csv,ipc,simd --target wasm32-unknown-unknown + cargo build --no-default-features --features=csv,ipc,simd --target wasm32-wasi # test builds with various feature flags default-build: diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f5a3e24a6725..0c8ca76b7890 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -40,10 +40,7 @@ serde = { version = "1.0", features = ["rc"] } serde_derive = "1.0" serde_json = { version = "1.0", features = ["preserve_order"] } indexmap = "1.6" -rand = { version = "0.8", default-features = false } -# getrandom is a dependency of rand, not (directly) of arrow -# 
need to specify `js` feature to build on wasm -getrandom = { version = "0.2", optional = true } +rand = { version = "0.8", optional = true } num = "0.4" csv_crate = { version = "1.1", optional = true, package="csv" } regex = "1.3" @@ -64,16 +61,18 @@ csv = ["csv_crate"] ipc = ["flatbuffers"] simd = ["packed_simd"] prettyprint = ["prettytable-rs"] -js = ["getrandom/js"] # The test utils feature enables code used in benchmarks and tests but -# not the core arrow code itself -test_utils = ["rand/std", "rand/std_rng"] +# not the core arrow code itself. Be aware that `rand` must be kept as +# an optional dependency for supporting compile to wasm32-unknown-unknown +# target without assuming an environment containing JavaScript. +test_utils = ["rand"] # this is only intended to be used in single-threaded programs: it verifies that # all allocated memory is being released (no memory leaks). # See README for details memory-check = [] [dev-dependencies] +rand = "0.8" criterion = "0.3" flate2 = "1" tempfile = "3" diff --git a/arrow/README.md b/arrow/README.md index c5fdb11a279d..298a5cf93c24 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -59,11 +59,13 @@ println!("{:?}", array.value(1)); ## Building for WASM -In order to compile Arrow for Web Assembly (the `wasm32-unknown-unknown` WASM target), you will likely need to turn off this crate's default features and use the `js` feature. +Arrow can compile to WebAssembly using the `wasm32-unknown-unknown` and `wasm32-wasi` targets. + +In order to compile Arrow for `wasm32-unknown-unknown` you will need to disable default features, then include the desired features, but exclude test dependencies (the `test_utils` feature). For example, use this snippet in your `Cargo.toml`: ```toml [dependencies] -arrow = { version = "5.0", default-features = false, features = ["js"] } +arrow = { version = "5.0", default-features = false, features = ["csv", "ipc", "simd"] } ``` ## Examples From fa5acd971c973161f17e69d5c6b50d6e77c7da03 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 9 Aug 2021 20:58:03 -0400 Subject: [PATCH 9/9] Write FixedLenByteArray stats for FixedLenByteArray columns (not ByteArray stats) (#662) --- parquet/src/column/writer.rs | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs index af76c84c6a03..0da943918104 100644 --- a/parquet/src/column/writer.rs +++ b/parquet/src/column/writer.rs @@ -924,11 +924,28 @@ impl ColumnWriterImpl { Type::INT96 => gen_stats_section!(Int96, int96, min, max, distinct, nulls), Type::FLOAT => gen_stats_section!(f32, float, min, max, distinct, nulls), Type::DOUBLE => gen_stats_section!(f64, double, min, max, distinct, nulls), - Type::BYTE_ARRAY | Type::FIXED_LEN_BYTE_ARRAY => { + Type::BYTE_ARRAY => { let min = min.as_ref().map(|v| ByteArray::from(v.as_bytes().to_vec())); let max = max.as_ref().map(|v| ByteArray::from(v.as_bytes().to_vec())); Statistics::byte_array(min, max, distinct, nulls, false) } + Type::FIXED_LEN_BYTE_ARRAY => { + let min = min + .as_ref() + .map(|v| ByteArray::from(v.as_bytes().to_vec())) + .map(|ba| { + let ba: FixedLenByteArray = ba.into(); + ba + }); + let max = max + .as_ref() + .map(|v| ByteArray::from(v.as_bytes().to_vec())) + .map(|ba| { + let ba: FixedLenByteArray = ba.into(); + ba + }); + Statistics::fixed_len_byte_array(min, max, distinct, nulls, false) + } } } @@ -1797,13 +1814,13 @@ mod tests { let stats = statistics_roundtrip::(&input); assert!(stats.has_min_max_set()); - 
// should it be FixedLenByteArray?
-        // https://github.com/apache/arrow-rs/issues/660
-        if let Statistics::ByteArray(stats) = stats {
-            assert_eq!(stats.min(), &ByteArray::from("aaw  "));
-            assert_eq!(stats.max(), &ByteArray::from("zz   "));
+        if let Statistics::FixedLenByteArray(stats) = stats {
+            let expected_min: FixedLenByteArray = ByteArray::from("aaw  ").into();
+            assert_eq!(stats.min(), &expected_min);
+            let expected_max: FixedLenByteArray = ByteArray::from("zz   ").into();
+            assert_eq!(stats.max(), &expected_max);
         } else {
-            panic!("expecting Statistics::ByteArray, got {:?}", stats);
+            panic!("expecting Statistics::FixedLenByteArray, got {:?}", stats);
         }
     }