From 1c43a1642979292fac040b90c9ac97487198ceb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=ABratal=C2=BB?= <«ratal@ratal.org»> Date: Sun, 7 Jan 2024 23:17:09 +0100 Subject: [PATCH 1/3] Main As FixedSizeBinaryArray is almost equivalent to PrimitiveArray (a Vec), it should have similar COW capabilities. So I would like to propose adding a get_mut_values() method. --- Cargo.toml | 38 ++++++++++++++++++++---------- src/array/fixed_size_binary/mod.rs | 5 ++++ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a8e5933d2f..022127bee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,14 @@ [package] name = "arrow2" -version = "0.17.4" +version = "0.17.5" license = "Apache-2.0" description = "Unofficial implementation of Apache Arrow spec in safe Rust" homepage = "https://github.com/jorgecarleitao/arrow2" repository = "https://github.com/jorgecarleitao/arrow2" -authors = ["Jorge C. Leitao ", "Apache Arrow "] +authors = [ + "Jorge C. Leitao ", + "Apache Arrow ", +] keywords = ["arrow", "analytics"] edition = "2021" exclude = ["testing/"] @@ -51,7 +54,9 @@ regex-syntax = { version = "0.7", optional = true } streaming-iterator = { version = "0.1", optional = true } fallible-streaming-iterator = { version = "0.1", optional = true } -json-deserializer = { version = "0.4.4", optional = true, features = ["preserve_order"] } +json-deserializer = { version = "0.4.4", optional = true, features = [ + "preserve_order", +] } indexmap = { version = "^1.6", optional = true } # used to print columns in a nice columnar format @@ -86,7 +91,9 @@ orc-format = { version = "0.3.0", optional = true } # Arrow integration tests support serde = { version = "^1.0", features = ["rc"], optional = true } serde_derive = { version = "^1.0", optional = true } -serde_json = { version = "^1.0", features = ["preserve_order"], optional = true } +serde_json = { version = "^1.0", features = [ + "preserve_order", +], optional = true } # for division/remainder 
optimization at runtime strength_reduce = { version = "0.2", optional = true } @@ -180,7 +187,11 @@ io_csv_read_async = ["csv-async", "lexical-core", "futures"] io_csv_write = ["csv-core", "streaming-iterator", "lexical-core"] io_json = ["io_json_read", "io_json_write"] io_json_read = ["json-deserializer", "indexmap", "lexical-core"] -io_json_write = ["streaming-iterator", "fallible-streaming-iterator", "lexical-core"] +io_json_write = [ + "streaming-iterator", + "fallible-streaming-iterator", + "lexical-core", +] io_ipc = ["arrow-format"] io_ipc_write_async = ["io_ipc", "futures"] io_ipc_read_async = ["io_ipc", "futures", "async-stream"] @@ -188,7 +199,13 @@ io_ipc_compression = ["lz4", "zstd"] io_flight = ["io_ipc", "arrow-format/flight-data"] # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. -io_parquet = ["parquet2", "io_ipc", "base64", "streaming-iterator", "fallible-streaming-iterator"] +io_parquet = [ + "parquet2", + "io_ipc", + "base64", + "streaming-iterator", + "fallible-streaming-iterator", +] io_parquet_async = ["futures", "io_parquet", "parquet2/async"] io_parquet_compression = [ @@ -196,7 +213,7 @@ io_parquet_compression = [ "io_parquet_gzip", "io_parquet_snappy", "io_parquet_lz4", - "io_parquet_brotli" + "io_parquet_brotli", ] # sample testing of generated arrow data @@ -214,9 +231,7 @@ io_parquet_brotli = ["parquet2/brotli"] io_parquet_bloom_filter = ["parquet2/bloom_filter"] io_avro = ["avro-schema", "streaming-iterator"] -io_avro_compression = [ - "avro-schema/compression", -] +io_avro_compression = ["avro-schema/compression"] io_avro_async = ["avro-schema/async"] io_orc = ["orc-format"] @@ -277,7 +292,7 @@ compute = [ "compute_take", "compute_temporal", "compute_utf8", - "compute_window" + "compute_window", ] benchmarks = ["rand"] serde_types = ["serde", "serde_derive"] @@ -401,4 +416,3 @@ harness = false [[bench]] name = "like_kernels" harness = false - diff --git a/src/array/fixed_size_binary/mod.rs 
b/src/array/fixed_size_binary/mod.rs index 34242d9ad6..d01a54287a 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -172,6 +172,11 @@ impl FixedSizeBinaryArray { } } + /// Returns an option of a mutable reference to the values of this [`FixedSizeBinaryArray`]. + pub fn get_mut_values(&mut self) -> Option<&mut [u8]> { + self.values.get_mut_slice() + } + /// Returns a new [`FixedSizeBinaryArray`] with a different logical type. /// This is `O(1)`. /// # Panics From b929bcc65b5e537d7511079a9dd267d606a5d554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=ABratal=C2=BB?= <«ratal@ratal.org»> Date: Tue, 16 Jan 2024 12:42:39 +0100 Subject: [PATCH 2/3] get mutable offsets and values for Utf8 and binary arrays --- src/array/binary/mod.rs | 10 ++++++++++ src/array/utf8/mod.rs | 10 ++++++++++ src/offset.rs | 10 ++++++++++ 3 files changed, 30 insertions(+) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 7247decb30..b51dde2852 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -221,6 +221,16 @@ impl BinaryArray { impl_mut_validity!(); impl_into_array!(); + /// Returns an option of a mutable reference to the values of this [`BinaryArray`]. + pub fn get_mut_values(&mut self) -> Option<&mut [u8]> { + self.values.get_mut_slice() + } + + /// Returns an option of a mutable reference to the offsets of this [`BinaryArray`]. + pub fn get_mut_offsets(&mut self) -> Option<&mut [O]> { + self.offsets.get_mut_slice() + } + /// Returns its internal representation #[must_use] pub fn into_inner(self) -> (DataType, OffsetsBuffer, Buffer, Option) { diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 9440ae4330..261768ff52 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -240,6 +240,16 @@ impl Utf8Array { impl_mut_validity!(); impl_into_array!(); + /// Returns an option of a mutable reference to the values of this [`Utf8Array`]. 
+ pub fn get_mut_values(&mut self) -> Option<&mut [u8]> { + self.values.get_mut_slice() + } + + /// Returns an option of a mutable reference to the offsets of this [`Utf8Array`]. + pub fn get_mut_offsets(&mut self) -> Option<&mut [O]> { + self.offsets.get_mut_slice() + } + /// Returns its internal representation #[must_use] pub fn into_inner(self) -> (DataType, OffsetsBuffer, Buffer, Option) { diff --git a/src/offset.rs b/src/offset.rs index 80b45d6680..8fa71d5c14 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -371,6 +371,16 @@ impl OffsetsBuffer { .map_left(Self) } + /// Returns a mutable reference to its slice, if possible. + /// + /// This operation returns [`Some`] iff this [`OffsetsBuffer`]: + /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) + /// * has not been imported from the c data interface (FFI) + #[inline] + pub fn get_mut_slice(&mut self) -> Option<&mut [O]> { + self.0.get_mut_slice() + } + /// Returns a reference to its internal [`Buffer`]. #[inline] pub fn buffer(&self) -> &Buffer { From 003d8ef64ee922b3c9a230650e8a789cfa2ade34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C2=ABratal=C2=BB?= <«ratal@ratal.org»> Date: Tue, 16 Jan 2024 18:45:39 +0100 Subject: [PATCH 3/3] added mut_values() for FixedSizeListArray --- src/array/fixed_size_list/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/array/fixed_size_list/mod.rs b/src/array/fixed_size_list/mod.rs index 0d335167b2..2fc1ea9e43 100644 --- a/src/array/fixed_size_list/mod.rs +++ b/src/array/fixed_size_list/mod.rs @@ -148,11 +148,16 @@ impl FixedSizeListArray { self.validity.as_ref() } - /// Returns the inner array. + /// Returns a reference to the inner array. pub fn values(&self) -> &Box { &self.values } + /// Returns a mutable reference to the inner array. + pub fn mut_values(&mut self) -> &mut Box { + &mut self.values + } + /// Returns the `Vec` at position `i`. /// # Panic: /// panics iff `i >= self.len()`