From 613e93e75fafaacb8d65af5b208568649fc84eed Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Jul 2024 06:11:06 -0400 Subject: [PATCH 01/17] Merge `53.0.0-dev` dev branch to main (#6126) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` (#6041) * bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` Signed-off-by: Bugen Zhao * fix example tests Signed-off-by: Bugen Zhao --------- Signed-off-by: Bugen Zhao * Remove `impl> From for Buffer` that easily accidentally copies data (#6043) * deprecate auto copy, ask explicit reference * update comments * make cargo doc happy * Make display of interval types more pretty (#6006) * improve dispaly for interval. * update test in pretty, and fix display problem. * tmp * fix tests in arrow-cast. * fix tests in pretty. * fix style. * Update snafu (#5930) * Update Parquet thrift generated structures (#6045) * update to latest thrift (as of 11 Jul 2024) from parquet-format * pass None for optional size statistics * escape HTML tags * don't need to escape brackets in arrays * Revert "Revert "Write Bloom filters between row groups instead of the end (#…" (#5933) This reverts commit 22e0b4432c9838f2536284015271d3de9a165135. * Revert "Update snafu (#5930)" (#6069) This reverts commit 756b1fb26d1702f36f446faf9bb40a4869c3e840. * Update pyo3 requirement from 0.21.1 to 0.22.1 (fixed) (#6075) * Update pyo3 requirement from 0.21.1 to 0.22.1 Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.1) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] * refactor: remove deprecated `FromPyArrow::from_pyarrow` "GIL Refs" are being phased out. * chore: update `pyo3` in integration tests --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * remove repeated codes to make the codes more concise. (#6080) * Add `unencoded_byte_array_data_bytes` to `ParquetMetaData` (#6068) * update to latest thrift (as of 11 Jul 2024) from parquet-format * pass None for optional size statistics * escape HTML tags * don't need to escape brackets in arrays * add support for unencoded_byte_array_data_bytes * add comments * change sig of ColumnMetrics::update_variable_length_bytes() * rename ParquetOffsetIndex to OffsetSizeIndex * rename some functions * suggestion from review Co-authored-by: Andrew Lamb * add Default trait to ColumnMetrics as suggested in review * rename OffsetSizeIndex to OffsetIndexMetaData --------- Co-authored-by: Andrew Lamb * Update pyo3 requirement from 0.21.1 to 0.22.2 (#6085) Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/v0.22.2/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.2) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Deprecate read_page_locations() and simplify offset index in `ParquetMetaData` (#6095) * deprecate read_page_locations * add to_thrift() to OffsetIndexMetaData * Update parquet/src/column/writer/mod.rs Co-authored-by: Ed Seidl --------- Signed-off-by: Bugen Zhao Signed-off-by: dependabot[bot] Co-authored-by: Bugen Zhao Co-authored-by: Xiangpeng Hao Co-authored-by: kamille Co-authored-by: Jesse Co-authored-by: Ed Seidl Co-authored-by: Marco Neumann Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- arrow-buffer/src/buffer/immutable.rs | 33 +- arrow-cast/src/cast/mod.rs | 26 +- arrow-cast/src/display.rs | 155 +++++-- arrow-cast/src/pretty.rs | 54 +-- arrow-flight/Cargo.toml | 11 +- arrow-flight/examples/flight_sql_server.rs | 6 +- arrow-flight/gen/Cargo.toml | 4 +- arrow-flight/src/arrow.flight.protocol.rs | 36 +- .../src/sql/arrow.flight.protocol.sql.rs | 12 +- arrow-flight/tests/common/trailers_layer.rs | 32 +- arrow-integration-testing/Cargo.toml | 4 +- arrow-pyarrow-integration-testing/Cargo.toml | 2 +- arrow/Cargo.toml | 2 +- arrow/src/pyarrow.rs | 5 - parquet/Cargo.toml | 8 + parquet/examples/write_parquet.rs | 131 ++++++ parquet/regen.sh | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 4 +- parquet/src/arrow/arrow_reader/statistics.rs | 14 +- parquet/src/arrow/arrow_writer/byte_array.rs | 19 +- parquet/src/arrow/arrow_writer/mod.rs | 42 +- parquet/src/arrow/async_reader/mod.rs | 23 +- parquet/src/arrow/async_writer/mod.rs | 4 +- parquet/src/bin/parquet-index.rs | 12 +- parquet/src/column/writer/encoder.rs | 8 + parquet/src/column/writer/mod.rs | 54 ++- parquet/src/data_type.rs | 11 + parquet/src/file/metadata/memory.rs | 8 + parquet/src/file/metadata/mod.rs | 93 +++- parquet/src/file/page_index/index_reader.rs | 54 ++- parquet/src/file/page_index/mod.rs | 1 + parquet/src/file/page_index/offset_index.rs | 59 +++ parquet/src/file/properties.rs | 36 ++ parquet/src/file/serialized_reader.rs | 45 +- parquet/src/file/writer.rs | 202 ++++++--- parquet/src/format.rs | 397 +++++++++++++++--- parquet/tests/arrow_writer_layout.rs | 10 +- 37 files changed, 1245 insertions(+), 374 deletions(-) create mode 100644 parquet/examples/write_parquet.rs create mode 100644 parquet/src/file/page_index/offset_index.rs diff --git a/arrow-buffer/src/buffer/immutable.rs b/arrow-buffer/src/buffer/immutable.rs index 52e201ca15a2..c53ef18ba58f 100644 --- a/arrow-buffer/src/buffer/immutable.rs +++ b/arrow-buffer/src/buffer/immutable.rs @@ -356,16 +356,29 @@ impl Buffer { } } -/// Creating a `Buffer` instance by copying the memory from a `AsRef<[u8]>` into a newly -/// allocated memory region. -impl> From for Buffer { - fn from(p: T) -> Self { - // allocate aligned memory buffer - let slice = p.as_ref(); - let len = slice.len(); - let mut buffer = MutableBuffer::new(len); - buffer.extend_from_slice(slice); - buffer.into() +/// Note that here we deliberately do not implement +/// `impl> From for Buffer` +/// As it would accept `Buffer::from(vec![...])` that would cause an unexpected copy. +/// Instead, we ask user to be explicit when copying is occurring, e.g., `Buffer::from(vec![...].to_byte_slice())`. +/// For zero-copy conversion, user should use `Buffer::from_vec(vec![...])`. +/// +/// Since we removed impl for `AsRef`, we added the following three specific implementations to reduce API breakage. +/// See for more discussion on this. 
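For reference, a minimal sketch (not part of the diff) of how callers are expected to construct a `Buffer` once the blanket `AsRef<[u8]>` impl is gone: an explicit copy goes through a byte slice, while zero-copy construction goes through `Buffer::from_vec`. The import paths `arrow_buffer::{Buffer, ToByteSlice}` are assumed from the crate's public exports.

```rust
use arrow_buffer::{Buffer, ToByteSlice};

fn main() {
    // Zero-copy: hand the Vec's allocation directly to the Buffer.
    let bytes: Vec<u8> = vec![1, 2, 3, 4];
    let zero_copy = Buffer::from_vec(bytes);
    assert_eq!(zero_copy.len(), 4);

    // Explicit copy: the caller opts in by converting to a byte slice first.
    // `Buffer::from(vec![1u32, 2, 3, 4])` no longer compiles, which is the point.
    let values: Vec<u32> = vec![1, 2, 3, 4];
    let copied = Buffer::from(values.to_byte_slice());
    assert_eq!(copied.len(), 4 * std::mem::size_of::<u32>());

    // Byte slices and fixed-size byte arrays still work via the impls kept below.
    let from_array = Buffer::from([0u8; 8]);
    let from_slice = Buffer::from(&[0u8, 1, 2][..]);
    assert_eq!(from_array.len() + from_slice.len(), 11);
}
```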
+impl From<&[u8]> for Buffer { + fn from(p: &[u8]) -> Self { + Self::from_slice_ref(p) + } +} + +impl From<[u8; N]> for Buffer { + fn from(p: [u8; N]) -> Self { + Self::from_slice_ref(p) + } +} + +impl From<&[u8; N]> for Buffer { + fn from(p: &[u8; N]) -> Self { + Self::from_slice_ref(p) } } diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 4dd69596209a..7df9420f94f0 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -4409,8 +4409,8 @@ mod tests { IntervalUnit::YearMonth, IntervalYearMonthArray, vec![ - Some("1 years 1 mons 0 days 0 hours 0 mins 0.00 secs"), - Some("2 years 7 mons 0 days 0 hours 0 mins 0.00 secs"), + Some("1 years 1 mons"), + Some("2 years 7 mons"), None, None, None, @@ -4433,9 +4433,9 @@ mod tests { IntervalUnit::DayTime, IntervalDayTimeArray, vec![ - Some("0 years 0 mons 390 days 0 hours 0 mins 0.000 secs"), - Some("0 years 0 mons 930 days 0 hours 0 mins 0.000 secs"), - Some("0 years 0 mons 30 days 0 hours 0 mins 0.000 secs"), + Some("390 days"), + Some("930 days"), + Some("30 days"), None, None, ] @@ -4461,16 +4461,16 @@ mod tests { IntervalUnit::MonthDayNano, IntervalMonthDayNanoArray, vec![ - Some("0 years 13 mons 1 days 0 hours 0 mins 0.000000000 secs"), + Some("13 mons 1 days"), None, - Some("0 years 31 mons 35 days 0 hours 0 mins 0.001400000 secs"), - Some("0 years 0 mons 3 days 0 hours 0 mins 0.000000000 secs"), - Some("0 years 0 mons 0 days 0 hours 0 mins 8.000000000 secs"), + Some("31 mons 35 days 0.001400000 secs"), + Some("3 days"), + Some("8.000000000 secs"), None, - Some("0 years 0 mons 1 days 0 hours 0 mins 29.800000000 secs"), - Some("0 years 3 mons 0 days 0 hours 0 mins 1.000000000 secs"), - Some("0 years 0 mons 0 days 0 hours 8 mins 0.000000000 secs"), - Some("0 years 63 mons 9 days 19 hours 9 mins 2.222000000 secs"), + Some("1 days 29.800000000 secs"), + Some("3 mons 1.000000000 secs"), + Some("8 mins"), + Some("63 mons 9 days 19 hours 9 mins 2.222000000 secs"), None, ] ); diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 6a40d036350a..312e7973963e 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -654,10 +654,7 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { let years = (interval / 12_f64).floor(); let month = interval - (years * 12_f64); - write!( - f, - "{years} years {month} mons 0 days 0 hours 0 mins 0.00 secs", - )?; + write!(f, "{years} years {month} mons",)?; Ok(()) } } @@ -665,62 +662,140 @@ impl<'a> DisplayIndex for &'a PrimitiveArray { impl<'a> DisplayIndex for &'a PrimitiveArray { fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { let value = self.value(idx); + let mut prefix = ""; - let secs = value.milliseconds / 1_000; + if value.days != 0 { + write!(f, "{prefix}{} days", value.days)?; + prefix = " "; + } + + if value.milliseconds != 0 { + let millis_fmt = MillisecondsFormatter { + milliseconds: value.milliseconds, + prefix, + }; + + f.write_fmt(format_args!("{millis_fmt}"))?; + } + + Ok(()) + } +} + +impl<'a> DisplayIndex for &'a PrimitiveArray { + fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { + let value = self.value(idx); + let mut prefix = ""; + + if value.months != 0 { + write!(f, "{prefix}{} mons", value.months)?; + prefix = " "; + } + + if value.days != 0 { + write!(f, "{prefix}{} days", value.days)?; + prefix = " "; + } + + if value.nanoseconds != 0 { + let nano_fmt = NanosecondsFormatter { + nanoseconds: value.nanoseconds, + prefix, + }; + f.write_fmt(format_args!("{nano_fmt}"))?; + } + + Ok(()) + } 
+} + +struct NanosecondsFormatter<'a> { + nanoseconds: i64, + prefix: &'a str, +} + +impl<'a> Display for NanosecondsFormatter<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut prefix = self.prefix; + + let secs = self.nanoseconds / 1_000_000_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let milliseconds = value.milliseconds % 1_000; + let nanoseconds = self.nanoseconds % 1_000_000_000; - let secs_sign = if secs < 0 || milliseconds < 0 { - "-" - } else { - "" - }; + if hours != 0 { + write!(f, "{prefix}{} hours", hours)?; + prefix = " "; + } + + if mins != 0 { + write!(f, "{prefix}{} mins", mins)?; + prefix = " "; + } + + if secs != 0 || nanoseconds != 0 { + let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; + write!( + f, + "{prefix}{}{}.{:09} secs", + secs_sign, + secs.abs(), + nanoseconds.abs() + )?; + } - write!( - f, - "0 years 0 mons {} days {} hours {} mins {}{}.{:03} secs", - value.days, - hours, - mins, - secs_sign, - secs.abs(), - milliseconds.abs(), - )?; Ok(()) } } -impl<'a> DisplayIndex for &'a PrimitiveArray { - fn write(&self, idx: usize, f: &mut dyn Write) -> FormatResult { - let value = self.value(idx); +struct MillisecondsFormatter<'a> { + milliseconds: i32, + prefix: &'a str, +} + +impl<'a> Display for MillisecondsFormatter<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut prefix = self.prefix; - let secs = value.nanoseconds / 1_000_000_000; + let secs = self.milliseconds / 1_000; let mins = secs / 60; let hours = mins / 60; let secs = secs - (mins * 60); let mins = mins - (hours * 60); - let nanoseconds = value.nanoseconds % 1_000_000_000; - - let secs_sign = if secs < 0 || nanoseconds < 0 { "-" } else { "" }; - - write!( - f, - "0 years {} mons {} days {} hours {} mins {}{}.{:09} secs", - value.months, - value.days, - hours, - mins, - secs_sign, - secs.abs(), - nanoseconds.abs(), - )?; + let milliseconds = self.milliseconds % 1_000; + + if hours != 0 { + write!(f, "{prefix}{} hours", hours,)?; + prefix = " "; + } + + if mins != 0 { + write!(f, "{prefix}{} mins", mins,)?; + prefix = " "; + } + + if secs != 0 || milliseconds != 0 { + let secs_sign = if secs < 0 || milliseconds < 0 { + "-" + } else { + "" + }; + + write!( + f, + "{prefix}{}{}.{:03} secs", + secs_sign, + secs.abs(), + milliseconds.abs() + )?; + } + Ok(()) } } diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs index 9383b9f73f61..f41471e38d5e 100644 --- a/arrow-cast/src/pretty.rs +++ b/arrow-cast/src/pretty.rs @@ -986,16 +986,16 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+----------------------------------------------------+", - "| IntervalDayTime |", - "+----------------------------------------------------+", - "| 0 years 0 mons -1 days 0 hours -10 mins 0.000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -1.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -0.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.010 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.100 secs |", - "+----------------------------------------------------+", + "+------------------+", + "| IntervalDayTime |", + "+------------------+", + "| -1 days -10 mins |", + "| -1.001 secs |", + "| -0.001 secs |", + "| 0.001 secs |", + "| 0.010 secs |", + "| 0.100 secs |", + "+------------------+", ]; let actual: Vec<&str> = table.lines().collect(); @@ 
-1032,23 +1032,23 @@ mod tests { let table = pretty_format_batches(&[batch]).unwrap().to_string(); let expected = vec![ - "+-----------------------------------------------------------+", - "| IntervalMonthDayNano |", - "+-----------------------------------------------------------+", - "| 0 years -1 mons -1 days 0 hours -10 mins 0.000000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -1.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins -0.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000001 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000010 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000000100 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000001000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000010000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.000100000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.001000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.010000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 0.100000000 secs |", - "| 0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs |", - "+-----------------------------------------------------------+", + "+--------------------------+", + "| IntervalMonthDayNano |", + "+--------------------------+", + "| -1 mons -1 days -10 mins |", + "| -1.000000001 secs |", + "| -0.000000001 secs |", + "| 0.000000001 secs |", + "| 0.000000010 secs |", + "| 0.000000100 secs |", + "| 0.000001000 secs |", + "| 0.000010000 secs |", + "| 0.000100000 secs |", + "| 0.001000000 secs |", + "| 0.010000000 secs |", + "| 0.100000000 secs |", + "| 1.000000000 secs |", + "+--------------------------+", ]; let actual: Vec<&str> = table.lines().collect(); diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 539b1ea35d6c..f66891ef09a1 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -44,11 +44,11 @@ bytes = { version = "1", default-features = false } futures = { version = "0.3", default-features = false, features = ["alloc"] } once_cell = { version = "1", optional = true } paste = { version = "1.0" } -prost = { version = "0.12.3", default-features = false, features = ["prost-derive"] } +prost = { version = "0.13.1", default-features = false, features = ["prost-derive"] } # For Timestamp type -prost-types = { version = "0.12.3", default-features = false } +prost-types = { version = "0.13.1", default-features = false } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } -tonic = { version = "0.11.0", default-features = false, features = ["transport", "codegen", "prost"] } +tonic = { version = "0.12.0", default-features = false, features = ["transport", "codegen", "prost"] } # CLI-related dependencies anyhow = { version = "1.0", optional = true } @@ -70,8 +70,9 @@ cli = ["anyhow", "arrow-cast/prettyprint", "clap", "tracing-log", "tracing-subsc [dev-dependencies] arrow-cast = { workspace = true, features = ["prettyprint"] } assert_cmd = "2.0.8" -http = "0.2.9" -http-body = "0.4.5" +http = "1.1.0" +http-body = "1.0.0" +hyper-util = "0.1" pin-project-lite = "0.2" tempfile = "3.3" tokio-stream = { version = "0.1", features = ["net"] } diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 031628eaa833..d5168debc433 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -783,7 +783,8 @@ impl ProstMessageExt for FetchResults { #[cfg(test)] mod tests { use super::*; - use 
futures::TryStreamExt; + use futures::{TryFutureExt, TryStreamExt}; + use hyper_util::rt::TokioIo; use std::fs; use std::future::Future; use std::net::SocketAddr; @@ -843,7 +844,8 @@ mod tests { .serve_with_incoming(stream); let request_future = async { - let connector = service_fn(move |_| UnixStream::connect(path.clone())); + let connector = + service_fn(move |_| UnixStream::connect(path.clone()).map_ok(TokioIo::new)); let channel = Endpoint::try_from("http://example.com") .unwrap() .connect_with_connector(connector) diff --git a/arrow-flight/gen/Cargo.toml b/arrow-flight/gen/Cargo.toml index 7264a527ca8d..a12c683776b4 100644 --- a/arrow-flight/gen/Cargo.toml +++ b/arrow-flight/gen/Cargo.toml @@ -33,5 +33,5 @@ publish = false # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = "=1.0.86", default-features = false } -prost-build = { version = "=0.12.6", default-features = false } -tonic-build = { version = "=0.11.0", default-features = false, features = ["transport", "prost"] } +prost-build = { version = "=0.13.1", default-features = false } +tonic-build = { version = "=0.12.0", default-features = false, features = ["transport", "prost"] } diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs index bc314de9d19f..8c7292894eab 100644 --- a/arrow-flight/src/arrow.flight.protocol.rs +++ b/arrow-flight/src/arrow.flight.protocol.rs @@ -38,7 +38,7 @@ pub struct BasicAuth { pub password: ::prost::alloc::string::String, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct Empty {} /// /// Describes an available action, including both the name used for execution @@ -103,7 +103,7 @@ pub struct Result { /// /// The result should be stored in Result.body. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CancelFlightInfoResult { #[prost(enumeration = "CancelStatus", tag = "1")] pub status: i32, @@ -1053,19 +1053,17 @@ pub mod flight_service_server { /// can expose a set of actions that are available. 
#[derive(Debug)] pub struct FlightServiceServer { - inner: _Inner, + inner: Arc, accept_compression_encodings: EnabledCompressionEncodings, send_compression_encodings: EnabledCompressionEncodings, max_decoding_message_size: Option, max_encoding_message_size: Option, } - struct _Inner(Arc); impl FlightServiceServer { pub fn new(inner: T) -> Self { Self::from_arc(Arc::new(inner)) } pub fn from_arc(inner: Arc) -> Self { - let inner = _Inner(inner); Self { inner, accept_compression_encodings: Default::default(), @@ -1128,7 +1126,6 @@ pub mod flight_service_server { Poll::Ready(Ok(())) } fn call(&mut self, req: http::Request) -> Self::Future { - let inner = self.inner.clone(); match req.uri().path() { "/arrow.flight.protocol.FlightService/Handshake" => { #[allow(non_camel_case_types)] @@ -1162,7 +1159,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = HandshakeSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1209,7 +1205,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = ListFlightsSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1255,7 +1250,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = GetFlightInfoSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1302,7 +1296,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = PollFlightInfoSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1348,7 +1341,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = GetSchemaSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1395,7 +1387,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoGetSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1442,7 +1433,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoPutSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1489,7 +1479,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = DoExchangeSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1536,7 +1525,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); 
let fut = async move { - let inner = inner.0; let method = DoActionSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1583,7 +1571,6 @@ pub mod flight_service_server { let max_encoding_message_size = self.max_encoding_message_size; let inner = self.inner.clone(); let fut = async move { - let inner = inner.0; let method = ListActionsSvc(inner); let codec = tonic::codec::ProstCodec::default(); let mut grpc = tonic::server::Grpc::new(codec) @@ -1605,8 +1592,11 @@ pub mod flight_service_server { Ok( http::Response::builder() .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") + .header("grpc-status", tonic::Code::Unimplemented as i32) + .header( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ) .body(empty_body()) .unwrap(), ) @@ -1627,16 +1617,6 @@ pub mod flight_service_server { } } } - impl Clone for _Inner { - fn clone(&self) -> Self { - Self(Arc::clone(&self.0)) - } - } - impl std::fmt::Debug for _Inner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0) - } - } impl tonic::server::NamedService for FlightServiceServer { const NAME: &'static str = "arrow.flight.protocol.FlightService"; } diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index c1f0fac0f6ba..5e6f198df75c 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -101,7 +101,7 @@ pub struct CommandGetSqlInfo { /// > /// The returned data should be ordered by data_type and then by type_name. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CommandGetXdbcTypeInfo { /// /// Specifies the data type to search for the info. @@ -121,7 +121,7 @@ pub struct CommandGetXdbcTypeInfo { /// > /// The returned data should be ordered by catalog_name. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CommandGetCatalogs {} /// /// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. @@ -232,7 +232,7 @@ pub struct CommandGetTables { /// > /// The returned data should be ordered by table_type. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct CommandGetTableTypes {} /// /// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. @@ -511,7 +511,7 @@ pub struct ActionClosePreparedStatementRequest { /// Request message for the "BeginTransaction" action. /// Begins a transaction. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ActionBeginTransactionRequest {} /// /// Request message for the "BeginSavepoint" action. @@ -802,7 +802,7 @@ pub struct CommandPreparedStatementUpdate { /// CommandPreparedStatementUpdate was in the request, containing /// results from the update. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct DoPutUpdateResult { /// The number of records updated. A return value of -1 represents /// an unknown updated record count. 
@@ -862,7 +862,7 @@ pub struct ActionCancelQueryRequest { /// This command is deprecated since 13.0.0. Use the "CancelFlightInfo" /// action with DoAction instead. #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ActionCancelQueryResult { #[prost(enumeration = "action_cancel_query_result::CancelResult", tag = "1")] pub result: i32, diff --git a/arrow-flight/tests/common/trailers_layer.rs b/arrow-flight/tests/common/trailers_layer.rs index b2ab74f7d925..0ccb7df86c74 100644 --- a/arrow-flight/tests/common/trailers_layer.rs +++ b/arrow-flight/tests/common/trailers_layer.rs @@ -21,7 +21,7 @@ use std::task::{Context, Poll}; use futures::ready; use http::{HeaderValue, Request, Response}; -use http_body::SizeHint; +use http_body::{Frame, SizeHint}; use pin_project_lite::pin_project; use tower::{Layer, Service}; @@ -99,31 +99,19 @@ impl http_body::Body for WrappedBody { type Data = B::Data; type Error = B::Error; - fn poll_data( - mut self: Pin<&mut Self>, + fn poll_frame( + self: Pin<&mut Self>, cx: &mut Context<'_>, - ) -> Poll>> { - self.as_mut().project().inner.poll_data(cx) - } - - fn poll_trailers( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll, Self::Error>> { - let result: Result, Self::Error> = - ready!(self.as_mut().project().inner.poll_trailers(cx)); - - let mut trailers = http::header::HeaderMap::new(); - trailers.insert("test-trailer", HeaderValue::from_static("trailer_val")); + ) -> Poll, Self::Error>>> { + let mut result = ready!(self.project().inner.poll_frame(cx)); - match result { - Ok(Some(mut existing)) => { - existing.extend(trailers.iter().map(|(k, v)| (k.clone(), v.clone()))); - Poll::Ready(Ok(Some(existing))) + if let Some(Ok(frame)) = &mut result { + if let Some(trailers) = frame.trailers_mut() { + trailers.insert("test-trailer", HeaderValue::from_static("trailer_val")); } - Ok(None) => Poll::Ready(Ok(Some(trailers))), - Err(e) => Poll::Ready(Err(e)), } + + Poll::Ready(result) } fn is_end_stream(&self) -> bool { diff --git a/arrow-integration-testing/Cargo.toml b/arrow-integration-testing/Cargo.toml index 032b99f4fbbb..7be56d919852 100644 --- a/arrow-integration-testing/Cargo.toml +++ b/arrow-integration-testing/Cargo.toml @@ -42,11 +42,11 @@ async-trait = { version = "0.1.41", default-features = false } clap = { version = "4", default-features = false, features = ["std", "derive", "help", "error-context", "usage"] } futures = { version = "0.3", default-features = false } hex = { version = "0.4", default-features = false, features = ["std"] } -prost = { version = "0.12", default-features = false } +prost = { version = "0.13", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc", "derive"] } serde_json = { version = "1.0", default-features = false, features = ["std"] } tokio = { version = "1.0", default-features = false } -tonic = { version = "0.11", default-features = false } +tonic = { version = "0.12", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 6f07d42d88c1..0834f2d13384 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ 
b/arrow-pyarrow-integration-testing/Cargo.toml @@ -34,4 +34,4 @@ crate-type = ["cdylib"] [dependencies] arrow = { path = "../arrow", features = ["pyarrow"] } -pyo3 = { version = "0.21.1", features = ["extension-module"] } +pyo3 = { version = "0.22", features = ["extension-module"] } diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 745ca03214e6..12b6ddd6a830 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -54,7 +54,7 @@ arrow-select = { workspace = true } arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } -pyo3 = { version = "0.21.1", default-features = false, optional = true } +pyo3 = { version = "0.22.2", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 1733067c738a..43cdb4fe0919 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -83,11 +83,6 @@ fn to_py_err(err: ArrowError) -> PyErr { } pub trait FromPyArrow: Sized { - #[deprecated(since = "52.0.0", note = "Use from_pyarrow_bound")] - fn from_pyarrow(value: &PyAny) -> PyResult { - Self::from_pyarrow_bound(&value.as_borrowed()) - } - fn from_pyarrow_bound(value: &Bound) -> PyResult; } diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 2cc12a81dea5..7391d0964646 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -67,6 +67,7 @@ hashbrown = { version = "0.14", default-features = false } twox-hash = { version = "1.6", default-features = false } paste = { version = "1.0" } half = { version = "2.1", default-features = false, features = ["num-traits"] } +sysinfo = { version = "0.30.12", optional = true, default-features = false } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -114,12 +115,19 @@ async = ["futures", "tokio"] object_store = ["dep:object_store", "async"] # Group Zstd dependencies zstd = ["dep:zstd", "zstd-sys"] +# Display memory in example/write_parquet.rs +sysinfo = ["dep:sysinfo"] [[example]] name = "read_parquet" required-features = ["arrow"] path = "./examples/read_parquet.rs" +[[example]] +name = "write_parquet" +required-features = ["cli", "sysinfo"] +path = "./examples/write_parquet.rs" + [[example]] name = "async_read_parquet" required-features = ["arrow", "async"] diff --git a/parquet/examples/write_parquet.rs b/parquet/examples/write_parquet.rs new file mode 100644 index 000000000000..d2ef550df840 --- /dev/null +++ b/parquet/examples/write_parquet.rs @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::fs::File; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use arrow::array::{StructArray, UInt64Builder}; +use arrow::datatypes::DataType::UInt64; +use arrow::datatypes::{Field, Schema}; +use clap::{Parser, ValueEnum}; +use parquet::arrow::ArrowWriter as ParquetWriter; +use parquet::basic::Encoding; +use parquet::errors::Result; +use parquet::file::properties::{BloomFilterPosition, WriterProperties}; +use sysinfo::{MemoryRefreshKind, Pid, ProcessRefreshKind, RefreshKind, System}; + +#[derive(ValueEnum, Clone)] +enum BloomFilterPositionArg { + End, + AfterRowGroup, +} + +#[derive(Parser)] +#[command(version)] +/// Writes sequences of integers, with a Bloom Filter, while logging timing and memory usage. +struct Args { + #[arg(long, default_value_t = 1000)] + /// Number of batches to write + iterations: u64, + + #[arg(long, default_value_t = 1000000)] + /// Number of rows in each batch + batch: u64, + + #[arg(long, value_enum, default_value_t=BloomFilterPositionArg::AfterRowGroup)] + /// Where to write Bloom Filters + bloom_filter_position: BloomFilterPositionArg, + + /// Path to the file to write + path: PathBuf, +} + +fn now() -> String { + chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string() +} + +fn mem(system: &mut System) -> String { + let pid = Pid::from(std::process::id() as usize); + system.refresh_process_specifics(pid, ProcessRefreshKind::new().with_memory()); + system + .process(pid) + .map(|proc| format!("{}MB", proc.memory() / 1_000_000)) + .unwrap_or("N/A".to_string()) +} + +fn main() -> Result<()> { + let args = Args::parse(); + + let bloom_filter_position = match args.bloom_filter_position { + BloomFilterPositionArg::End => BloomFilterPosition::End, + BloomFilterPositionArg::AfterRowGroup => BloomFilterPosition::AfterRowGroup, + }; + + let properties = WriterProperties::builder() + .set_column_bloom_filter_enabled("id".into(), true) + .set_column_encoding("id".into(), Encoding::DELTA_BINARY_PACKED) + .set_bloom_filter_position(bloom_filter_position) + .build(); + let schema = Arc::new(Schema::new(vec![Field::new("id", UInt64, false)])); + // Create parquet file that will be read. + let file = File::create(args.path).unwrap(); + let mut writer = ParquetWriter::try_new(file, schema.clone(), Some(properties))?; + + let mut system = + System::new_with_specifics(RefreshKind::new().with_memory(MemoryRefreshKind::everything())); + eprintln!( + "{} Writing {} batches of {} rows. RSS = {}", + now(), + args.iterations, + args.batch, + mem(&mut system) + ); + + let mut array_builder = UInt64Builder::new(); + let mut last_log = Instant::now(); + for i in 0..args.iterations { + if Instant::now() - last_log > Duration::new(10, 0) { + last_log = Instant::now(); + eprintln!( + "{} Iteration {}/{}. RSS = {}", + now(), + i + 1, + args.iterations, + mem(&mut system) + ); + } + for j in 0..args.batch { + array_builder.append_value(i + j); + } + writer.write( + &StructArray::new( + schema.fields().clone(), + vec![Arc::new(array_builder.finish())], + None, + ) + .into(), + )?; + } + writer.flush()?; + writer.close()?; + + eprintln!("{} Done. RSS = {}", now(), mem(&mut system)); + + Ok(()) +} diff --git a/parquet/regen.sh b/parquet/regen.sh index d1b82108a018..39999c7872cd 100755 --- a/parquet/regen.sh +++ b/parquet/regen.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. 
-REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2 +REVISION=5b564f3c47679526cf72e54f207013f28f53acc4 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 15c1a880cc75..c696763d63d2 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -394,7 +394,7 @@ impl ArrowReaderMetadata { let offset_index = metadata .row_groups() .iter() - .map(|rg| index_reader::read_pages_locations(reader, rg.columns())) + .map(|rg| index_reader::read_offset_indexes(reader, rg.columns())) .collect::>>()?; metadata.set_offset_index(Some(offset_index)) @@ -689,7 +689,7 @@ impl Iterator for ReaderPageIterator { // To avoid `i[rg_idx][self.oolumn_idx`] panic, we need to filter out empty `i[rg_idx]`. let page_locations = offset_index .filter(|i| !i[rg_idx].is_empty()) - .map(|i| i[rg_idx][self.column_idx].clone()); + .map(|i| i[rg_idx][self.column_idx].page_locations.clone()); let total_rows = rg.num_rows() as usize; let reader = self.reader.clone(); diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 00a3ad90d006..6a1434bce906 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1349,7 +1349,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1378,7 +1380,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1408,7 +1412,9 @@ impl<'a> StatisticsConverter<'a> { let iter = row_group_indices.into_iter().map(|rg_index| { let column_page_index_per_row_group_per_column = &column_page_index[*rg_index][parquet_index]; - let num_data_pages = &column_offset_index[*rg_index][parquet_index].len(); + let num_data_pages = &column_offset_index[*rg_index][parquet_index] + .page_locations() + .len(); (*num_data_pages, column_page_index_per_row_group_per_column) }); @@ -1450,7 +1456,7 @@ impl<'a> StatisticsConverter<'a> { let mut row_count_total = Vec::new(); for rg_idx in row_group_indices { - let page_locations = &column_offset_index[*rg_idx][parquet_index]; + let page_locations = &column_offset_index[*rg_idx][parquet_index].page_locations(); let row_count_per_page = page_locations .windows(2) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index fc37ebfb4510..2d23ad8510f9 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -96,6 +96,7 @@ macro_rules! 
downcast_op { struct FallbackEncoder { encoder: FallbackEncoderImpl, num_values: usize, + variable_length_bytes: i64, } /// The fallback encoder in use @@ -152,6 +153,7 @@ impl FallbackEncoder { Ok(Self { encoder, num_values: 0, + variable_length_bytes: 0, }) } @@ -168,7 +170,8 @@ impl FallbackEncoder { let value = values.value(*idx); let value = value.as_ref(); buffer.extend_from_slice((value.len() as u32).as_bytes()); - buffer.extend_from_slice(value) + buffer.extend_from_slice(value); + self.variable_length_bytes += value.len() as i64; } } FallbackEncoderImpl::DeltaLength { buffer, lengths } => { @@ -177,6 +180,7 @@ impl FallbackEncoder { let value = value.as_ref(); lengths.put(&[value.len() as i32]).unwrap(); buffer.extend_from_slice(value); + self.variable_length_bytes += value.len() as i64; } } FallbackEncoderImpl::Delta { @@ -205,6 +209,7 @@ impl FallbackEncoder { buffer.extend_from_slice(&value[prefix_length..]); prefix_lengths.put(&[prefix_length as i32]).unwrap(); suffix_lengths.put(&[suffix_length as i32]).unwrap(); + self.variable_length_bytes += value.len() as i64; } } } @@ -269,12 +274,17 @@ impl FallbackEncoder { } }; + // Capture value of variable_length_bytes and reset for next page + let variable_length_bytes = Some(self.variable_length_bytes); + self.variable_length_bytes = 0; + Ok(DataPageValues { buf: buf.into(), num_values: std::mem::take(&mut self.num_values), encoding, min_value, max_value, + variable_length_bytes, }) } } @@ -321,6 +331,7 @@ impl Storage for ByteArrayStorage { struct DictEncoder { interner: Interner, indices: Vec, + variable_length_bytes: i64, } impl DictEncoder { @@ -336,6 +347,7 @@ impl DictEncoder { let value = values.value(*idx); let interned = self.interner.intern(value.as_ref()); self.indices.push(interned); + self.variable_length_bytes += value.as_ref().len() as i64; } } @@ -384,12 +396,17 @@ impl DictEncoder { self.indices.clear(); + // Capture value of variable_length_bytes and reset for next page + let variable_length_bytes = Some(self.variable_length_bytes); + self.variable_length_bytes = 0; + DataPageValues { buf: encoder.consume().into(), num_values, encoding: Encoding::RLE_DICTIONARY, min_value, max_value, + variable_length_bytes, } } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index cf46f3b64a57..8f7b514ccf71 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -43,7 +43,7 @@ use crate::column::writer::{ }; use crate::data_type::{ByteArray, FixedLenByteArray}; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaDataPtr}; +use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaData}; use crate::file::properties::{WriterProperties, WriterPropertiesPtr}; use crate::file::reader::{ChunkReader, Length}; use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter}; @@ -204,7 +204,7 @@ impl ArrowWriter { } /// Returns metadata for any flushed row groups - pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] { self.writer.flushed_row_groups() } @@ -1096,8 +1096,10 @@ mod tests { use crate::data_type::AsBytes; use crate::file::metadata::ParquetMetaData; use crate::file::page_index::index::Index; - use crate::file::page_index::index_reader::read_pages_locations; - use crate::file::properties::{EnabledStatistics, ReaderProperties, WriterVersion}; + use 
crate::file::page_index::index_reader::read_offset_indexes; + use crate::file::properties::{ + BloomFilterPosition, EnabledStatistics, ReaderProperties, WriterVersion, + }; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::file::{ reader::{FileReader, SerializedFileReader}, @@ -1667,16 +1669,16 @@ mod tests { "Expected a dictionary page" ); - let page_locations = read_pages_locations(&file, column).unwrap(); + let offset_indexes = read_offset_indexes(&file, column).unwrap(); - let offset_index = page_locations[0].clone(); + let page_locations = offset_indexes[0].page_locations.clone(); // We should fallback to PLAIN encoding after the first row and our max page size is 1 bytes // so we expect one dictionary encoded page and then a page per row thereafter. assert_eq!( - offset_index.len(), + page_locations.len(), 10, - "Expected 9 pages but got {offset_index:#?}" + "Expected 9 pages but got {page_locations:#?}" ); } @@ -1745,6 +1747,7 @@ mod tests { values: ArrayRef, schema: SchemaRef, bloom_filter: bool, + bloom_filter_position: BloomFilterPosition, } impl RoundTripOptions { @@ -1755,6 +1758,7 @@ mod tests { values, schema: Arc::new(schema), bloom_filter: false, + bloom_filter_position: BloomFilterPosition::AfterRowGroup, } } } @@ -1774,6 +1778,7 @@ mod tests { values, schema, bloom_filter, + bloom_filter_position, } = options; let encodings = match values.data_type() { @@ -1814,6 +1819,7 @@ mod tests { .set_dictionary_page_size_limit(dictionary_size.max(1)) .set_encoding(*encoding) .set_bloom_filter_enabled(bloom_filter) + .set_bloom_filter_position(bloom_filter_position) .build(); files.push(roundtrip_opts(&expected_batch, props)) @@ -2171,6 +2177,22 @@ mod tests { values_required::(many_vecs_iter); } + #[test] + fn i32_column_bloom_filter_at_end() { + let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32)); + let mut options = RoundTripOptions::new(array, false); + options.bloom_filter = true; + options.bloom_filter_position = BloomFilterPosition::End; + + let files = one_column_roundtrip_with_options(options); + check_bloom_filter( + files, + "col".to_string(), + (0..SMALL_SIZE as i32).collect(), + (SMALL_SIZE as i32 + 1..SMALL_SIZE as i32 + 10).collect(), + ); + } + #[test] fn i32_column_bloom_filter() { let array = Arc::new(Int32Array::from_iter(0..SMALL_SIZE as i32)); @@ -2998,8 +3020,8 @@ mod tests { assert_eq!(index.len(), 1); assert_eq!(index[0].len(), 2); // 2 columns - assert_eq!(index[0][0].len(), 1); // 1 page - assert_eq!(index[0][1].len(), 1); // 1 page + assert_eq!(index[0][0].page_locations().len(), 1); // 1 page + assert_eq!(index[0][1].page_locations().len(), 1); // 1 page } #[test] diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index e4205b7ef2ce..5695dbc10fe1 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -106,9 +106,10 @@ use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::{ChunkReader, Length, SerializedPageReader}; use crate::file::FOOTER_SIZE; -use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash, PageLocation}; +use crate::format::{BloomFilterAlgorithm, BloomFilterCompression, BloomFilterHash}; mod metadata; pub use metadata::*; @@ -489,7 +490,7 @@ where // TODO: 
calling build_array multiple times is wasteful let meta = self.metadata.row_group(row_group_idx); - let page_locations = self + let offset_index = self .metadata .offset_index() .map(|x| x[row_group_idx].as_slice()); @@ -499,7 +500,7 @@ where // schema: meta.schema_descr_ptr(), row_count: meta.num_rows() as usize, column_chunks: vec![None; meta.columns().len()], - page_locations, + offset_index, }; if let Some(filter) = self.filter.as_mut() { @@ -703,7 +704,7 @@ where /// An in-memory collection of column chunks struct InMemoryRowGroup<'a> { metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, column_chunks: Vec>>, row_count: usize, } @@ -716,7 +717,7 @@ impl<'a> InMemoryRowGroup<'a> { projection: &ProjectionMask, selection: Option<&RowSelection>, ) -> Result<()> { - if let Some((selection, page_locations)) = selection.zip(self.page_locations) { + if let Some((selection, offset_index)) = selection.zip(self.offset_index) { // If we have a `RowSelection` and an `OffsetIndex` then only fetch pages required for the // `RowSelection` let mut page_start_offsets: Vec> = vec![]; @@ -734,14 +735,14 @@ impl<'a> InMemoryRowGroup<'a> { // then we need to also fetch a dictionary page. let mut ranges = vec![]; let (start, _len) = chunk_meta.byte_range(); - match page_locations[idx].first() { + match offset_index[idx].page_locations.first() { Some(first) if first.offset as u64 != start => { ranges.push(start as usize..first.offset as usize); } _ => (), } - ranges.extend(selection.scan_ranges(&page_locations[idx])); + ranges.extend(selection.scan_ranges(&offset_index[idx].page_locations)); page_start_offsets.push(ranges.iter().map(|range| range.start).collect()); ranges @@ -812,7 +813,9 @@ impl<'a> RowGroups for InMemoryRowGroup<'a> { "Invalid column index {i}, column was not fetched" ))), Some(data) => { - let page_locations = self.page_locations.map(|index| index[i].clone()); + let page_locations = self + .offset_index + .map(|index| index[i].page_locations.clone()); let page_reader: Box = Box::new(SerializedPageReader::new( data.clone(), self.metadata.column(i), @@ -1529,7 +1532,7 @@ mod tests { let metadata = parse_metadata(&data).unwrap(); let offset_index = - index_reader::read_pages_locations(&data, metadata.row_group(0).columns()) + index_reader::read_offset_indexes(&data, metadata.row_group(0).columns()) .expect("reading offset index"); let row_group_meta = metadata.row_group(0).clone(); @@ -1574,7 +1577,7 @@ mod tests { }; let mut skip = true; - let mut pages = offset_index[0].iter().peekable(); + let mut pages = offset_index[0].page_locations.iter().peekable(); // Setup `RowSelection` so that we can skip every other page, selecting the last page let mut selectors = vec![]; diff --git a/parquet/src/arrow/async_writer/mod.rs b/parquet/src/arrow/async_writer/mod.rs index edeb0fec00b7..274d8fef8976 100644 --- a/parquet/src/arrow/async_writer/mod.rs +++ b/parquet/src/arrow/async_writer/mod.rs @@ -54,7 +54,7 @@ use crate::{ arrow::arrow_writer::ArrowWriterOptions, arrow::ArrowWriter, errors::{ParquetError, Result}, - file::{metadata::RowGroupMetaDataPtr, properties::WriterProperties}, + file::{metadata::RowGroupMetaData, properties::WriterProperties}, format::{FileMetaData, KeyValue}, }; use arrow_array::RecordBatch; @@ -172,7 +172,7 @@ impl AsyncArrowWriter { } /// Returns metadata for any flushed row groups - pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] { 
self.sync_writer.flushed_row_groups() } diff --git a/parquet/src/bin/parquet-index.rs b/parquet/src/bin/parquet-index.rs index 86e08b6dafa3..1a9b74dd78fb 100644 --- a/parquet/src/bin/parquet-index.rs +++ b/parquet/src/bin/parquet-index.rs @@ -37,6 +37,7 @@ use clap::Parser; use parquet::errors::{ParquetError, Result}; use parquet::file::page_index::index::{Index, PageIndex}; +use parquet::file::page_index::offset_index::OffsetIndexMetaData; use parquet::file::reader::{FileReader, SerializedFileReader}; use parquet::file::serialized_reader::ReadOptionsBuilder; use parquet::format::PageLocation; @@ -93,7 +94,8 @@ impl Args { )) })?; - let row_counts = compute_row_counts(offset_index, row_group.num_rows()); + let row_counts = + compute_row_counts(offset_index.page_locations.as_slice(), row_group.num_rows()); match &column_indices[column_idx] { Index::NONE => println!("NO INDEX"), Index::BOOLEAN(v) => print_index(&v.indexes, offset_index, &row_counts)?, @@ -131,20 +133,20 @@ fn compute_row_counts(offset_index: &[PageLocation], rows: i64) -> Vec { /// Prints index information for a single column chunk fn print_index( column_index: &[PageIndex], - offset_index: &[PageLocation], + offset_index: &OffsetIndexMetaData, row_counts: &[i64], ) -> Result<()> { - if column_index.len() != offset_index.len() { + if column_index.len() != offset_index.page_locations.len() { return Err(ParquetError::General(format!( "Index length mismatch, got {} and {}", column_index.len(), - offset_index.len() + offset_index.page_locations.len() ))); } for (idx, ((c, o), row_count)) in column_index .iter() - .zip(offset_index) + .zip(offset_index.page_locations()) .zip(row_counts) .enumerate() { diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index b6c8212608b8..9d01c09040de 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -63,6 +63,7 @@ pub struct DataPageValues { pub encoding: Encoding, pub min_value: Option, pub max_value: Option, + pub variable_length_bytes: Option, } /// A generic encoder of [`ColumnValues`] to data and dictionary pages used by @@ -131,6 +132,7 @@ pub struct ColumnValueEncoderImpl { min_value: Option, max_value: Option, bloom_filter: Option, + variable_length_bytes: Option, } impl ColumnValueEncoderImpl { @@ -150,6 +152,10 @@ impl ColumnValueEncoderImpl { update_min(&self.descr, &min, &mut self.min_value); update_max(&self.descr, &max, &mut self.max_value); } + + if let Some(var_bytes) = T::T::variable_length_bytes(slice) { + *self.variable_length_bytes.get_or_insert(0) += var_bytes; + } } // encode the values into bloom filter if enabled @@ -203,6 +209,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { bloom_filter, min_value: None, max_value: None, + variable_length_bytes: None, }) } @@ -296,6 +303,7 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { num_values: std::mem::take(&mut self.num_values), min_value: self.min_value.take(), max_value: self.max_value.take(), + variable_length_bytes: self.variable_length_bytes.take(), }) } } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index fdc24890e1fa..2c0c957d87d3 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -192,7 +192,8 @@ struct PageMetrics { } // Metrics per column writer -struct ColumnMetrics { +#[derive(Default)] +struct ColumnMetrics { total_bytes_written: u64, total_rows_written: u64, total_uncompressed_size: u64, @@ -204,6 +205,20 @@ struct ColumnMetrics { 
max_column_value: Option, num_column_nulls: u64, column_distinct_count: Option, + variable_length_bytes: Option, +} + +impl ColumnMetrics { + fn new() -> Self { + Default::default() + } + + /// Sum the provided page variable_length_bytes into the chunk variable_length_bytes + fn update_variable_length_bytes(&mut self, variable_length_bytes: Option) { + if let Some(var_bytes) = variable_length_bytes { + *self.variable_length_bytes.get_or_insert(0) += var_bytes; + } + } } /// Typed column writer for a primitive column. @@ -282,19 +297,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { num_buffered_rows: 0, num_page_nulls: 0, }, - column_metrics: ColumnMetrics { - total_bytes_written: 0, - total_rows_written: 0, - total_uncompressed_size: 0, - total_compressed_size: 0, - total_num_values: 0, - dictionary_page_offset: None, - data_page_offset: None, - min_column_value: None, - max_column_value: None, - num_column_nulls: 0, - column_distinct_count: None, - }, + column_metrics: ColumnMetrics::::new(), column_index_builder, offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -640,7 +643,11 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } /// Update the column index and offset index when adding the data page - fn update_column_offset_index(&mut self, page_statistics: Option<&ValueStatistics>) { + fn update_column_offset_index( + &mut self, + page_statistics: Option<&ValueStatistics>, + page_variable_length_bytes: Option, + ) { // update the column index let null_page = (self.page_metrics.num_buffered_rows as u64) == self.page_metrics.num_page_nulls; @@ -714,6 +721,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // update the offset index self.offset_index_builder .append_row_count(self.page_metrics.num_buffered_rows as i64); + + self.offset_index_builder + .append_unencoded_byte_array_data_bytes(page_variable_length_bytes); } /// Determine if we should allow truncating min/max values for this column's statistics @@ -789,7 +799,15 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { }; // update column and offset index - self.update_column_offset_index(page_statistics.as_ref()); + self.update_column_offset_index( + page_statistics.as_ref(), + values_data.variable_length_bytes, + ); + + // Update variable_length_bytes in column_metrics + self.column_metrics + .update_variable_length_bytes(values_data.variable_length_bytes); + let page_statistics = page_statistics.map(Statistics::from); let compressed_page = match self.props.writer_version() { @@ -999,7 +1017,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { stats => stats, }; - builder = builder.set_statistics(statistics); + builder = builder + .set_statistics(statistics) + .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes); } let metadata = builder.build()?; diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index b85a75cfd410..01e92115c45b 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -644,6 +644,13 @@ pub(crate) mod private { (std::mem::size_of::(), 1) } + /// Return the number of variable length bytes in a given slice of data + /// + /// Returns the sum of lengths for BYTE_ARRAY data, and None for all other data types + fn variable_length_bytes(_: &[Self]) -> Option { + None + } + /// Return the value as i64 if possible /// /// This is essentially the same as `std::convert::TryInto` but can't be @@ -956,6 +963,10 @@ pub(crate) mod private { Ok(num_values) } + fn variable_length_bytes(values: 
&[Self]) -> Option { + Some(values.iter().map(|x| x.len() as i64).sum()) + } + fn skip(decoder: &mut PlainDecoderDetails, num_values: usize) -> Result { let data = decoder .data diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 57d5aaa2dd2f..0b6d1f0d1a24 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -23,6 +23,7 @@ use crate::data_type::private::ParquetValueType; use crate::file::metadata::{ColumnChunkMetaData, FileMetaData, KeyValue, RowGroupMetaData}; use crate::file::page_encoding_stats::PageEncodingStats; use crate::file::page_index::index::{Index, NativeIndex, PageIndex}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{Statistics, ValueStatistics}; use crate::format::{BoundaryOrder, PageLocation, SortingColumn}; use std::sync::Arc; @@ -97,6 +98,7 @@ impl HeapSize for ColumnChunkMetaData { + self.compression.heap_size() + self.statistics.heap_size() + self.encoding_stats.heap_size() + + self.unencoded_byte_array_data_bytes.heap_size() } } @@ -143,6 +145,12 @@ impl HeapSize for Statistics { } } +impl HeapSize for OffsetIndexMetaData { + fn heap_size(&self) -> usize { + self.page_locations.heap_size() + self.unencoded_byte_array_data_bytes.heap_size() + } +} + impl HeapSize for Index { fn heap_size(&self) -> usize { match self { diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 16c51c8115b9..52206e66a590 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -40,7 +40,7 @@ use std::sync::Arc; use crate::format::{ BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup, - SortingColumn, + SizeStatistics, SortingColumn, }; use crate::basic::{ColumnOrder, Compression, Encoding, Type}; @@ -48,6 +48,7 @@ use crate::errors::{ParquetError, Result}; pub(crate) use crate::file::metadata::memory::HeapSize; use crate::file::page_encoding_stats::{self, PageEncodingStats}; use crate::file::page_index::index::Index; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::statistics::{self, Statistics}; use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor, @@ -65,27 +66,23 @@ use crate::schema::types::{ /// [`Index`] corresponding to column `column_number` of row group /// `row_group_number`. /// -/// For example `column_index[2][3]` holds the [`Index`] for the forth +/// For example `column_index[2][3]` holds the [`Index`] for the fourth /// column in the third row group of the parquet file. /// /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub type ParquetColumnIndex = Vec>; -/// [`PageLocation`] for each data page of each row group of each column +/// [`OffsetIndexMetaData`] for each data page of each row group of each column /// /// This structure is the parsed representation of the [`OffsetIndex`] from the /// Parquet file footer, as described in the Parquet [PageIndex documentation]. /// -/// `offset_index[row_group_number][column_number][page_number]` holds -/// the [`PageLocation`] corresponding to page `page_number` of column +/// `offset_index[row_group_number][column_number]` holds +/// the [`OffsetIndexMetaData`] corresponding to column /// `column_number`of row group `row_group_number`. 
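// A minimal reader-side sketch (not part of this diff) of the flattened layout:
// the third Vec level is gone, so per-page information is reached through
// OffsetIndexMetaData. The helper name and the re-export path in the `use`
// below are illustrative assumptions, not APIs added by this change.
use parquet::file::metadata::ParquetOffsetIndex;

fn first_page_offset(index: &ParquetOffsetIndex, rg: usize, col: usize) -> Option<i64> {
    index
        .get(rg)?                  // &Vec<OffsetIndexMetaData> for row group `rg`
        .get(col)?                 // &OffsetIndexMetaData for column chunk `col`
        .page_locations()          // &Vec<PageLocation>, one entry per data page
        .first()
        .map(|loc| loc.offset)     // byte offset of the first data page
}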
/// -/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for -/// the fifth page of the forth column in the third row group of the -/// parquet file. -/// /// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md -pub type ParquetOffsetIndex = Vec>>; +pub type ParquetOffsetIndex = Vec>; /// Parsed metadata for a single Parquet file /// @@ -110,7 +107,7 @@ pub struct ParquetMetaData { row_groups: Vec, /// Page level index for each page in each column chunk column_index: Option, - /// Offset index for all each page in each column chunk + /// Offset index for each page in each column chunk offset_index: Option, } @@ -374,6 +371,11 @@ impl RowGroupMetaData { &self.columns } + /// Returns mutable slice of column chunk metadata. + pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] { + &mut self.columns + } + /// Number of rows in this row group. pub fn num_rows(&self) -> i64 { self.num_rows @@ -554,6 +556,7 @@ pub struct ColumnChunkMetaData { offset_index_length: Option, column_index_offset: Option, column_index_length: Option, + unencoded_byte_array_data_bytes: Option, } /// Represents common operations for a column chunk. @@ -706,6 +709,14 @@ impl ColumnChunkMetaData { Some(offset..(offset + length)) } + /// Returns the number of bytes of variable length data after decoding. + /// + /// Only set for BYTE_ARRAY columns. This field may not be set by older + /// writers. + pub fn unencoded_byte_array_data_bytes(&self) -> Option { + self.unencoded_byte_array_data_bytes + } + /// Method to convert from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { @@ -743,6 +754,12 @@ impl ColumnChunkMetaData { let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; let column_index_length = cc.column_index_length; + let unencoded_byte_array_data_bytes = if let Some(size_stats) = col_metadata.size_statistics + { + size_stats.unencoded_byte_array_data_bytes + } else { + None + }; let result = ColumnChunkMetaData { column_descr, @@ -764,6 +781,7 @@ impl ColumnChunkMetaData { offset_index_length, column_index_offset, column_index_length, + unencoded_byte_array_data_bytes, }; Ok(result) } @@ -787,6 +805,16 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift `ColumnMetaData` pub fn to_column_metadata_thrift(&self) -> ColumnMetaData { + let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() { + Some(SizeStatistics { + unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, + repetition_level_histogram: None, + definition_level_histogram: None, + }) + } else { + None + }; + ColumnMetaData { type_: self.column_type().into(), encodings: self.encodings().iter().map(|&v| v.into()).collect(), @@ -806,6 +834,7 @@ impl ColumnChunkMetaData { .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()), bloom_filter_offset: self.bloom_filter_offset, bloom_filter_length: self.bloom_filter_length, + size_statistics, } } @@ -841,6 +870,7 @@ impl ColumnChunkMetaDataBuilder { offset_index_length: None, column_index_offset: None, column_index_length: None, + unencoded_byte_array_data_bytes: None, }) } @@ -952,6 +982,12 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets optional length of variable length data in bytes. + pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option) -> Self { + self.0.unencoded_byte_array_data_bytes = value; + self + } + /// Builds column chunk metadata. 
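// Sketch of consuming the new field from parsed metadata (assumed helper, not
// added by this change): total decoded BYTE_ARRAY size of one leaf column
// across all row groups, yielding None when any chunk comes from an older
// writer that did not record size statistics.
use parquet::file::metadata::ParquetMetaData;

fn total_unencoded_bytes(meta: &ParquetMetaData, col: usize) -> Option<i64> {
    meta.row_groups()
        .iter()
        .map(|rg| rg.column(col).unencoded_byte_array_data_bytes())
        .sum() // Sum over Option<i64> short-circuits to None on a missing value
}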
pub fn build(self) -> Result { Ok(self.0) @@ -1033,6 +1069,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, + None, + None, ) } } @@ -1044,6 +1082,7 @@ pub struct OffsetIndexBuilder { offset_array: Vec, compressed_page_size_array: Vec, first_row_index_array: Vec, + unencoded_byte_array_data_bytes_array: Option>, current_first_row_index: i64, } @@ -1059,6 +1098,7 @@ impl OffsetIndexBuilder { offset_array: Vec::new(), compressed_page_size_array: Vec::new(), first_row_index_array: Vec::new(), + unencoded_byte_array_data_bytes_array: None, current_first_row_index: 0, } } @@ -1074,6 +1114,17 @@ impl OffsetIndexBuilder { self.compressed_page_size_array.push(compressed_page_size); } + pub fn append_unencoded_byte_array_data_bytes( + &mut self, + unencoded_byte_array_data_bytes: Option, + ) { + if let Some(val) = unencoded_byte_array_data_bytes { + self.unencoded_byte_array_data_bytes_array + .get_or_insert(Vec::new()) + .push(val); + } + } + /// Build and get the thrift metadata of offset index pub fn build_to_thrift(self) -> OffsetIndex { let locations = self @@ -1083,7 +1134,7 @@ impl OffsetIndexBuilder { .zip(self.first_row_index_array.iter()) .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index)) .collect::>(); - OffsetIndex::new(locations) + OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array) } } @@ -1234,6 +1285,7 @@ mod tests { .set_offset_index_length(Some(25)) .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) + .set_unencoded_byte_array_data_bytes(Some(2000)) .build() .unwrap(); @@ -1345,7 +1397,8 @@ mod tests { let row_group_meta_with_stats = vec![row_group_meta_with_stats]; let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats); - let base_expected_size = 2024; + let base_expected_size = 2088; + assert_eq!(parquet_meta.memory_size(), base_expected_size); let mut column_index = ColumnIndexBuilder::new(); @@ -1354,17 +1407,25 @@ mod tests { let native_index = NativeIndex::::try_new(column_index).unwrap(); // Now, add in OffsetIndex + let mut offset_index = OffsetIndexBuilder::new(); + offset_index.append_row_count(1); + offset_index.append_offset_and_size(2, 3); + offset_index.append_unencoded_byte_array_data_bytes(Some(10)); + offset_index.append_row_count(1); + offset_index.append_offset_and_size(2, 3); + offset_index.append_unencoded_byte_array_data_bytes(Some(10)); + let offset_index = offset_index.build_to_thrift(); + let parquet_meta = ParquetMetaData::new_with_page_index( file_metadata, row_group_meta, Some(vec![vec![Index::BOOLEAN(native_index)]]), Some(vec![vec![ - vec![PageLocation::new(1, 2, 3)], - vec![PageLocation::new(1, 2, 3)], + OffsetIndexMetaData::try_new(offset_index).unwrap() ]]), ); - let bigger_expected_size = 2304; + let bigger_expected_size = 2400; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/page_index/index_reader.rs b/parquet/src/file/page_index/index_reader.rs index 2ddf826fb022..395e9afe122c 100644 --- a/parquet/src/file/page_index/index_reader.rs +++ b/parquet/src/file/page_index/index_reader.rs @@ -22,6 +22,7 @@ use crate::data_type::Int96; use crate::errors::ParquetError; use crate::file::metadata::ColumnChunkMetaData; use crate::file::page_index::index::{Index, NativeIndex}; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::reader::ChunkReader; 
use crate::format::{ColumnIndex, OffsetIndex, PageLocation}; use crate::thrift::{TCompactSliceInputProtocol, TSerializable}; @@ -45,9 +46,9 @@ pub(crate) fn acc_range(a: Option>, b: Option>) -> Opt /// Returns an empty vector if this row group does not contain a /// [`ColumnIndex`]. /// -/// See [Column Index Documentation] for more details. +/// See [Page Index Documentation] for more details. /// -/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub fn read_columns_indexes( reader: &R, chunks: &[ColumnChunkMetaData], @@ -81,9 +82,10 @@ pub fn read_columns_indexes( /// Return an empty vector if this row group does not contain an /// [`OffsetIndex]`. /// -/// See [Column Index Documentation] for more details. +/// See [Page Index Documentation] for more details. /// -/// [Column Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +#[deprecated(since = "53.0.0", note = "Use read_offset_indexes")] pub fn read_pages_locations( reader: &R, chunks: &[ColumnChunkMetaData], @@ -100,6 +102,42 @@ pub fn read_pages_locations( let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; + chunks + .iter() + .map(|c| match c.offset_index_range() { + Some(r) => decode_page_locations(get(r)), + None => Err(general_err!("missing offset index")), + }) + .collect() +} + +/// Reads per-column [`OffsetIndexMetaData`] for all columns of a row group by +/// decoding [`OffsetIndex`] . +/// +/// Returns a vector of `offset_index[column_number]`. +/// +/// Returns an empty vector if this row group does not contain an +/// [`OffsetIndex`]. +/// +/// See [Page Index Documentation] for more details. 
+/// +/// [Page Index Documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +pub fn read_offset_indexes( + reader: &R, + chunks: &[ColumnChunkMetaData], +) -> Result, ParquetError> { + let fetch = chunks + .iter() + .fold(None, |range, c| acc_range(range, c.offset_index_range())); + + let fetch = match fetch { + Some(r) => r, + None => return Ok(vec![]), + }; + + let bytes = reader.get_bytes(fetch.start as _, fetch.end - fetch.start)?; + let get = |r: Range| &bytes[(r.start - fetch.start)..(r.end - fetch.start)]; + chunks .iter() .map(|c| match c.offset_index_range() { @@ -109,7 +147,13 @@ pub fn read_pages_locations( .collect() } -pub(crate) fn decode_offset_index(data: &[u8]) -> Result, ParquetError> { +pub(crate) fn decode_offset_index(data: &[u8]) -> Result { + let mut prot = TCompactSliceInputProtocol::new(data); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + OffsetIndexMetaData::try_new(offset) +} + +pub(crate) fn decode_page_locations(data: &[u8]) -> Result, ParquetError> { let mut prot = TCompactSliceInputProtocol::new(data); let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; Ok(offset.page_locations) diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index 9372645d76ee..a8077896db34 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -21,3 +21,4 @@ pub mod index; pub mod index_reader; +pub mod offset_index; diff --git a/parquet/src/file/page_index/offset_index.rs b/parquet/src/file/page_index/offset_index.rs new file mode 100644 index 000000000000..2ae3464141ca --- /dev/null +++ b/parquet/src/file/page_index/offset_index.rs @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`OffsetIndexMetaData`] structure holding decoded [`OffsetIndex`] information + +use crate::errors::ParquetError; +use crate::format::{OffsetIndex, PageLocation}; + +/// [`OffsetIndex`] information for a column chunk. Contains offsets and sizes for each page +/// in the chunk. Optionally stores fully decoded page sizes for BYTE_ARRAY columns. +#[derive(Debug, Clone, PartialEq)] +pub struct OffsetIndexMetaData { + pub page_locations: Vec, + pub unencoded_byte_array_data_bytes: Option>, +} + +impl OffsetIndexMetaData { + /// Creates a new [`OffsetIndexMetaData`] from an [`OffsetIndex`]. + pub(crate) fn try_new(index: OffsetIndex) -> Result { + Ok(Self { + page_locations: index.page_locations, + unencoded_byte_array_data_bytes: index.unencoded_byte_array_data_bytes, + }) + } + + /// Vector of [`PageLocation`] objects, one per page in the chunk. + pub fn page_locations(&self) -> &Vec { + &self.page_locations + } + + /// Optional vector of unencoded page sizes, one per page in the chunk. 
Only defined + /// for BYTE_ARRAY columns. + pub fn unencoded_byte_array_data_bytes(&self) -> Option<&Vec> { + self.unencoded_byte_array_data_bytes.as_ref() + } + + // TODO: remove annotation after merge + #[allow(dead_code)] + pub(crate) fn to_thrift(&self) -> OffsetIndex { + OffsetIndex::new( + self.page_locations.clone(), + self.unencoded_byte_array_data_bytes.clone(), + ) + } +} diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index cd5969fda0be..61f6390c97d4 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -45,6 +45,8 @@ pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Pag pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096; /// Default value for [`WriterProperties::max_row_group_size`] pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024; +/// Default value for [`WriterProperties::bloom_filter_position`] +pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup; /// Default value for [`WriterProperties::created_by`] pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION")); /// Default value for [`WriterProperties::column_index_truncate_length`] @@ -88,6 +90,24 @@ impl FromStr for WriterVersion { } } +/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should +/// write Bloom filters +/// +/// Basic constant, which is not part of the Thrift definition. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BloomFilterPosition { + /// Write Bloom Filters of each row group right after the row group + /// + /// This saves memory by writing it as soon as it is computed, at the cost + /// of data locality for readers + AfterRowGroup, + /// Write Bloom Filters at the end of the file + /// + /// This allows better data locality for readers, at the cost of memory usage + /// for writers. + End, +} + /// Reference counted writer properties. pub type WriterPropertiesPtr = Arc; @@ -132,6 +152,7 @@ pub struct WriterProperties { data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, + bloom_filter_position: BloomFilterPosition, writer_version: WriterVersion, created_by: String, pub(crate) key_value_metadata: Option>, @@ -219,6 +240,11 @@ impl WriterProperties { self.max_row_group_size } + /// Returns maximum number of rows in a row group. + pub fn bloom_filter_position(&self) -> BloomFilterPosition { + self.bloom_filter_position + } + /// Returns configured writer version. 
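// Usage sketch for the new writer option (the corresponding builder method is
// added further down in this diff): request Bloom filters and defer writing
// them until the end of the file. `set_bloom_filter_enabled` is assumed to be
// the pre-existing toggle; only `set_bloom_filter_position` is new here.
use parquet::file::properties::{BloomFilterPosition, WriterProperties};

fn props_with_trailing_bloom_filters() -> WriterProperties {
    WriterProperties::builder()
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_position(BloomFilterPosition::End)
        .build()
}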
pub fn writer_version(&self) -> WriterVersion { self.writer_version @@ -340,6 +366,7 @@ pub struct WriterPropertiesBuilder { data_page_row_count_limit: usize, write_batch_size: usize, max_row_group_size: usize, + bloom_filter_position: BloomFilterPosition, writer_version: WriterVersion, created_by: String, key_value_metadata: Option>, @@ -359,6 +386,7 @@ impl WriterPropertiesBuilder { data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT, write_batch_size: DEFAULT_WRITE_BATCH_SIZE, max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE, + bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION, writer_version: DEFAULT_WRITER_VERSION, created_by: DEFAULT_CREATED_BY.to_string(), key_value_metadata: None, @@ -378,6 +406,7 @@ impl WriterPropertiesBuilder { data_page_row_count_limit: self.data_page_row_count_limit, write_batch_size: self.write_batch_size, max_row_group_size: self.max_row_group_size, + bloom_filter_position: self.bloom_filter_position, writer_version: self.writer_version, created_by: self.created_by, key_value_metadata: self.key_value_metadata, @@ -489,6 +518,12 @@ impl WriterPropertiesBuilder { self } + /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`) + pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self { + self.bloom_filter_position = value; + self + } + /// Sets "created by" property (defaults to `parquet-rs version `). pub fn set_created_by(mut self, value: String) -> Self { self.created_by = value; @@ -1054,6 +1089,7 @@ mod tests { ); assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE); assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE); + assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION); assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION); assert_eq!(props.created_by(), DEFAULT_CREATED_BY); assert_eq!(props.key_value_metadata(), None); diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index ac7d2d287488..70aea6fd5ad3 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -28,6 +28,7 @@ use crate::column::page::{Page, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; use crate::file::page_index::index_reader; +use crate::file::page_index::offset_index::OffsetIndexMetaData; use crate::file::{ footer, metadata::*, @@ -214,7 +215,7 @@ impl SerializedFileReader { for rg in &mut filtered_row_groups { let column_index = index_reader::read_columns_indexes(&chunk_reader, rg.columns())?; - let offset_index = index_reader::read_pages_locations(&chunk_reader, rg.columns())?; + let offset_index = index_reader::read_offset_indexes(&chunk_reader, rg.columns())?; columns_indexes.push(column_index); offset_indexes.push(offset_index); } @@ -285,7 +286,7 @@ impl FileReader for SerializedFileReader { pub struct SerializedRowGroupReader<'a, R: ChunkReader> { chunk_reader: Arc, metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, props: ReaderPropertiesPtr, bloom_filters: Vec>, } @@ -295,7 +296,7 @@ impl<'a, R: ChunkReader> SerializedRowGroupReader<'a, R> { pub fn new( chunk_reader: Arc, metadata: &'a RowGroupMetaData, - page_locations: Option<&'a [Vec]>, + offset_index: Option<&'a [OffsetIndexMetaData]>, props: ReaderPropertiesPtr, ) -> Result { let bloom_filters = if props.read_bloom_filter() { @@ -310,7 +311,7 @@ impl<'a, R: ChunkReader> 
SerializedRowGroupReader<'a, R> { Ok(Self { chunk_reader, metadata, - page_locations, + offset_index, props, bloom_filters, }) @@ -330,7 +331,7 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for SerializedRowGroupReader<' fn get_column_page_reader(&self, i: usize) -> Result> { let col = self.metadata.column(i); - let page_locations = self.page_locations.map(|x| x[i].clone()); + let page_locations = self.offset_index.map(|x| x[i].page_locations.clone()); let props = Arc::clone(&self.props); Ok(Box::new(SerializedPageReader::new_with_properties( @@ -776,7 +777,7 @@ mod tests { use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, FixedLenByteArrayType}; use crate::file::page_index::index::{Index, NativeIndex}; - use crate::file::page_index::index_reader::{read_columns_indexes, read_pages_locations}; + use crate::file::page_index::index_reader::{read_columns_indexes, read_offset_indexes}; use crate::file::writer::SerializedFileWriter; use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; @@ -1314,7 +1315,7 @@ mod tests { // only one row group assert_eq!(offset_indexes.len(), 1); let offset_index = &offset_indexes[0]; - let page_offset = &offset_index[0][0]; + let page_offset = &offset_index[0].page_locations()[0]; assert_eq!(4, page_offset.offset); assert_eq!(152, page_offset.compressed_page_size); @@ -1337,8 +1338,8 @@ mod tests { b.reverse(); assert_eq!(a, b); - let a = read_pages_locations(&test_file, columns).unwrap(); - let mut b = read_pages_locations(&test_file, &reversed).unwrap(); + let a = read_offset_indexes(&test_file, columns).unwrap(); + let mut b = read_offset_indexes(&test_file, &reversed).unwrap(); b.reverse(); assert_eq!(a, b); } @@ -1375,7 +1376,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 0), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[0].len(), 325); + assert_eq!(row_group_offset_indexes[0].page_locations.len(), 325); } else { unreachable!() }; @@ -1383,7 +1384,7 @@ mod tests { assert!(&column_index[0][1].is_sorted()); if let Index::BOOLEAN(index) = &column_index[0][1] { assert_eq!(index.indexes.len(), 82); - assert_eq!(row_group_offset_indexes[1].len(), 82); + assert_eq!(row_group_offset_indexes[1].page_locations.len(), 82); } else { unreachable!() }; @@ -1396,7 +1397,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 2), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[2].len(), 325); + assert_eq!(row_group_offset_indexes[2].page_locations.len(), 325); } else { unreachable!() }; @@ -1409,7 +1410,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 3), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[3].len(), 325); + assert_eq!(row_group_offset_indexes[3].page_locations.len(), 325); } else { unreachable!() }; @@ -1422,7 +1423,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 4), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[4].len(), 325); + assert_eq!(row_group_offset_indexes[4].page_locations.len(), 325); } else { unreachable!() }; @@ -1435,7 +1436,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 5), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[5].len(), 528); + assert_eq!(row_group_offset_indexes[5].page_locations.len(), 528); } else { unreachable!() }; @@ -1448,7 +1449,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 6), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[6].len(), 325); + 
assert_eq!(row_group_offset_indexes[6].page_locations.len(), 325); } else { unreachable!() }; @@ -1461,7 +1462,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 7), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[7].len(), 528); + assert_eq!(row_group_offset_indexes[7].page_locations.len(), 528); } else { unreachable!() }; @@ -1474,7 +1475,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 8), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[8].len(), 974); + assert_eq!(row_group_offset_indexes[8].page_locations.len(), 974); } else { unreachable!() }; @@ -1487,7 +1488,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 9), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[9].len(), 352); + assert_eq!(row_group_offset_indexes[9].page_locations.len(), 352); } else { unreachable!() }; @@ -1495,7 +1496,7 @@ mod tests { //Notice: min_max values for each page for this col not exits. assert!(!&column_index[0][10].is_sorted()); if let Index::NONE = &column_index[0][10] { - assert_eq!(row_group_offset_indexes[10].len(), 974); + assert_eq!(row_group_offset_indexes[10].page_locations.len(), 974); } else { unreachable!() }; @@ -1508,7 +1509,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 11), BoundaryOrder::ASCENDING, ); - assert_eq!(row_group_offset_indexes[11].len(), 325); + assert_eq!(row_group_offset_indexes[11].page_locations.len(), 325); } else { unreachable!() }; @@ -1521,7 +1522,7 @@ mod tests { get_row_group_min_max_bytes(row_group_metadata, 12), BoundaryOrder::UNORDERED, ); - assert_eq!(row_group_offset_indexes[12].len(), 325); + assert_eq!(row_group_offset_indexes[12].page_locations.len(), 325); } else { unreachable!() }; diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 90e985a95028..c44a7e6697f0 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -34,8 +34,9 @@ use crate::column::{ }; use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; +use crate::file::properties::{BloomFilterPosition, WriterPropertiesPtr}; use crate::file::reader::ChunkReader; -use crate::file::{metadata::*, properties::WriterPropertiesPtr, PARQUET_MAGIC}; +use crate::file::{metadata::*, PARQUET_MAGIC}; use crate::schema::types::{self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr}; /// A wrapper around a [`Write`] that keeps track of the number @@ -115,9 +116,10 @@ pub type OnCloseColumnChunk<'a> = Box Result<() /// - the row group metadata /// - the column index for each column chunk /// - the offset index for each column chunk -pub type OnCloseRowGroup<'a> = Box< +pub type OnCloseRowGroup<'a, W> = Box< dyn FnOnce( - RowGroupMetaDataPtr, + &'a mut TrackedWrite, + RowGroupMetaData, Vec>, Vec>, Vec>, @@ -143,7 +145,7 @@ pub struct SerializedFileWriter { schema: TypePtr, descr: SchemaDescPtr, props: WriterPropertiesPtr, - row_groups: Vec, + row_groups: Vec, bloom_filters: Vec>>, column_indexes: Vec>>, offset_indexes: Vec>>, @@ -197,18 +199,29 @@ impl SerializedFileWriter { self.row_group_index += 1; + let bloom_filter_position = self.properties().bloom_filter_position(); let row_groups = &mut self.row_groups; let row_bloom_filters = &mut self.bloom_filters; let row_column_indexes = &mut self.column_indexes; let row_offset_indexes = &mut self.offset_indexes; - let on_close = - |metadata, row_group_bloom_filter, row_group_column_index, row_group_offset_index| { - row_groups.push(metadata); - 
row_bloom_filters.push(row_group_bloom_filter); - row_column_indexes.push(row_group_column_index); - row_offset_indexes.push(row_group_offset_index); - Ok(()) + let on_close = move |buf, + mut metadata, + row_group_bloom_filter, + row_group_column_index, + row_group_offset_index| { + row_bloom_filters.push(row_group_bloom_filter); + row_column_indexes.push(row_group_column_index); + row_offset_indexes.push(row_group_offset_index); + // write bloom filters out immediately after the row group if requested + match bloom_filter_position { + BloomFilterPosition::AfterRowGroup => { + write_bloom_filters(buf, row_bloom_filters, &mut metadata)? + } + BloomFilterPosition::End => (), }; + row_groups.push(metadata); + Ok(()) + }; let row_group_writer = SerializedRowGroupWriter::new( self.descr.clone(), @@ -221,7 +234,7 @@ impl SerializedFileWriter { } /// Returns metadata for any flushed row groups - pub fn flushed_row_groups(&self) -> &[RowGroupMetaDataPtr] { + pub fn flushed_row_groups(&self) -> &[RowGroupMetaData] { &self.row_groups } @@ -273,34 +286,6 @@ impl SerializedFileWriter { Ok(()) } - /// Serialize all the bloom filter to the file - fn write_bloom_filters(&mut self, row_groups: &mut [RowGroup]) -> Result<()> { - // iter row group - // iter each column - // write bloom filter to the file - for (row_group_idx, row_group) in row_groups.iter_mut().enumerate() { - for (column_idx, column_chunk) in row_group.columns.iter_mut().enumerate() { - match &self.bloom_filters[row_group_idx][column_idx] { - Some(bloom_filter) => { - let start_offset = self.buf.bytes_written(); - bloom_filter.write(&mut self.buf)?; - let end_offset = self.buf.bytes_written(); - // set offset and index for bloom filter - let column_chunk_meta = column_chunk - .meta_data - .as_mut() - .expect("can't have bloom filter without column metadata"); - column_chunk_meta.bloom_filter_offset = Some(start_offset as i64); - column_chunk_meta.bloom_filter_length = - Some((end_offset - start_offset) as i32); - } - None => {} - } - } - } - Ok(()) - } - /// Serialize all the column index to the file fn write_column_indexes(&mut self, row_groups: &mut [RowGroup]) -> Result<()> { // iter row group @@ -331,6 +316,11 @@ impl SerializedFileWriter { self.finished = true; let num_rows = self.row_groups.iter().map(|x| x.num_rows()).sum(); + // write out any remaining bloom filters after all row groups + for row_group in &mut self.row_groups { + write_bloom_filters(&mut self.buf, &mut self.bloom_filters, row_group)?; + } + let mut row_groups = self .row_groups .as_slice() @@ -338,7 +328,6 @@ impl SerializedFileWriter { .map(|v| v.to_thrift()) .collect::>(); - self.write_bloom_filters(&mut row_groups)?; // Write column indexes and offset indexes self.write_column_indexes(&mut row_groups)?; self.write_offset_indexes(&mut row_groups)?; @@ -443,6 +432,40 @@ impl SerializedFileWriter { } } +/// Serialize all the bloom filters of the given row group to the given buffer, +/// and returns the updated row group metadata. 
+fn write_bloom_filters( + buf: &mut TrackedWrite, + bloom_filters: &mut [Vec>], + row_group: &mut RowGroupMetaData, +) -> Result<()> { + // iter row group + // iter each column + // write bloom filter to the file + + let row_group_idx: u16 = row_group + .ordinal() + .expect("Missing row group ordinal") + .try_into() + .expect("Negative row group ordinal"); + let row_group_idx = row_group_idx as usize; + for (column_idx, column_chunk) in row_group.columns_mut().iter_mut().enumerate() { + if let Some(bloom_filter) = bloom_filters[row_group_idx][column_idx].take() { + let start_offset = buf.bytes_written(); + bloom_filter.write(&mut *buf)?; + let end_offset = buf.bytes_written(); + // set offset and index for bloom filter + *column_chunk = column_chunk + .clone() + .into_builder() + .set_bloom_filter_offset(Some(start_offset as i64)) + .set_bloom_filter_length(Some((end_offset - start_offset) as i32)) + .build()?; + } + } + Ok(()) +} + /// Parquet row group writer API. /// Provides methods to access column writers in an iterator-like fashion, order is /// guaranteed to match the order of schema leaves (column descriptors). @@ -469,7 +492,7 @@ pub struct SerializedRowGroupWriter<'a, W: Write> { offset_indexes: Vec>, row_group_index: i16, file_offset: i64, - on_close: Option>, + on_close: Option>, } impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { @@ -486,7 +509,7 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { properties: WriterPropertiesPtr, buf: &'a mut TrackedWrite, row_group_index: i16, - on_close: Option>, + on_close: Option>, ) -> Self { let num_columns = schema_descr.num_columns(); let file_offset = buf.bytes_written() as i64; @@ -637,7 +660,8 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_total_uncompressed_size(metadata.uncompressed_size()) .set_num_values(metadata.num_values()) .set_data_page_offset(map_offset(src_data_offset)) - .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)); + .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)) + .set_unencoded_byte_array_data_bytes(metadata.unencoded_byte_array_data_bytes()); if let Some(statistics) = metadata.statistics() { builder = builder.set_statistics(statistics.clone()) @@ -670,12 +694,12 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_file_offset(self.file_offset) .build()?; - let metadata = Arc::new(row_group_metadata); - self.row_group_metadata = Some(metadata.clone()); + self.row_group_metadata = Some(Arc::new(row_group_metadata.clone())); if let Some(on_close) = self.on_close.take() { on_close( - metadata, + self.buf, + row_group_metadata, self.bloom_filters, self.column_indexes, self.offset_indexes, @@ -805,7 +829,7 @@ mod tests { use crate::column::page::{Page, PageReader}; use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; - use crate::data_type::{BoolType, Int32Type}; + use crate::data_type::{BoolType, ByteArrayType, Int32Type}; use crate::file::page_index::index::Index; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; @@ -818,6 +842,7 @@ mod tests { use crate::record::{Row, RowAccessor}; use crate::schema::parser::parse_message_type; use crate::schema::types::{ColumnDescriptor, ColumnPath}; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_row_group_writer_error_not_all_columns_written() { @@ -1447,7 +1472,7 @@ mod tests { assert_eq!(flushed.len(), idx + 1); assert_eq!(Some(idx as i16), 
last_group.ordinal()); assert_eq!(Some(row_group_file_offset as i64), last_group.file_offset()); - assert_eq!(flushed[idx].as_ref(), last_group.as_ref()); + assert_eq!(&flushed[idx], last_group.as_ref()); } let file_metadata = file_writer.close().unwrap(); @@ -1829,4 +1854,83 @@ mod tests { let b_idx = &column_index[0][1]; assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); } + + #[test] + fn test_byte_array_size_statistics() { + let message_type = " + message test_schema { + OPTIONAL BYTE_ARRAY a (UTF8); + } + "; + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let data = ByteArrayType::gen_vec(32, 7); + let def_levels = [1, 1, 1, 1, 0, 1, 0, 1, 0, 1]; + let unenc_size: i64 = data.iter().map(|x| x.len() as i64).sum(); + let file: File = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(), + ); + + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), None) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let meta_data = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap(); + assert!(meta_data.size_statistics.is_some()); + let size_stats = meta_data.size_statistics.as_ref().unwrap(); + + assert!(size_stats.repetition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_none()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); + assert_eq!( + unenc_size, + size_stats.unencoded_byte_array_data_bytes.unwrap() + ); + + // check that the read metadata is also correct + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + let rfile_metadata = reader.metadata().file_metadata(); + assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(reader.num_row_groups(), 1); + let rowgroup = reader.get_row_group(0).unwrap(); + assert_eq!(rowgroup.num_columns(), 1); + let column = rowgroup.metadata().column(0); + assert!(column.unencoded_byte_array_data_bytes().is_some()); + assert_eq!( + unenc_size, + column.unencoded_byte_array_data_bytes().unwrap() + ); + + assert!(reader.metadata().offset_index().is_some()); + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); + assert_eq!(offset_index[0].len(), 1); + assert!(offset_index[0][0].unencoded_byte_array_data_bytes.is_some()); + let page_sizes = offset_index[0][0] + .unencoded_byte_array_data_bytes + .as_ref() + .unwrap(); + assert_eq!(page_sizes.len(), 1); + assert_eq!(page_sizes[0], unenc_size); + } } diff --git a/parquet/src/format.rs b/parquet/src/format.rs index ae68865be694..287d08b7a95c 100644 --- a/parquet/src/format.rs +++ b/parquet/src/format.rs @@ -117,12 +117,12 @@ impl ConvertedType { /// a list is converted into an optional field containing a repeated field for its /// values pub const LIST: ConvertedType = ConvertedType(3); - /// an enum is converted into 
a binary field + /// an enum is converted into a BYTE_ARRAY field pub const ENUM: ConvertedType = ConvertedType(4); /// A decimal value. /// - /// This may be used to annotate binary or fixed primitive types. The - /// underlying byte array stores the unscaled value encoded as two's + /// This may be used to annotate BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY primitive + /// types. The underlying byte array stores the unscaled value encoded as two's /// complement using big-endian byte order (the most significant byte is the /// zeroth element). The value of the decimal is the value * 10^{-scale}. /// @@ -185,7 +185,7 @@ impl ConvertedType { pub const JSON: ConvertedType = ConvertedType(19); /// An embedded BSON document /// - /// A BSON document embedded within a single BINARY column. + /// A BSON document embedded within a single BYTE_ARRAY column. pub const BSON: ConvertedType = ConvertedType(20); /// An interval of time /// @@ -288,9 +288,9 @@ impl From<&ConvertedType> for i32 { pub struct FieldRepetitionType(pub i32); impl FieldRepetitionType { - /// This field is required (can not be null) and each record has exactly 1 value. + /// This field is required (can not be null) and each row has exactly 1 value. pub const REQUIRED: FieldRepetitionType = FieldRepetitionType(0); - /// The field is optional (can be null) and each record has 0 or 1 values. + /// The field is optional (can be null) and each row has 0 or 1 values. pub const OPTIONAL: FieldRepetitionType = FieldRepetitionType(1); /// The field is repeated and can contain 0 or more values pub const REPEATED: FieldRepetitionType = FieldRepetitionType(2); @@ -379,12 +379,15 @@ impl Encoding { pub const DELTA_BYTE_ARRAY: Encoding = Encoding(7); /// Dictionary encoding: the ids are encoded using the RLE encoding pub const RLE_DICTIONARY: Encoding = Encoding(8); - /// Encoding for floating-point data. + /// Encoding for fixed-width data (FLOAT, DOUBLE, INT32, INT64, FIXED_LEN_BYTE_ARRAY). /// K byte-streams are created where K is the size in bytes of the data type. - /// The individual bytes of an FP value are scattered to the corresponding stream and + /// The individual bytes of a value are scattered to the corresponding stream and /// the streams are concatenated. /// This itself does not reduce the size of the data but can lead to better compression /// afterwards. + /// + /// Added in 2.8 for FLOAT and DOUBLE. + /// Support for INT32, INT64 and FIXED_LEN_BYTE_ARRAY added in 2.11. pub const BYTE_STREAM_SPLIT: Encoding = Encoding(9); pub const ENUM_VALUES: &'static [Self] = &[ Self::PLAIN, @@ -634,6 +637,143 @@ impl From<&BoundaryOrder> for i32 { } } +// +// SizeStatistics +// + +/// A structure for capturing metadata for estimating the unencoded, +/// uncompressed size of data written. This is useful for readers to estimate +/// how much memory is needed to reconstruct data in their memory model and for +/// fine grained filter pushdown on nested structures (the histograms contained +/// in this structure can help determine the number of nulls at a particular +/// nesting level and maximum length of lists). +#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct SizeStatistics { + /// The number of physical bytes stored for BYTE_ARRAY data values assuming + /// no encoding. This is exclusive of the bytes needed to store the length of + /// each byte array. In other words, this field is equivalent to the `(size + /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values + /// written)`. 
To determine unencoded sizes of other types readers can use + /// schema information multiplied by the number of non-null and null values. + /// The number of null/non-null values can be inferred from the histograms + /// below. + /// + /// For example, if a column chunk is dictionary-encoded with dictionary + /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], + /// then this value for that data page should be 7 (1 + 1 + 2 + 3). + /// + /// This field should only be set for types that use BYTE_ARRAY as their + /// physical type. + pub unencoded_byte_array_data_bytes: Option, + /// When present, there is expected to be one element corresponding to each + /// repetition (i.e. size=max repetition_level+1) where each element + /// represents the number of times the repetition level was observed in the + /// data. + /// + /// This field may be omitted if max_repetition_level is 0 without loss + /// of information. + /// + pub repetition_level_histogram: Option>, + /// Same as repetition_level_histogram except for definition levels. + /// + /// This field may be omitted if max_definition_level is 0 or 1 without + /// loss of information. + /// + pub definition_level_histogram: Option>, +} + +impl SizeStatistics { + pub fn new(unencoded_byte_array_data_bytes: F1, repetition_level_histogram: F2, definition_level_histogram: F3) -> SizeStatistics where F1: Into>, F2: Into>>, F3: Into>> { + SizeStatistics { + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), + repetition_level_histogram: repetition_level_histogram.into(), + definition_level_histogram: definition_level_histogram.into(), + } + } +} + +impl crate::thrift::TSerializable for SizeStatistics { + fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { + i_prot.read_struct_begin()?; + let mut f_1: Option = None; + let mut f_2: Option> = None; + let mut f_3: Option> = None; + loop { + let field_ident = i_prot.read_field_begin()?; + if field_ident.field_type == TType::Stop { + break; + } + let field_id = field_id(&field_ident)?; + match field_id { + 1 => { + let val = i_prot.read_i64()?; + f_1 = Some(val); + }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_0 = i_prot.read_i64()?; + val.push(list_elem_0); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, + 3 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_1 = i_prot.read_i64()?; + val.push(list_elem_1); + } + i_prot.read_list_end()?; + f_3 = Some(val); + }, + _ => { + i_prot.skip(field_ident.field_type)?; + }, + }; + i_prot.read_field_end()?; + } + i_prot.read_struct_end()?; + let ret = SizeStatistics { + unencoded_byte_array_data_bytes: f_1, + repetition_level_histogram: f_2, + definition_level_histogram: f_3, + }; + Ok(ret) + } + fn write_to_out_protocol(&self, o_prot: &mut T) -> thrift::Result<()> { + let struct_ident = TStructIdentifier::new("SizeStatistics"); + o_prot.write_struct_begin(&struct_ident)?; + if let Some(fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::I64, 1))?; + o_prot.write_i64(fld_var)?; + o_prot.write_field_end()? 
+ } + if let Some(ref fld_var) = self.repetition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histogram", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histogram { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histogram", TType::List, 3))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + o_prot.write_field_stop()?; + o_prot.write_struct_end() + } +} + // // Statistics // @@ -1123,7 +1263,7 @@ impl crate::thrift::TSerializable for NullType { /// To maintain forward-compatibility in v1, implementations using this logical /// type must also set scale and precision on the annotated SchemaElement. /// -/// Allowed for physical types: INT32, INT64, FIXED, and BINARY +/// Allowed for physical types: INT32, INT64, FIXED_LEN_BYTE_ARRAY, and BYTE_ARRAY. #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DecimalType { pub scale: i32, @@ -1620,7 +1760,7 @@ impl crate::thrift::TSerializable for IntType { /// Embedded JSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct JsonType { } @@ -1660,7 +1800,7 @@ impl crate::thrift::TSerializable for JsonType { /// Embedded BSON logical type annotation /// -/// Allowed for physical types: BINARY +/// Allowed for physical types: BYTE_ARRAY #[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct BsonType { } @@ -2150,7 +2290,12 @@ impl crate::thrift::TSerializable for SchemaElement { /// Data page header #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct DataPageHeader { - /// Number of values, including NULLs, in this data page. * + /// Number of values, including NULLs, in this data page. + /// + /// If a OffsetIndex is present, a page must begin at a row + /// boundary (repetition_level = 0). Otherwise, pages may begin + /// within a row (repetition_level > 0). + /// pub num_values: i32, /// Encoding used for this data page * pub encoding: Encoding, @@ -2158,7 +2303,7 @@ pub struct DataPageHeader { pub definition_level_encoding: Encoding, /// Encoding used for repetition levels * pub repetition_level_encoding: Encoding, - /// Optional statistics for the data in this page* + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -2394,21 +2539,24 @@ pub struct DataPageHeaderV2 { /// Number of NULL values, in this data page. /// Number of non-null = num_values - num_nulls which is also the number of values in the data section * pub num_nulls: i32, - /// Number of rows in this data page. which means pages change on record boundaries (r = 0) * + /// Number of rows in this data page. Every page must begin at a + /// row boundary (repetition_level = 0): rows must **not** be + /// split across page boundaries when using V2 data pages. 
+ /// pub num_rows: i32, /// Encoding used for data in this page * pub encoding: Encoding, - /// length of the definition levels + /// Length of the definition levels pub definition_levels_byte_length: i32, - /// length of the repetition levels + /// Length of the repetition levels pub repetition_levels_byte_length: i32, - /// whether the values are compressed. + /// Whether the values are compressed. /// Which means the section of the page between /// definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) /// is compressed with the compression_codec. /// If missing it is considered compressed pub is_compressed: Option, - /// optional statistics for the data in this page * + /// Optional statistics for the data in this page * pub statistics: Option, } @@ -3211,10 +3359,10 @@ impl crate::thrift::TSerializable for KeyValue { // SortingColumn // -/// Wrapper struct to specify sort order +/// Sort order within a RowGroup of a leaf column #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct SortingColumn { - /// The column index (in this row group) * + /// The ordinal position of the column (in this row group) * pub column_idx: i32, /// If true, indicates this column is sorted in descending order. * pub descending: bool, @@ -3421,10 +3569,15 @@ pub struct ColumnMetaData { /// Writers should write this field so readers can read the bloom filter /// in a single I/O. pub bloom_filter_length: Option, + /// Optional statistics to help estimate total memory when converted to in-memory + /// representations. The histograms contained in these statistics can + /// also be useful in some cases for more fine-grained nullability/list length + /// filter pushdown. + pub size_statistics: Option, } impl ColumnMetaData { - pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into> { + pub fn new(type_: Type, encodings: Vec, path_in_schema: Vec, codec: CompressionCodec, num_values: i64, total_uncompressed_size: i64, total_compressed_size: i64, key_value_metadata: F8, data_page_offset: i64, index_page_offset: F10, dictionary_page_offset: F11, statistics: F12, encoding_stats: F13, bloom_filter_offset: F14, bloom_filter_length: F15, size_statistics: F16) -> ColumnMetaData where F8: Into>>, F10: Into>, F11: Into>, F12: Into>, F13: Into>>, F14: Into>, F15: Into>, F16: Into> { ColumnMetaData { type_, encodings, @@ -3441,6 +3594,7 @@ impl ColumnMetaData { encoding_stats: encoding_stats.into(), bloom_filter_offset: bloom_filter_offset.into(), bloom_filter_length: bloom_filter_length.into(), + size_statistics: size_statistics.into(), } } } @@ -3463,6 +3617,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { let mut f_13: Option> = None; let mut f_14: Option = None; let mut f_15: Option = None; + let mut f_16: Option = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -3478,8 +3633,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_0 = 
Encoding::read_from_in_protocol(i_prot)?; - val.push(list_elem_0); + let list_elem_2 = Encoding::read_from_in_protocol(i_prot)?; + val.push(list_elem_2); } i_prot.read_list_end()?; f_2 = Some(val); @@ -3488,8 +3643,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_1 = i_prot.read_string()?; - val.push(list_elem_1); + let list_elem_3 = i_prot.read_string()?; + val.push(list_elem_3); } i_prot.read_list_end()?; f_3 = Some(val); @@ -3514,8 +3669,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_2 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_2); + let list_elem_4 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_4); } i_prot.read_list_end()?; f_8 = Some(val); @@ -3540,8 +3695,8 @@ impl crate::thrift::TSerializable for ColumnMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_3 = PageEncodingStats::read_from_in_protocol(i_prot)?; - val.push(list_elem_3); + let list_elem_5 = PageEncodingStats::read_from_in_protocol(i_prot)?; + val.push(list_elem_5); } i_prot.read_list_end()?; f_13 = Some(val); @@ -3554,6 +3709,10 @@ impl crate::thrift::TSerializable for ColumnMetaData { let val = i_prot.read_i32()?; f_15 = Some(val); }, + 16 => { + let val = SizeStatistics::read_from_in_protocol(i_prot)?; + f_16 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -3585,6 +3744,7 @@ impl crate::thrift::TSerializable for ColumnMetaData { encoding_stats: f_13, bloom_filter_offset: f_14, bloom_filter_length: f_15, + size_statistics: f_16, }; Ok(ret) } @@ -3666,6 +3826,11 @@ impl crate::thrift::TSerializable for ColumnMetaData { o_prot.write_i32(fld_var)?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.size_statistics { + o_prot.write_field_begin(&TFieldIdentifier::new("size_statistics", TType::Struct, 16))?; + fld_var.write_to_out_protocol(o_prot)?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -3745,8 +3910,8 @@ impl crate::thrift::TSerializable for EncryptionWithColumnKey { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_4 = i_prot.read_string()?; - val.push(list_elem_4); + let list_elem_6 = i_prot.read_string()?; + val.push(list_elem_6); } i_prot.read_list_end()?; f_1 = Some(val); @@ -3885,11 +4050,19 @@ pub struct ColumnChunk { /// metadata. This path is relative to the current file. /// pub file_path: Option, - /// Byte offset in file_path to the ColumnMetaData * + /// Deprecated: Byte offset in file_path to the ColumnMetaData + /// + /// Past use of this field has been inconsistent, with some implementations + /// using it to point to the ColumnMetaData and some using it to point to + /// the first page in the column chunk. In many cases, the ColumnMetaData at this + /// location is wrong. This field is now deprecated and should not be used. + /// Writers should set this field to 0 if no ColumnMetaData has been written outside + /// the footer. pub file_offset: i64, - /// Column metadata for this chunk. 
This is the same content as what is at - /// file_path/file_offset. Having it here has it replicated in the file - /// metadata. + /// Column metadata for this chunk. Some writers may also replicate this at the + /// location pointed to by file_path/file_offset. + /// Note: while marked as optional, this field is in fact required by most major + /// Parquet implementations. As such, writers MUST populate this field. /// pub meta_data: Option, /// File offset of ColumnChunk's OffsetIndex * @@ -4111,8 +4284,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_5 = ColumnChunk::read_from_in_protocol(i_prot)?; - val.push(list_elem_5); + let list_elem_7 = ColumnChunk::read_from_in_protocol(i_prot)?; + val.push(list_elem_7); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4129,8 +4302,8 @@ impl crate::thrift::TSerializable for RowGroup { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_6 = SortingColumn::read_from_in_protocol(i_prot)?; - val.push(list_elem_6); + let list_elem_8 = SortingColumn::read_from_in_protocol(i_prot)?; + val.push(list_elem_8); } i_prot.read_list_end()?; f_4 = Some(val); @@ -4335,8 +4508,9 @@ pub struct PageLocation { /// Size of the page, including header. Sum of compressed_page_size and header /// length pub compressed_page_size: i32, - /// Index within the RowGroup of the first row of the page; this means pages - /// change on record boundaries (r = 0). + /// Index within the RowGroup of the first row of the page. When an + /// OffsetIndex is present, pages must begin on row boundaries + /// (repetition_level = 0). pub first_row_index: i64, } @@ -4413,17 +4587,28 @@ impl crate::thrift::TSerializable for PageLocation { // OffsetIndex // +/// Optional offsets for each data page in a ColumnChunk. +/// +/// Forms part of the page index, along with ColumnIndex. +/// +/// OffsetIndex may be present even if ColumnIndex is not. #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct OffsetIndex { /// PageLocations, ordered by increasing PageLocation.offset. It is required /// that page_locations\[i\].first_row_index < page_locations\[i+1\].first_row_index. pub page_locations: Vec, + /// Unencoded/uncompressed size for BYTE_ARRAY types. + /// + /// See documention for unencoded_byte_array_data_bytes in SizeStatistics for + /// more details on this field. 
+ pub unencoded_byte_array_data_bytes: Option>, } impl OffsetIndex { - pub fn new(page_locations: Vec) -> OffsetIndex { + pub fn new(page_locations: Vec, unencoded_byte_array_data_bytes: F2) -> OffsetIndex where F2: Into>> { OffsetIndex { page_locations, + unencoded_byte_array_data_bytes: unencoded_byte_array_data_bytes.into(), } } } @@ -4432,6 +4617,7 @@ impl crate::thrift::TSerializable for OffsetIndex { fn read_from_in_protocol(i_prot: &mut T) -> thrift::Result { i_prot.read_struct_begin()?; let mut f_1: Option> = None; + let mut f_2: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4443,12 +4629,22 @@ impl crate::thrift::TSerializable for OffsetIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_7 = PageLocation::read_from_in_protocol(i_prot)?; - val.push(list_elem_7); + let list_elem_9 = PageLocation::read_from_in_protocol(i_prot)?; + val.push(list_elem_9); } i_prot.read_list_end()?; f_1 = Some(val); }, + 2 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_10 = i_prot.read_i64()?; + val.push(list_elem_10); + } + i_prot.read_list_end()?; + f_2 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4459,6 +4655,7 @@ impl crate::thrift::TSerializable for OffsetIndex { verify_required_field_exists("OffsetIndex.page_locations", &f_1)?; let ret = OffsetIndex { page_locations: f_1.expect("auto-generated code should have checked for presence of required fields"), + unencoded_byte_array_data_bytes: f_2, }; Ok(ret) } @@ -4472,6 +4669,15 @@ impl crate::thrift::TSerializable for OffsetIndex { } o_prot.write_list_end()?; o_prot.write_field_end()?; + if let Some(ref fld_var) = self.unencoded_byte_array_data_bytes { + o_prot.write_field_begin(&TFieldIdentifier::new("unencoded_byte_array_data_bytes", TType::List, 2))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4481,8 +4687,14 @@ impl crate::thrift::TSerializable for OffsetIndex { // ColumnIndex // -/// Description for ColumnIndex. -/// Each ``\[i\] refers to the page at OffsetIndex.page_locations\[i\] +/// Optional statistics for each data page in a ColumnChunk. +/// +/// Forms part the page index, along with OffsetIndex. +/// +/// If this structure is present, OffsetIndex must also be present. +/// +/// For each field in this structure, ``\[i\] refers to the page at +/// OffsetIndex.page_locations\[i\] #[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] pub struct ColumnIndex { /// A list of Boolean values to determine the validity of the corresponding @@ -4508,16 +4720,33 @@ pub struct ColumnIndex { pub boundary_order: BoundaryOrder, /// A list containing the number of null values for each page * pub null_counts: Option>, + /// Contains repetition level histograms for each page + /// concatenated together. The repetition_level_histogram field on + /// SizeStatistics contains more details. + /// + /// When present the length should always be (number of pages * + /// (max_repetition_level + 1)) elements. + /// + /// Element 0 is the first element of the histogram for the first page. 
+ /// Element (max_repetition_level + 1) is the first element of the histogram + /// for the second page. + /// + pub repetition_level_histograms: Option>, + /// Same as repetition_level_histograms except for definitions levels. + /// + pub definition_level_histograms: Option>, } impl ColumnIndex { - pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5) -> ColumnIndex where F5: Into>> { + pub fn new(null_pages: Vec, min_values: Vec>, max_values: Vec>, boundary_order: BoundaryOrder, null_counts: F5, repetition_level_histograms: F6, definition_level_histograms: F7) -> ColumnIndex where F5: Into>>, F6: Into>>, F7: Into>> { ColumnIndex { null_pages, min_values, max_values, boundary_order, null_counts: null_counts.into(), + repetition_level_histograms: repetition_level_histograms.into(), + definition_level_histograms: definition_level_histograms.into(), } } } @@ -4530,6 +4759,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let mut f_3: Option>> = None; let mut f_4: Option = None; let mut f_5: Option> = None; + let mut f_6: Option> = None; + let mut f_7: Option> = None; loop { let field_ident = i_prot.read_field_begin()?; if field_ident.field_type == TType::Stop { @@ -4541,8 +4772,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_8 = i_prot.read_bool()?; - val.push(list_elem_8); + let list_elem_11 = i_prot.read_bool()?; + val.push(list_elem_11); } i_prot.read_list_end()?; f_1 = Some(val); @@ -4551,8 +4782,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_9 = i_prot.read_bytes()?; - val.push(list_elem_9); + let list_elem_12 = i_prot.read_bytes()?; + val.push(list_elem_12); } i_prot.read_list_end()?; f_2 = Some(val); @@ -4561,8 +4792,8 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec> = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_10 = i_prot.read_bytes()?; - val.push(list_elem_10); + let list_elem_13 = i_prot.read_bytes()?; + val.push(list_elem_13); } i_prot.read_list_end()?; f_3 = Some(val); @@ -4575,12 +4806,32 @@ impl crate::thrift::TSerializable for ColumnIndex { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_11 = i_prot.read_i64()?; - val.push(list_elem_11); + let list_elem_14 = i_prot.read_i64()?; + val.push(list_elem_14); } i_prot.read_list_end()?; f_5 = Some(val); }, + 6 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_15 = i_prot.read_i64()?; + val.push(list_elem_15); + } + i_prot.read_list_end()?; + f_6 = Some(val); + }, + 7 => { + let list_ident = i_prot.read_list_begin()?; + let mut val: Vec = Vec::with_capacity(list_ident.size as usize); + for _ in 0..list_ident.size { + let list_elem_16 = i_prot.read_i64()?; + val.push(list_elem_16); + } + i_prot.read_list_end()?; + f_7 = Some(val); + }, _ => { i_prot.skip(field_ident.field_type)?; }, @@ -4598,6 +4849,8 @@ impl crate::thrift::TSerializable for ColumnIndex { max_values: f_3.expect("auto-generated code should have 
checked for presence of required fields"), boundary_order: f_4.expect("auto-generated code should have checked for presence of required fields"), null_counts: f_5, + repetition_level_histograms: f_6, + definition_level_histograms: f_7, }; Ok(ret) } @@ -4637,6 +4890,24 @@ impl crate::thrift::TSerializable for ColumnIndex { o_prot.write_list_end()?; o_prot.write_field_end()? } + if let Some(ref fld_var) = self.repetition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("repetition_level_histograms", TType::List, 6))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } + if let Some(ref fld_var) = self.definition_level_histograms { + o_prot.write_field_begin(&TFieldIdentifier::new("definition_level_histograms", TType::List, 7))?; + o_prot.write_list_begin(&TListIdentifier::new(TType::I64, fld_var.len() as i32))?; + for e in fld_var { + o_prot.write_i64(*e)?; + } + o_prot.write_list_end()?; + o_prot.write_field_end()? + } o_prot.write_field_stop()?; o_prot.write_struct_end() } @@ -4996,8 +5267,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_12 = SchemaElement::read_from_in_protocol(i_prot)?; - val.push(list_elem_12); + let list_elem_17 = SchemaElement::read_from_in_protocol(i_prot)?; + val.push(list_elem_17); } i_prot.read_list_end()?; f_2 = Some(val); @@ -5010,8 +5281,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_13 = RowGroup::read_from_in_protocol(i_prot)?; - val.push(list_elem_13); + let list_elem_18 = RowGroup::read_from_in_protocol(i_prot)?; + val.push(list_elem_18); } i_prot.read_list_end()?; f_4 = Some(val); @@ -5020,8 +5291,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_14 = KeyValue::read_from_in_protocol(i_prot)?; - val.push(list_elem_14); + let list_elem_19 = KeyValue::read_from_in_protocol(i_prot)?; + val.push(list_elem_19); } i_prot.read_list_end()?; f_5 = Some(val); @@ -5034,8 +5305,8 @@ impl crate::thrift::TSerializable for FileMetaData { let list_ident = i_prot.read_list_begin()?; let mut val: Vec = Vec::with_capacity(list_ident.size as usize); for _ in 0..list_ident.size { - let list_elem_15 = ColumnOrder::read_from_in_protocol(i_prot)?; - val.push(list_elem_15); + let list_elem_20 = ColumnOrder::read_from_in_protocol(i_prot)?; + val.push(list_elem_20); } i_prot.read_list_end()?; f_7 = Some(val); diff --git a/parquet/tests/arrow_writer_layout.rs b/parquet/tests/arrow_writer_layout.rs index cd124031cfdc..3e0f6ce3a8b3 100644 --- a/parquet/tests/arrow_writer_layout.rs +++ b/parquet/tests/arrow_writer_layout.rs @@ -89,12 +89,15 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { for (column_index, column_layout) in offset_index.iter().zip(&row_group_layout.columns) { assert_eq!( - column_index.len(), + column_index.page_locations.len(), column_layout.pages.len(), "index page count mismatch" ); - for (idx, (page, page_layout)) in - column_index.iter().zip(&column_layout.pages).enumerate() + for (idx, 
(page, page_layout)) in column_index + .page_locations + .iter() + .zip(&column_layout.pages) + .enumerate() { assert_eq!( page.compressed_page_size as usize, @@ -102,6 +105,7 @@ fn assert_layout(file_reader: &Bytes, meta: &ParquetMetaData, layout: &Layout) { "index page {idx} size mismatch" ); let next_first_row_index = column_index + .page_locations .get(idx + 1) .map(|x| x.first_row_index) .unwrap_or_else(|| row_group.num_rows()); From b06ffceaab2b59edc098d86f75b2a5125a8352ee Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 26 Jul 2024 10:37:30 -0700 Subject: [PATCH 02/17] Add support for level histograms added in PARQUET-2261 to `ParquetMetaData` (#6105) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` (#6041) * bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` Signed-off-by: Bugen Zhao * fix example tests Signed-off-by: Bugen Zhao --------- Signed-off-by: Bugen Zhao * Remove `impl> From for Buffer` that easily accidentally copies data (#6043) * deprecate auto copy, ask explicit reference * update comments * make cargo doc happy * Make display of interval types more pretty (#6006) * improve dispaly for interval. * update test in pretty, and fix display problem. * tmp * fix tests in arrow-cast. * fix tests in pretty. * fix style. * Update snafu (#5930) * Update Parquet thrift generated structures (#6045) * update to latest thrift (as of 11 Jul 2024) from parquet-format * pass None for optional size statistics * escape HTML tags * don't need to escape brackets in arrays * Revert "Revert "Write Bloom filters between row groups instead of the end (#…" (#5933) This reverts commit 22e0b4432c9838f2536284015271d3de9a165135. * Revert "Update snafu (#5930)" (#6069) This reverts commit 756b1fb26d1702f36f446faf9bb40a4869c3e840. * Update pyo3 requirement from 0.21.1 to 0.22.1 (fixed) (#6075) * Update pyo3 requirement from 0.21.1 to 0.22.1 Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.1) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] * refactor: remove deprecated `FromPyArrow::from_pyarrow` "GIL Refs" are being phased out. * chore: update `pyo3` in integration tests --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * remove repeated codes to make the codes more concise. 
(#6080) * Add `unencoded_byte_array_data_bytes` to `ParquetMetaData` (#6068) * update to latest thrift (as of 11 Jul 2024) from parquet-format * pass None for optional size statistics * escape HTML tags * don't need to escape brackets in arrays * add support for unencoded_byte_array_data_bytes * add comments * change sig of ColumnMetrics::update_variable_length_bytes() * rename ParquetOffsetIndex to OffsetSizeIndex * rename some functions * suggestion from review Co-authored-by: Andrew Lamb * add Default trait to ColumnMetrics as suggested in review * rename OffsetSizeIndex to OffsetIndexMetaData --------- Co-authored-by: Andrew Lamb * deprecate read_page_locations * add level histograms to metadata * add to_thrift() to OffsetIndexMetaData * Update pyo3 requirement from 0.21.1 to 0.22.2 (#6085) Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/v0.22.2/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.2) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * Deprecate read_page_locations() and simplify offset index in `ParquetMetaData` (#6095) * deprecate read_page_locations * add to_thrift() to OffsetIndexMetaData * move valid test into ColumnIndexBuilder::append_histograms * move update_histogram() inside ColumnMetrics * Update parquet/src/column/writer/mod.rs Co-authored-by: Ed Seidl * Implement LevelHistograms as a struct * formatting * fix error in docs --------- Signed-off-by: Bugen Zhao Signed-off-by: dependabot[bot] Co-authored-by: Bugen Zhao Co-authored-by: Xiangpeng Hao Co-authored-by: kamille Co-authored-by: Jesse Co-authored-by: Andrew Lamb Co-authored-by: Marco Neumann Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- parquet/src/column/writer/mod.rs | 137 +++++++++++++++-- parquet/src/file/metadata/memory.rs | 2 + parquet/src/file/metadata/mod.rs | 217 +++++++++++++++++++++++++-- parquet/src/file/page_index/index.rs | 87 +++++++++-- parquet/src/file/writer.rs | 144 +++++++++++++++++- 5 files changed, 550 insertions(+), 37 deletions(-) diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 2c0c957d87d3..54d8fd3cc13e 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -33,7 +33,7 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::levels::LevelEncoder; use crate::errors::{ParquetError, Result}; -use crate::file::metadata::{ColumnIndexBuilder, OffsetIndexBuilder}; +use crate::file::metadata::{ColumnIndexBuilder, LevelHistogram, OffsetIndexBuilder}; use crate::file::properties::EnabledStatistics; use crate::file::statistics::{Statistics, ValueStatistics}; use crate::file::{ @@ -189,6 +189,54 @@ struct PageMetrics { num_buffered_values: u32, num_buffered_rows: u32, num_page_nulls: u64, + repetition_level_histogram: Option, + definition_level_histogram: Option, +} + +impl PageMetrics { + fn new() -> Self { + Default::default() + } + + /// Initialize the repetition level histogram + fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { + self.repetition_level_histogram = LevelHistogram::try_new(max_level); + self + } + + /// Initialize the definition level histogram + fn 
with_definition_level_histogram(mut self, max_level: i16) -> Self { + self.definition_level_histogram = LevelHistogram::try_new(max_level); + self + } + + /// Resets the state of this `PageMetrics` to the initial state. + /// If histograms have been initialized their contents will be reset to zero. + fn new_page(&mut self) { + self.num_buffered_values = 0; + self.num_buffered_rows = 0; + self.num_page_nulls = 0; + self.repetition_level_histogram + .as_mut() + .map(LevelHistogram::reset); + self.definition_level_histogram + .as_mut() + .map(LevelHistogram::reset); + } + + /// Updates histogram values using provided repetition levels + fn update_repetition_level_histogram(&mut self, levels: &[i16]) { + if let Some(ref mut rep_hist) = self.repetition_level_histogram { + rep_hist.update_from_levels(levels); + } + } + + /// Updates histogram values using provided definition levels + fn update_definition_level_histogram(&mut self, levels: &[i16]) { + if let Some(ref mut def_hist) = self.definition_level_histogram { + def_hist.update_from_levels(levels); + } + } } // Metrics per column writer @@ -206,6 +254,8 @@ struct ColumnMetrics { num_column_nulls: u64, column_distinct_count: Option, variable_length_bytes: Option, + repetition_level_histogram: Option, + definition_level_histogram: Option, } impl ColumnMetrics { @@ -213,6 +263,41 @@ impl ColumnMetrics { Default::default() } + /// Initialize the repetition level histogram + fn with_repetition_level_histogram(mut self, max_level: i16) -> Self { + self.repetition_level_histogram = LevelHistogram::try_new(max_level); + self + } + + /// Initialize the definition level histogram + fn with_definition_level_histogram(mut self, max_level: i16) -> Self { + self.definition_level_histogram = LevelHistogram::try_new(max_level); + self + } + + /// Sum `page_histogram` into `chunk_histogram` + fn update_histogram( + chunk_histogram: &mut Option, + page_histogram: &Option, + ) { + if let (Some(page_hist), Some(chunk_hist)) = (page_histogram, chunk_histogram) { + chunk_hist.add(page_hist); + } + } + + /// Sum the provided PageMetrics histograms into the chunk histograms. Does nothing if + /// page histograms are not initialized. + fn update_from_page_metrics(&mut self, page_metrics: &PageMetrics) { + ColumnMetrics::::update_histogram( + &mut self.definition_level_histogram, + &page_metrics.definition_level_histogram, + ); + ColumnMetrics::::update_histogram( + &mut self.repetition_level_histogram, + &page_metrics.repetition_level_histogram, + ); + } + /// Sum the provided page variable_length_bytes into the chunk variable_length_bytes fn update_variable_length_bytes(&mut self, variable_length_bytes: Option) { if let Some(var_bytes) = variable_length_bytes { @@ -275,6 +360,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Used for level information encodings.insert(Encoding::RLE); + let mut page_metrics = PageMetrics::new(); + let mut column_metrics = ColumnMetrics::::new(); + + // Initialize level histograms if collecting page or chunk statistics + if statistics_enabled != EnabledStatistics::None { + page_metrics = page_metrics + .with_repetition_level_histogram(descr.max_rep_level()) + .with_definition_level_histogram(descr.max_def_level()); + column_metrics = column_metrics + .with_repetition_level_histogram(descr.max_rep_level()) + .with_definition_level_histogram(descr.max_def_level()) + } + // Disable column_index_builder if not collecting page statistics. 
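Editor's sketch, not part of the applied diff: the with_*_histogram helpers above only keep a histogram when LevelHistogram::try_new (added to parquet/src/file/metadata/mod.rs later in this patch) returns Some, i.e. when the corresponding max level is non-zero, so flat REQUIRED columns never pay for level histograms. A minimal illustration against that public API:

    use parquet::file::metadata::LevelHistogram;

    fn main() {
        // max level 0 (required, non-nested column): no histogram is kept.
        assert!(LevelHistogram::try_new(0).is_none());
        // max level 2: one bucket per level 0..=2.
        assert_eq!(LevelHistogram::try_new(2).unwrap().len(), 3);
    }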
let mut column_index_builder = ColumnIndexBuilder::new(); if statistics_enabled != EnabledStatistics::Page { @@ -292,12 +390,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { def_levels_sink: vec![], rep_levels_sink: vec![], data_pages: VecDeque::new(), - page_metrics: PageMetrics { - num_buffered_values: 0, - num_buffered_rows: 0, - num_page_nulls: 0, - }, - column_metrics: ColumnMetrics::::new(), + page_metrics, + column_metrics, column_index_builder, offset_index_builder: OffsetIndexBuilder::new(), encodings, @@ -547,6 +641,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } + // Update histogram + self.page_metrics.update_definition_level_histogram(levels); + self.def_levels_sink.extend_from_slice(levels); values_to_write } else { @@ -575,6 +672,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { self.page_metrics.num_buffered_rows += (level == 0) as u32 } + // Update histogram + self.page_metrics.update_repetition_level_histogram(levels); + self.rep_levels_sink.extend_from_slice(levels); } else { // Each value is exactly one row. @@ -718,7 +818,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } } } - // update the offset index + + // Append page histograms to the `ColumnIndex` histograms + self.column_index_builder.append_histograms( + &self.page_metrics.repetition_level_histogram, + &self.page_metrics.definition_level_histogram, + ); + + // Update the offset index self.offset_index_builder .append_row_count(self.page_metrics.num_buffered_rows as i64); @@ -804,7 +911,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { values_data.variable_length_bytes, ); - // Update variable_length_bytes in column_metrics + // Update histograms and variable_length_bytes in column_metrics + self.column_metrics + .update_from_page_metrics(&self.page_metrics); self.column_metrics .update_variable_length_bytes(values_data.variable_length_bytes); @@ -911,7 +1020,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Reset state. 
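Editor's sketch, not part of the applied diff: the update_repetition_level_histogram / update_definition_level_histogram calls above are plain counting passes over the level slices handed to write_batch. The equivalent bookkeeping, expressed against the public LevelHistogram type introduced below, looks roughly like this:

    use parquet::file::metadata::LevelHistogram;

    fn main() {
        // Definition levels for five values in an OPTIONAL (max_def_level = 1) column.
        let def_levels: [i16; 5] = [1, 0, 1, 1, 0];
        let mut page_hist = LevelHistogram::try_new(1).expect("max level > 0");
        page_hist.update_from_levels(&def_levels);
        // Two nulls (level 0) and three non-null values (level 1).
        assert_eq!(page_hist.values(), &[2, 3]);
    }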
self.rep_levels_sink.clear(); self.def_levels_sink.clear(); - self.page_metrics = PageMetrics::default(); + self.page_metrics.new_page(); Ok(()) } @@ -1019,7 +1128,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { builder = builder .set_statistics(statistics) - .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes); + .set_unencoded_byte_array_data_bytes(self.column_metrics.variable_length_bytes) + .set_repetition_level_histogram( + self.column_metrics.repetition_level_histogram.take(), + ) + .set_definition_level_histogram( + self.column_metrics.definition_level_histogram.take(), + ); } let metadata = builder.build()?; diff --git a/parquet/src/file/metadata/memory.rs b/parquet/src/file/metadata/memory.rs index 0b6d1f0d1a24..bb822b4ccbe7 100644 --- a/parquet/src/file/metadata/memory.rs +++ b/parquet/src/file/metadata/memory.rs @@ -99,6 +99,8 @@ impl HeapSize for ColumnChunkMetaData { + self.statistics.heap_size() + self.encoding_stats.heap_size() + self.unencoded_byte_array_data_bytes.heap_size() + + self.repetition_level_histogram.heap_size() + + self.definition_level_histogram.heap_size() } } diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 52206e66a590..cd3555de828c 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -557,6 +557,114 @@ pub struct ColumnChunkMetaData { column_index_offset: Option, column_index_length: Option, unencoded_byte_array_data_bytes: Option, + repetition_level_histogram: Option, + definition_level_histogram: Option, +} + +/// Histograms for repetition and definition levels. +/// +/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of +/// values at level `i`. +/// +/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the +/// number of rows with level 1, and so on. +/// +#[derive(Debug, Clone, PartialEq)] +pub struct LevelHistogram { + inner: Vec, +} + +impl LevelHistogram { + /// Creates a new level histogram data. + /// + /// Length will be `max_level + 1`. + /// + /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case) + pub fn try_new(max_level: i16) -> Option { + if max_level > 0 { + Some(Self { + inner: vec![0; max_level as usize + 1], + }) + } else { + None + } + } + /// Returns a reference to the the histogram's values. + pub fn values(&self) -> &[i64] { + &self.inner + } + + /// Return the inner vector, consuming self + pub fn into_inner(self) -> Vec { + self.inner + } + + /// Returns the histogram value at the given index. + /// + /// The value of `i` is the number of values with level `i`. For example, + /// `get(1)` returns the number of values with level 1. + /// + /// Returns `None` if the index is out of bounds. + pub fn get(&self, index: usize) -> Option { + self.inner.get(index).copied() + } + + /// Adds the values from the other histogram to this histogram + /// + /// # Panics + /// If the histograms have different lengths + pub fn add(&mut self, other: &Self) { + assert_eq!(self.len(), other.len()); + for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) { + *dst += src; + } + } + + /// return the length of the histogram + pub fn len(&self) -> usize { + self.inner.len() + } + + /// returns if the histogram is empty + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Sets the values of all histogram levels to 0. 
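Editor's sketch, not part of the applied diff: LevelHistogram::add is what folds per-page histograms into the chunk-level histogram (see ColumnMetrics::update_histogram above), so the chunk values end up as the element-wise sum over the pages. A short example, using the From<Vec<i64>> conversion defined just below:

    use parquet::file::metadata::LevelHistogram;

    fn main() {
        let mut chunk = LevelHistogram::try_new(1).unwrap(); // starts as [0, 0]
        chunk.add(&LevelHistogram::from(vec![2, 3]));        // page 0
        chunk.add(&LevelHistogram::from(vec![0, 5]));        // page 1
        assert_eq!(chunk.values(), &[2, 8]);
        assert_eq!(chunk.get(1), Some(8));
    }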
+ pub fn reset(&mut self) { + for value in self.inner.iter_mut() { + *value = 0; + } + } + + /// Updates histogram values using provided repetition levels + /// + /// # Panics + /// if any of the levels is greater than the length of the histogram ( + /// the argument supplied to [`Self::try_new`]) + pub fn update_from_levels(&mut self, levels: &[i16]) { + for &level in levels { + self.inner[level as usize] += 1; + } + } +} + +impl From> for LevelHistogram { + fn from(inner: Vec) -> Self { + Self { inner } + } +} + +impl From for Vec { + fn from(value: LevelHistogram) -> Self { + value.into_inner() + } +} + +impl HeapSize for LevelHistogram { + fn heap_size(&self) -> usize { + self.inner.heap_size() + } } /// Represents common operations for a column chunk. @@ -717,6 +825,24 @@ impl ColumnChunkMetaData { self.unencoded_byte_array_data_bytes } + /// Returns the repetition level histogram. + /// + /// The returned value `vec[i]` is how many values are at repetition level `i`. For example, + /// `vec[0]` indicates how many rows the page contains. + /// This field may not be set by older writers. + pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { + self.repetition_level_histogram.as_ref() + } + + /// Returns the definition level histogram. + /// + /// The returned value `vec[i]` is how many values are at definition level `i`. For example, + /// `vec[max_definition_level]` indicates how many non-null values are present in the page. + /// This field may not be set by older writers. + pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> { + self.definition_level_histogram.as_ref() + } + /// Method to convert from Thrift. pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result { if cc.meta_data.is_none() { @@ -754,13 +880,23 @@ impl ColumnChunkMetaData { let offset_index_length = cc.offset_index_length; let column_index_offset = cc.column_index_offset; let column_index_length = cc.column_index_length; - let unencoded_byte_array_data_bytes = if let Some(size_stats) = col_metadata.size_statistics - { - size_stats.unencoded_byte_array_data_bytes + let ( + unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, + ) = if let Some(size_stats) = col_metadata.size_statistics { + ( + size_stats.unencoded_byte_array_data_bytes, + size_stats.repetition_level_histogram, + size_stats.definition_level_histogram, + ) } else { - None + (None, None, None) }; + let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from); + let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from); + let result = ColumnChunkMetaData { column_descr, encodings, @@ -782,6 +918,8 @@ impl ColumnChunkMetaData { column_index_offset, column_index_length, unencoded_byte_array_data_bytes, + repetition_level_histogram, + definition_level_histogram, }; Ok(result) } @@ -805,11 +943,24 @@ impl ColumnChunkMetaData { /// Method to convert to Thrift `ColumnMetaData` pub fn to_column_metadata_thrift(&self) -> ColumnMetaData { - let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() { + let size_statistics = if self.unencoded_byte_array_data_bytes.is_some() + || self.repetition_level_histogram.is_some() + || self.definition_level_histogram.is_some() + { + let repetition_level_histogram = self + .repetition_level_histogram + .as_ref() + .map(|hist| hist.clone().into_inner()); + + let definition_level_histogram = self + .definition_level_histogram + .as_ref() + .map(|hist| 
hist.clone().into_inner()); + Some(SizeStatistics { unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes, - repetition_level_histogram: None, - definition_level_histogram: None, + repetition_level_histogram, + definition_level_histogram, }) } else { None @@ -871,6 +1022,8 @@ impl ColumnChunkMetaDataBuilder { column_index_offset: None, column_index_length: None, unencoded_byte_array_data_bytes: None, + repetition_level_histogram: None, + definition_level_histogram: None, }) } @@ -988,6 +1141,18 @@ impl ColumnChunkMetaDataBuilder { self } + /// Sets optional repetition level histogram + pub fn set_repetition_level_histogram(mut self, value: Option) -> Self { + self.0.repetition_level_histogram = value; + self + } + + /// Sets optional repetition level histogram + pub fn set_definition_level_histogram(mut self, value: Option) -> Self { + self.0.definition_level_histogram = value; + self + } + /// Builds column chunk metadata. pub fn build(self) -> Result { Ok(self.0) @@ -1003,6 +1168,10 @@ pub struct ColumnIndexBuilder { max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, + /// contains the concatenation of the histograms of all pages + repetition_level_histograms: Option>, + /// contains the concatenation of the histograms of all pages + definition_level_histograms: Option>, /// Is the information in the builder valid? /// /// Set to `false` if any entry in the page doesn't have statistics for @@ -1027,6 +1196,8 @@ impl ColumnIndexBuilder { max_values: Vec::new(), null_counts: Vec::new(), boundary_order: BoundaryOrder::UNORDERED, + repetition_level_histograms: None, + definition_level_histograms: None, valid: true, } } @@ -1045,6 +1216,28 @@ impl ColumnIndexBuilder { self.null_counts.push(null_count); } + /// Append the given page-level histograms to the [`ColumnIndex`] histograms. + /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state. 
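Editor's sketch, not part of the applied diff: the repetition_level_histograms / definition_level_histograms fields above hold one flat, page-major vector each. With n = max_level + 1 buckets per page, page i occupies the slice [i * n, (i + 1) * n), which is also how the reader side (the to_page_histograms closure added to parquet/src/file/page_index/index.rs later in this patch) slices them back apart. A hypothetical helper showing just the layout (names are not from the patch):

    // Hypothetical helper, for illustration only.
    fn per_page_histograms(concatenated: &[i64], num_levels: usize) -> Vec<&[i64]> {
        concatenated.chunks(num_levels).collect()
    }

    fn main() {
        let concatenated = vec![3i64, 7, 0, 10]; // two pages, max_def_level = 1
        let pages = per_page_histograms(&concatenated, 2);
        assert_eq!(pages[0], &[3, 7]);
        assert_eq!(pages[1], &[0, 10]);
    }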
+ pub fn append_histograms( + &mut self, + repetition_level_histogram: &Option, + definition_level_histogram: &Option, + ) { + if !self.valid { + return; + } + if let Some(ref rep_lvl_hist) = repetition_level_histogram { + let hist = self.repetition_level_histograms.get_or_insert(Vec::new()); + hist.reserve(rep_lvl_hist.len()); + hist.extend(rep_lvl_hist.values()); + } + if let Some(ref def_lvl_hist) = definition_level_histogram { + let hist = self.definition_level_histograms.get_or_insert(Vec::new()); + hist.reserve(def_lvl_hist.len()); + hist.extend(def_lvl_hist.values()); + } + } + pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) { self.boundary_order = boundary_order; } @@ -1069,8 +1262,8 @@ impl ColumnIndexBuilder { self.max_values, self.boundary_order, self.null_counts, - None, - None, + self.repetition_level_histograms, + self.definition_level_histograms, ) } } @@ -1286,6 +1479,8 @@ mod tests { .set_column_index_offset(Some(8000)) .set_column_index_length(Some(25)) .set_unencoded_byte_array_data_bytes(Some(2000)) + .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100]))) + .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200]))) .build() .unwrap(); @@ -1397,7 +1592,7 @@ mod tests { let row_group_meta_with_stats = vec![row_group_meta_with_stats]; let parquet_meta = ParquetMetaData::new(file_metadata.clone(), row_group_meta_with_stats); - let base_expected_size = 2088; + let base_expected_size = 2280; assert_eq!(parquet_meta.memory_size(), base_expected_size); @@ -1425,7 +1620,7 @@ mod tests { ]]), ); - let bigger_expected_size = 2400; + let bigger_expected_size = 2784; // more set fields means more memory usage assert!(bigger_expected_size > base_expected_size); assert_eq!(parquet_meta.memory_size(), bigger_expected_size); diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 7eba949042f1..68412572b5f2 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -36,6 +36,17 @@ pub struct PageIndex { pub max: Option, /// Null values in the page pub null_count: Option, + /// Repetition level histogram for the page + /// + /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`. + /// For example, `repetition_level_histogram[0]` indicates how many rows the page contains. + pub repetition_level_histogram: Option>, + /// Definition level histogram for the page + /// + /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`. + /// For example, `definition_level_histogram[max_definition_level]` indicates how many + /// non-null values are present in the page. + pub definition_level_histogram: Option>, } impl PageIndex { @@ -48,6 +59,12 @@ impl PageIndex { pub fn null_count(&self) -> Option { self.null_count } + pub fn repetition_level_histogram(&self) -> Option<&Vec> { + self.repetition_level_histogram.as_ref() + } + pub fn definition_level_histogram(&self) -> Option<&Vec> { + self.definition_level_histogram.as_ref() + } } impl PageIndex @@ -149,26 +166,57 @@ impl NativeIndex { .map(|x| x.into_iter().map(Some).collect::>()) .unwrap_or_else(|| vec![None; len]); + // histograms are a 1D array encoding a 2D num_pages X num_levels matrix. + let to_page_histograms = |opt_hist: Option>| { + if let Some(hist) = opt_hist { + // TODO: should we assert (hist.len() % len) == 0? 
+ let num_levels = hist.len() / len; + let mut res = Vec::with_capacity(len); + for i in 0..len { + let page_idx = i * num_levels; + let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); + res.push(Some(page_hist)); + } + res + } else { + vec![None; len] + } + }; + + let rep_hists: Vec>> = + to_page_histograms(index.repetition_level_histograms); + let def_hists: Vec>> = + to_page_histograms(index.definition_level_histograms); + let indexes = index .min_values .iter() .zip(index.max_values.into_iter()) .zip(index.null_pages.into_iter()) .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - let min = min.as_slice(); - let max = max.as_slice(); - (Some(from_le_slice::(min)), Some(from_le_slice::(max))) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) + .zip(rep_hists.into_iter()) + .zip(def_hists.into_iter()) + .map( + |( + ((((min, max), is_null), null_count), repetition_level_histogram), + definition_level_histogram, + )| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min.as_slice(); + let max = max.as_slice(); + (Some(from_le_slice::(min)), Some(from_le_slice::(max))) + }; + Ok(PageIndex { + min, + max, + null_count, + repetition_level_histogram, + definition_level_histogram, + }) + }, + ) .collect::, ParquetError>>()?; Ok(Self { @@ -188,6 +236,8 @@ mod tests { min: Some(-123), max: Some(234), null_count: Some(0), + repetition_level_histogram: Some(vec![1, 2]), + definition_level_histogram: Some(vec![1, 2, 3]), }; assert_eq!(page_index.min().unwrap(), &-123); @@ -195,6 +245,11 @@ mod tests { assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes()); assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); assert_eq!(page_index.null_count().unwrap(), 0); + assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2])); + assert_eq!( + page_index.definition_level_histogram(), + Some(&vec![1, 2, 3]) + ); } #[test] @@ -203,6 +258,8 @@ mod tests { min: None, max: None, null_count: None, + repetition_level_histogram: None, + definition_level_histogram: None, }; assert_eq!(page_index.min(), None); @@ -210,5 +267,7 @@ mod tests { assert_eq!(page_index.min_bytes(), None); assert_eq!(page_index.max_bytes(), None); assert_eq!(page_index.null_count(), None); + assert_eq!(page_index.repetition_level_histogram(), None); + assert_eq!(page_index.definition_level_histogram(), None); } } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index c44a7e6697f0..f2e8f74a378c 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -663,6 +663,12 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> { .set_dictionary_page_offset(src_dictionary_offset.map(map_offset)) .set_unencoded_byte_array_data_bytes(metadata.unencoded_byte_array_data_bytes()); + if let Some(rep_hist) = metadata.repetition_level_histogram() { + builder = builder.set_repetition_level_histogram(Some(rep_hist.clone())) + } + if let Some(def_hist) = metadata.definition_level_histogram() { + builder = builder.set_definition_level_histogram(Some(def_hist.clone())) + } if let Some(statistics) = metadata.statistics() { builder = builder.set_statistics(statistics.clone()) } @@ -1889,6 +1895,12 @@ mod tests { assert_eq!(file_metadata.row_groups[0].columns.len(), 1); assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let check_def_hist = |def_hist: &[i64]| { + assert_eq!(def_hist.len(), 2); + assert_eq!(def_hist[0], 3); + assert_eq!(def_hist[1], 
7); + }; + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); let meta_data = file_metadata.row_groups[0].columns[0] .meta_data @@ -1898,12 +1910,13 @@ mod tests { let size_stats = meta_data.size_statistics.as_ref().unwrap(); assert!(size_stats.repetition_level_histogram.is_none()); - assert!(size_stats.definition_level_histogram.is_none()); + assert!(size_stats.definition_level_histogram.is_some()); assert!(size_stats.unencoded_byte_array_data_bytes.is_some()); assert_eq!( unenc_size, size_stats.unencoded_byte_array_data_bytes.unwrap() ); + check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); // check that the read metadata is also correct let options = ReadOptionsBuilder::new().with_page_index().build(); @@ -1915,12 +1928,31 @@ mod tests { let rowgroup = reader.get_row_group(0).unwrap(); assert_eq!(rowgroup.num_columns(), 1); let column = rowgroup.metadata().column(0); + assert!(column.definition_level_histogram().is_some()); + assert!(column.repetition_level_histogram().is_none()); assert!(column.unencoded_byte_array_data_bytes().is_some()); + check_def_hist(column.definition_level_histogram().unwrap().values()); assert_eq!( unenc_size, column.unencoded_byte_array_data_bytes().unwrap() ); + // check histogram in column index as well + assert!(reader.metadata().column_index().is_some()); + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); + assert_eq!(column_index[0].len(), 1); + let col_idx = if let Index::BYTE_ARRAY(index) = &column_index[0][0] { + assert_eq!(index.indexes.len(), 1); + &index.indexes[0] + } else { + unreachable!() + }; + + assert!(col_idx.repetition_level_histogram().is_none()); + assert!(col_idx.definition_level_histogram().is_some()); + check_def_hist(col_idx.definition_level_histogram().unwrap()); + assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); assert_eq!(offset_index.len(), 1); @@ -1933,4 +1965,114 @@ mod tests { assert_eq!(page_sizes.len(), 1); assert_eq!(page_sizes[0], unenc_size); } + + #[test] + fn test_size_statistics_with_repetition_and_nulls() { + let message_type = " + message test_schema { + OPTIONAL group i32_list (LIST) { + REPEATED group list { + OPTIONAL INT32 element; + } + } + } + "; + // column is: + // row 0: [1, 2] + // row 1: NULL + // row 2: [4, NULL] + // row 3: [] + // row 4: [7, 8, 9, 10] + let schema = Arc::new(parse_message_type(message_type).unwrap()); + let data = [1, 2, 4, 7, 8, 9, 10]; + let def_levels = [3, 3, 0, 3, 2, 1, 3, 3, 3, 3]; + let rep_levels = [0, 1, 0, 0, 1, 0, 0, 1, 1, 1]; + let file = tempfile::tempfile().unwrap(); + let props = Arc::new( + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(), + ); + let mut writer = SerializedFileWriter::new(&file, schema, props).unwrap(); + let mut row_group_writer = writer.next_row_group().unwrap(); + + let mut col_writer = row_group_writer.next_column().unwrap().unwrap(); + col_writer + .typed::() + .write_batch(&data, Some(&def_levels), Some(&rep_levels)) + .unwrap(); + col_writer.close().unwrap(); + row_group_writer.close().unwrap(); + let file_metadata = writer.close().unwrap(); + + assert_eq!(file_metadata.row_groups.len(), 1); + assert_eq!(file_metadata.row_groups[0].columns.len(), 1); + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + + let check_def_hist = |def_hist: &[i64]| { + assert_eq!(def_hist.len(), 4); + assert_eq!(def_hist[0], 1); + assert_eq!(def_hist[1], 1); 
+ assert_eq!(def_hist[2], 1); + assert_eq!(def_hist[3], 7); + }; + + let check_rep_hist = |rep_hist: &[i64]| { + assert_eq!(rep_hist.len(), 2); + assert_eq!(rep_hist[0], 5); + assert_eq!(rep_hist[1], 5); + }; + + // check that histograms are set properly in the write and read metadata + // also check that unencoded_byte_array_data_bytes is not set + assert!(file_metadata.row_groups[0].columns[0].meta_data.is_some()); + let meta_data = file_metadata.row_groups[0].columns[0] + .meta_data + .as_ref() + .unwrap(); + assert!(meta_data.size_statistics.is_some()); + let size_stats = meta_data.size_statistics.as_ref().unwrap(); + assert!(size_stats.repetition_level_histogram.is_some()); + assert!(size_stats.definition_level_histogram.is_some()); + assert!(size_stats.unencoded_byte_array_data_bytes.is_none()); + check_def_hist(size_stats.definition_level_histogram.as_ref().unwrap()); + check_rep_hist(size_stats.repetition_level_histogram.as_ref().unwrap()); + + // check that the read metadata is also correct + let options = ReadOptionsBuilder::new().with_page_index().build(); + let reader = SerializedFileReader::new_with_options(file, options).unwrap(); + + let rfile_metadata = reader.metadata().file_metadata(); + assert_eq!(rfile_metadata.num_rows(), file_metadata.num_rows); + assert_eq!(reader.num_row_groups(), 1); + let rowgroup = reader.get_row_group(0).unwrap(); + assert_eq!(rowgroup.num_columns(), 1); + let column = rowgroup.metadata().column(0); + assert!(column.definition_level_histogram().is_some()); + assert!(column.repetition_level_histogram().is_some()); + assert!(column.unencoded_byte_array_data_bytes().is_none()); + check_def_hist(column.definition_level_histogram().unwrap().values()); + check_rep_hist(column.repetition_level_histogram().unwrap().values()); + + // check histogram in column index as well + assert!(reader.metadata().column_index().is_some()); + let column_index = reader.metadata().column_index().unwrap(); + assert_eq!(column_index.len(), 1); + assert_eq!(column_index[0].len(), 1); + let col_idx = if let Index::INT32(index) = &column_index[0][0] { + assert_eq!(index.indexes.len(), 1); + &index.indexes[0] + } else { + unreachable!() + }; + + check_def_hist(col_idx.definition_level_histogram().unwrap()); + check_rep_hist(col_idx.repetition_level_histogram().unwrap()); + + assert!(reader.metadata().offset_index().is_some()); + let offset_index = reader.metadata().offset_index().unwrap(); + assert_eq!(offset_index.len(), 1); + assert_eq!(offset_index[0].len(), 1); + assert!(offset_index[0][0].unencoded_byte_array_data_bytes.is_none()); + } } From f42d2420525a05a9b55461d83b359779ca5cc2a3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 26 Jul 2024 11:47:46 -0600 Subject: [PATCH 03/17] Add ArrowError::ArithmeticError (#6130) --- arrow-arith/src/arithmetic.rs | 4 ++-- arrow-arith/src/numeric.rs | 32 +++++++++++++++++++------------- arrow-array/src/arithmetic.rs | 17 ++++++++++------- arrow-cast/src/cast/mod.rs | 7 +++++-- arrow-schema/src/error.rs | 2 ++ 5 files changed, 38 insertions(+), 24 deletions(-) diff --git a/arrow-arith/src/arithmetic.rs b/arrow-arith/src/arithmetic.rs index 124614d77f97..febf5ceabdd9 100644 --- a/arrow-arith/src/arithmetic.rs +++ b/arrow-arith/src/arithmetic.rs @@ -122,7 +122,7 @@ pub fn multiply_fixed_point_checked( let mut mul = a.wrapping_mul(b); mul = divide_and_round::(mul, divisor); mul.to_i128().ok_or_else(|| { - ArrowError::ComputeError(format!("Overflow happened on: {:?} * {:?}", a, b)) + ArrowError::ArithmeticOverflow(format!("Overflow 
happened on: {:?} * {:?}", a, b)) }) }) .and_then(|a| a.with_precision_and_scale(precision, required_scale)) @@ -323,7 +323,7 @@ mod tests { // `multiply` overflows on this case. let err = mul(&a, &b).unwrap_err(); - assert_eq!(err.to_string(), "Compute error: Overflow happened on: 123456789000000000000000000 * 10000000000000000000"); + assert_eq!(err.to_string(), "Arithmetic overflow: Overflow happened on: 123456789000000000000000000 * 10000000000000000000"); // Avoid overflow by reducing the scale. let result = multiply_fixed_point(&a, &b, 28).unwrap(); diff --git a/arrow-arith/src/numeric.rs b/arrow-arith/src/numeric.rs index 91aaf628438d..b6af40f7d7c2 100644 --- a/arrow-arith/src/numeric.rs +++ b/arrow-arith/src/numeric.rs @@ -888,15 +888,15 @@ mod tests { test_neg_primitive::( &[i32::MIN], - Err("Compute error: Overflow happened on: - -2147483648"), + Err("Arithmetic overflow: Overflow happened on: - -2147483648"), ); test_neg_primitive::( &[i64::MIN], - Err("Compute error: Overflow happened on: - -9223372036854775808"), + Err("Arithmetic overflow: Overflow happened on: - -9223372036854775808"), ); test_neg_primitive::( &[i64::MIN], - Err("Compute error: Overflow happened on: - -9223372036854775808"), + Err("Arithmetic overflow: Overflow happened on: - -9223372036854775808"), ); let r = neg_wrapping(&Int32Array::from(vec![i32::MIN])).unwrap(); @@ -911,7 +911,7 @@ mod tests { assert_eq!( err, - "Compute error: Overflow happened on: - -9223372036854775808" + "Arithmetic overflow: Overflow happened on: - -9223372036854775808" ); let a = Decimal128Array::from(vec![1, 3, -44, 2, 4]) @@ -1016,28 +1016,31 @@ mod tests { let a = UInt8Array::from(vec![56, 5, 3]); let b = UInt8Array::from(vec![200, 2, 5]); let err = add(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 56 + 200"); + assert_eq!(err, "Arithmetic overflow: Overflow happened on: 56 + 200"); let result = add_wrapping(&a, &b).unwrap(); assert_eq!(result.as_ref(), &UInt8Array::from(vec![0, 7, 8])); let a = UInt8Array::from(vec![34, 5, 3]); let b = UInt8Array::from(vec![200, 2, 5]); let err = sub(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 34 - 200"); + assert_eq!(err, "Arithmetic overflow: Overflow happened on: 34 - 200"); let result = sub_wrapping(&a, &b).unwrap(); assert_eq!(result.as_ref(), &UInt8Array::from(vec![90, 3, 254])); let a = UInt8Array::from(vec![34, 5, 3]); let b = UInt8Array::from(vec![200, 2, 5]); let err = mul(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 34 * 200"); + assert_eq!(err, "Arithmetic overflow: Overflow happened on: 34 * 200"); let result = mul_wrapping(&a, &b).unwrap(); assert_eq!(result.as_ref(), &UInt8Array::from(vec![144, 10, 15])); let a = Int16Array::from(vec![i16::MIN]); let b = Int16Array::from(vec![-1]); let err = div(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: -32768 / -1"); + assert_eq!( + err, + "Arithmetic overflow: Overflow happened on: -32768 / -1" + ); let a = Int16Array::from(vec![21]); let b = Int16Array::from(vec![0]); @@ -1146,7 +1149,7 @@ mod tests { .with_precision_and_scale(3, -2) .unwrap(); let err = add(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 10 ^ 39"); + assert_eq!(err, "Arithmetic overflow: Overflow happened on: 10 ^ 39"); let a = Decimal128Array::from(vec![10]) .with_precision_and_scale(3, -1) @@ -1154,7 +1157,7 @@ mod tests { let err = add(&a, 
&b).unwrap_err().to_string(); assert_eq!( err, - "Compute error: Overflow happened on: 10 * 100000000000000000000000000000000000000" + "Arithmetic overflow: Overflow happened on: 10 * 100000000000000000000000000000000000000" ); let b = Decimal128Array::from(vec![0]) @@ -1349,7 +1352,10 @@ mod tests { let a = IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNano::MAX]); let b = IntervalMonthDayNanoArray::from(vec![IntervalMonthDayNano::ONE]); let err = add(&a, &b).unwrap_err().to_string(); - assert_eq!(err, "Compute error: Overflow happened on: 2147483647 + 1"); + assert_eq!( + err, + "Arithmetic overflow: Overflow happened on: 2147483647 + 1" + ); } fn test_duration_impl>() { @@ -1384,7 +1390,7 @@ mod tests { let err = add(&a, &b).unwrap_err().to_string(); assert_eq!( err, - "Compute error: Overflow happened on: 9223372036854775807 + 1" + "Arithmetic overflow: Overflow happened on: 9223372036854775807 + 1" ); } @@ -1511,7 +1517,7 @@ mod tests { let err = sub(&a, &b).unwrap_err().to_string(); assert_eq!( err, - "Compute error: Overflow happened on: 9223372036854775807 - -1" + "Arithmetic overflow: Overflow happened on: 9223372036854775807 - -1" ); } } diff --git a/arrow-array/src/arithmetic.rs b/arrow-array/src/arithmetic.rs index 078c2e3bc40e..fb9c868fb6c0 100644 --- a/arrow-array/src/arithmetic.rs +++ b/arrow-array/src/arithmetic.rs @@ -154,7 +154,7 @@ macro_rules! native_type_op { #[inline] fn add_checked(self, rhs: Self) -> Result { self.checked_add(rhs).ok_or_else(|| { - ArrowError::ComputeError(format!( + ArrowError::ArithmeticOverflow(format!( "Overflow happened on: {:?} + {:?}", self, rhs )) @@ -169,7 +169,7 @@ macro_rules! native_type_op { #[inline] fn sub_checked(self, rhs: Self) -> Result { self.checked_sub(rhs).ok_or_else(|| { - ArrowError::ComputeError(format!( + ArrowError::ArithmeticOverflow(format!( "Overflow happened on: {:?} - {:?}", self, rhs )) @@ -184,7 +184,7 @@ macro_rules! native_type_op { #[inline] fn mul_checked(self, rhs: Self) -> Result { self.checked_mul(rhs).ok_or_else(|| { - ArrowError::ComputeError(format!( + ArrowError::ArithmeticOverflow(format!( "Overflow happened on: {:?} * {:?}", self, rhs )) @@ -202,7 +202,7 @@ macro_rules! native_type_op { Err(ArrowError::DivideByZero) } else { self.checked_div(rhs).ok_or_else(|| { - ArrowError::ComputeError(format!( + ArrowError::ArithmeticOverflow(format!( "Overflow happened on: {:?} / {:?}", self, rhs )) @@ -221,7 +221,7 @@ macro_rules! native_type_op { Err(ArrowError::DivideByZero) } else { self.checked_rem(rhs).ok_or_else(|| { - ArrowError::ComputeError(format!( + ArrowError::ArithmeticOverflow(format!( "Overflow happened on: {:?} % {:?}", self, rhs )) @@ -237,14 +237,17 @@ macro_rules! 
native_type_op { #[inline] fn neg_checked(self) -> Result { self.checked_neg().ok_or_else(|| { - ArrowError::ComputeError(format!("Overflow happened on: - {:?}", self)) + ArrowError::ArithmeticOverflow(format!("Overflow happened on: - {:?}", self)) }) } #[inline] fn pow_checked(self, exp: u32) -> Result { self.checked_pow(exp).ok_or_else(|| { - ArrowError::ComputeError(format!("Overflow happened on: {:?} ^ {exp:?}", self)) + ArrowError::ArithmeticOverflow(format!( + "Overflow happened on: {:?} ^ {exp:?}", + self + )) }) } diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 7df9420f94f0..5f72debcdad2 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -4531,7 +4531,7 @@ mod tests { ))], IntervalUnit::DayTime, format!( - "Compute error: Overflow happened on: {} * 100", + "Arithmetic overflow: Overflow happened on: {} * 100", i64::MAX - 2 ) ); @@ -4543,7 +4543,10 @@ mod tests { i64::MAX - 2 ))], IntervalUnit::MonthDayNano, - format!("Compute error: Overflow happened on: {} * 12", i64::MAX - 2) + format!( + "Arithmetic overflow: Overflow happened on: {} * 12", + i64::MAX - 2 + ) ); } diff --git a/arrow-schema/src/error.rs b/arrow-schema/src/error.rs index d9a0f3452c86..5e632d051f0f 100644 --- a/arrow-schema/src/error.rs +++ b/arrow-schema/src/error.rs @@ -33,6 +33,7 @@ pub enum ArrowError { SchemaError(String), ComputeError(String), DivideByZero, + ArithmeticOverflow(String), CsvError(String), JsonError(String), IoError(String, std::io::Error), @@ -88,6 +89,7 @@ impl Display for ArrowError { ArrowError::ParseError(desc) => write!(f, "Parser error: {desc}"), ArrowError::SchemaError(desc) => write!(f, "Schema error: {desc}"), ArrowError::ComputeError(desc) => write!(f, "Compute error: {desc}"), + ArrowError::ArithmeticOverflow(desc) => write!(f, "Arithmetic overflow: {desc}"), ArrowError::DivideByZero => write!(f, "Divide by zero error"), ArrowError::CsvError(desc) => write!(f, "Csv error: {desc}"), ArrowError::JsonError(desc) => write!(f, "Json error: {desc}"), From e815d067664f6f76cd26d34aef70303f479c2c4c Mon Sep 17 00:00:00 2001 From: Nick Cameron Date: Sat, 27 Jul 2024 22:27:47 +1200 Subject: [PATCH 04/17] Implement data_part for intervals (#6071) Signed-off-by: Nick Cameron Co-authored-by: Andrew Lamb --- arrow-arith/src/temporal.rs | 304 +++++++++++++++++++++++++++++++++++- 1 file changed, 299 insertions(+), 5 deletions(-) diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs index d52af32f42c4..a665b7c06bad 100644 --- a/arrow-arith/src/temporal.rs +++ b/arrow-arith/src/temporal.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use arrow_array::cast::AsArray; +use cast::as_primitive_array; use chrono::{Datelike, NaiveDateTime, Offset, TimeZone, Timelike, Utc}; use arrow_array::temporal_conversions::{ @@ -31,7 +32,7 @@ use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; use arrow_buffer::ArrowNativeType; -use arrow_schema::{ArrowError, DataType}; +use arrow_schema::{ArrowError, DataType, IntervalUnit}; /// Valid parts to extract from date/time/timestamp arrays. /// @@ -111,6 +112,7 @@ where /// - Date32/Date64 /// - Time32/Time64 /// - Timestamp +/// - Interval /// /// Returns an [`Int32Array`] unless input was a dictionary type, in which case returns /// the dictionary but with this function applied onto its values. 
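Editor's sketch, not part of the applied diff: with this patch date_part accepts interval arrays directly, so callers no longer need to convert intervals before extracting parts. A minimal usage sketch against the public arrow_arith::temporal::date_part entry point, following the YearMonth arithmetic below (months / 12 and months % 12):

    use arrow_arith::temporal::{date_part, DatePart};
    use arrow_array::{cast::AsArray, types::Int32Type, IntervalYearMonthArray};

    fn main() {
        // 26 months = 2 years and 2 months.
        let intervals = IntervalYearMonthArray::from(vec![26]);
        let years = date_part(&intervals, DatePart::Year).unwrap();
        assert_eq!(years.as_primitive::<Int32Type>().value(0), 2);
        let months = date_part(&intervals, DatePart::Month).unwrap();
        assert_eq!(months.as_primitive::<Int32Type>().value(0), 2);
    }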
@@ -137,10 +139,21 @@ pub fn date_part(array: &dyn Array, part: DatePart) -> Result { - // todo!(); - // } + DataType::Interval(IntervalUnit::YearMonth) => { + let array = as_primitive_array::(array).date_part(part)?; + let array = Arc::new(array) as ArrayRef; + Ok(array) + } + DataType::Interval(IntervalUnit::DayTime) => { + let array = as_primitive_array::(array).date_part(part)?; + let array = Arc::new(array) as ArrayRef; + Ok(array) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let array = as_primitive_array::(array).date_part(part)?; + let array = Arc::new(array) as ArrayRef; + Ok(array) + } DataType::Dictionary(_, _) => { let array = array.as_any_dictionary(); let values = date_part(array.values(), part)?; @@ -387,6 +400,88 @@ impl ExtractDatePartExt for PrimitiveArray { } } +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Year => Ok(self.unary_opt(|d| Some(d / 12))), + DatePart::Month => Ok(self.unary_opt(|d| Some(d % 12))), + + DatePart::Quarter + | DatePart::Week + | DatePart::Day + | DatePart::DayOfWeekSunday0 + | DatePart::DayOfWeekMonday0 + | DatePart::DayOfYear + | DatePart::Hour + | DatePart::Minute + | DatePart::Second + | DatePart::Millisecond + | DatePart::Microsecond + | DatePart::Nanosecond => { + return_compute_error_with!(format!("{part} does not support"), self.data_type()) + } + } + } +} + +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Week => Ok(self.unary_opt(|d| Some(d.days / 7))), + DatePart::Day => Ok(self.unary_opt(|d| Some(d.days))), + DatePart::Hour => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 60 * 1_000)))), + DatePart::Minute => Ok(self.unary_opt(|d| Some(d.milliseconds / (60 * 1_000)))), + DatePart::Second => Ok(self.unary_opt(|d| Some(d.milliseconds / 1_000))), + DatePart::Millisecond => Ok(self.unary_opt(|d| Some(d.milliseconds))), + DatePart::Microsecond => Ok(self.unary_opt(|d| d.milliseconds.checked_mul(1_000))), + DatePart::Nanosecond => Ok(self.unary_opt(|d| d.milliseconds.checked_mul(1_000_000))), + + DatePart::Quarter + | DatePart::Year + | DatePart::Month + | DatePart::DayOfWeekSunday0 + | DatePart::DayOfWeekMonday0 + | DatePart::DayOfYear => { + return_compute_error_with!(format!("{part} does not support"), self.data_type()) + } + } + } +} + +impl ExtractDatePartExt for PrimitiveArray { + fn date_part(&self, part: DatePart) -> Result { + match part { + DatePart::Year => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months / 12))), + DatePart::Month => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.months))), + DatePart::Week => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.days / 7))), + DatePart::Day => Ok(self.unary_opt(|d: IntervalMonthDayNano| Some(d.days))), + DatePart::Hour => { + Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 60 * 1_000_000_000)).try_into().ok())) + } + DatePart::Minute => { + Ok(self.unary_opt(|d| (d.nanoseconds / (60 * 1_000_000_000)).try_into().ok())) + } + DatePart::Second => { + Ok(self.unary_opt(|d| (d.nanoseconds / 1_000_000_000).try_into().ok())) + } + DatePart::Millisecond => { + Ok(self.unary_opt(|d| (d.nanoseconds / 1_000_000).try_into().ok())) + } + DatePart::Microsecond => { + Ok(self.unary_opt(|d| (d.nanoseconds / 1_000).try_into().ok())) + } + DatePart::Nanosecond => Ok(self.unary_opt(|d| d.nanoseconds.try_into().ok())), + + DatePart::Quarter + | DatePart::DayOfWeekSunday0 + | DatePart::DayOfWeekMonday0 + | DatePart::DayOfYear 
=> { + return_compute_error_with!(format!("{part} does not support"), self.data_type()) + } + } + } +} + macro_rules! return_compute_error_with { ($msg:expr, $param:expr) => { return { Err(ArrowError::ComputeError(format!("{}: {:?}", $msg, $param))) } @@ -1500,4 +1595,203 @@ mod tests { ensure_returns_error(&Time64MicrosecondArray::from(vec![0])); ensure_returns_error(&Time64NanosecondArray::from(vec![0])); } + + // IntervalDayTimeType week, day, hour, miute, second, mili, u, nano; invalid month, year; ignores the other part + // IntervalMonthDayNanoType year -> nano; days don't affect months, time doesn't affect days, time doesn't affect months (and vice versa) + #[test] + fn test_interval_year_month_array() { + let input: IntervalYearMonthArray = vec![0, 5, 24].into(); + + let actual = date_part(&input, DatePart::Year).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(2, actual.value(2)); + + let actual = date_part(&input, DatePart::Month).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(5, actual.value(1)); + assert_eq!(0, actual.value(2)); + + assert!(date_part(&input, DatePart::Day).is_err()); + assert!(date_part(&input, DatePart::Week).is_err()); + } + + #[test] + fn test_interval_day_time_array() { + let input: IntervalDayTimeArray = vec![ + IntervalDayTime::ZERO, + IntervalDayTime::new(10, 42), + IntervalDayTime::new(10, 1042), + IntervalDayTime::new(10, MILLISECONDS_IN_DAY as i32 + 1), + ] + .into(); + + // Time doesn't affect days. + let actual = date_part(&input, DatePart::Day).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(10, actual.value(1)); + assert_eq!(10, actual.value(2)); + assert_eq!(10, actual.value(3)); + + let actual = date_part(&input, DatePart::Week).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(1, actual.value(1)); + assert_eq!(1, actual.value(2)); + assert_eq!(1, actual.value(3)); + + // Days doesn't affect time. + let actual = date_part(&input, DatePart::Nanosecond).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(42_000_000, actual.value(1)); + assert_eq!(1_042_000_000, actual.value(2)); + // Overflow returns zero. + assert_eq!(0, actual.value(3)); + + let actual = date_part(&input, DatePart::Microsecond).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(42_000, actual.value(1)); + assert_eq!(1_042_000, actual.value(2)); + // Overflow returns zero. 
+ assert_eq!(0, actual.value(3)); + + let actual = date_part(&input, DatePart::Millisecond).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(42, actual.value(1)); + assert_eq!(1042, actual.value(2)); + assert_eq!(MILLISECONDS_IN_DAY as i32 + 1, actual.value(3)); + + let actual = date_part(&input, DatePart::Second).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(1, actual.value(2)); + assert_eq!(24 * 60 * 60, actual.value(3)); + + let actual = date_part(&input, DatePart::Minute).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(0, actual.value(2)); + assert_eq!(24 * 60, actual.value(3)); + + let actual = date_part(&input, DatePart::Hour).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(0, actual.value(2)); + assert_eq!(24, actual.value(3)); + + // Month and year are not valid (since days in month varies). + assert!(date_part(&input, DatePart::Month).is_err()); + assert!(date_part(&input, DatePart::Year).is_err()); + } + + #[test] + fn test_interval_month_day_nano_array() { + let input: IntervalMonthDayNanoArray = vec![ + IntervalMonthDayNano::ZERO, + IntervalMonthDayNano::new(5, 10, 42), + IntervalMonthDayNano::new(16, 35, MILLISECONDS_IN_DAY * 1_000_000 + 1), + ] + .into(); + + // Year and month follow from month, but are not affected by days or nanos. + let actual = date_part(&input, DatePart::Year).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(1, actual.value(2)); + + let actual = date_part(&input, DatePart::Month).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(5, actual.value(1)); + assert_eq!(16, actual.value(2)); + + // Week and day follow from day, but are not affected by months or nanos. + let actual = date_part(&input, DatePart::Week).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(1, actual.value(1)); + assert_eq!(5, actual.value(2)); + + let actual = date_part(&input, DatePart::Day).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(10, actual.value(1)); + assert_eq!(35, actual.value(2)); + + // Times follow from nanos, but are not affected by months or dats. 
+ let actual = date_part(&input, DatePart::Hour).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(24, actual.value(2)); + + let actual = date_part(&input, DatePart::Minute).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(24 * 60, actual.value(2)); + + let actual = date_part(&input, DatePart::Second).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(24 * 60 * 60, actual.value(2)); + + let actual = date_part(&input, DatePart::Millisecond).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + assert_eq!(24 * 60 * 60 * 1_000, actual.value(2)); + + let actual = date_part(&input, DatePart::Microsecond).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(0, actual.value(1)); + // Overflow gives zero. + assert_eq!(0, actual.value(2)); + + let actual = date_part(&input, DatePart::Nanosecond).unwrap(); + let actual = actual.as_primitive::(); + assert_eq!(0, actual.value(0)); + assert_eq!(42, actual.value(1)); + // Overflow gives zero. + assert_eq!(0, actual.value(2)); + } + + #[test] + fn test_interval_array_invalid_parts() { + fn ensure_returns_error(array: &dyn Array) { + let invalid_parts = [ + DatePart::Quarter, + DatePart::DayOfWeekSunday0, + DatePart::DayOfWeekMonday0, + DatePart::DayOfYear, + ]; + + for part in invalid_parts { + let err = date_part(array, part).unwrap_err(); + let expected = format!( + "Compute error: {part} does not support: {}", + array.data_type() + ); + assert_eq!(expected, err.to_string()); + } + } + + ensure_returns_error(&IntervalYearMonthArray::from(vec![0])); + ensure_returns_error(&IntervalDayTimeArray::from(vec![IntervalDayTime::ZERO])); + ensure_returns_error(&IntervalMonthDayNanoArray::from(vec![ + IntervalMonthDayNano::ZERO, + ])); + } } From 705d3414eb991c234aaced332c11973e997d360f Mon Sep 17 00:00:00 2001 From: Alexander Rafferty Date: Sat, 27 Jul 2024 20:41:34 +1000 Subject: [PATCH 05/17] Remove `SchemaBuilder` dependency from `StructArray` constructors (#6139) --- arrow-array/src/array/struct_array.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/struct_array.rs b/arrow-array/src/array/struct_array.rs index 44a7f38c32ce..059bc0b5e65b 100644 --- a/arrow-array/src/array/struct_array.rs +++ b/arrow-array/src/array/struct_array.rs @@ -18,7 +18,7 @@ use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch}; use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, SchemaBuilder}; +use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; use std::sync::Arc; use std::{any::Any, ops::Index}; @@ -326,7 +326,7 @@ impl TryFrom> for StructArray { /// builds a StructArray from a vector of names and arrays. 
fn try_from(values: Vec<(&str, ArrayRef)>) -> Result {
-        let (schema, arrays): (SchemaBuilder, _) = values
+        let (fields, arrays): (Vec<_>, _) = values
             .into_iter()
             .map(|(name, array)| {
                 (
@@ -336,7 +336,7 @@ impl TryFrom> for StructArray {
             })
             .unzip();

-        StructArray::try_new(schema.finish().fields, arrays, None)
+        StructArray::try_new(fields.into(), arrays, None)
     }
 }

@@ -397,8 +397,8 @@ impl Array for StructArray {

 impl From> for StructArray {
     fn from(v: Vec<(FieldRef, ArrayRef)>) -> Self {
-        let (schema, arrays): (SchemaBuilder, _) = v.into_iter().unzip();
-        StructArray::new(schema.finish().fields, arrays, None)
+        let (fields, arrays): (Vec<_>, _) = v.into_iter().unzip();
+        StructArray::new(fields.into(), arrays, None)
     }
 }

@@ -424,9 +424,9 @@ impl std::fmt::Debug for StructArray {
 impl From<(Vec<(FieldRef, ArrayRef)>, Buffer)> for StructArray {
     fn from(pair: (Vec<(FieldRef, ArrayRef)>, Buffer)) -> Self {
         let len = pair.0.first().map(|x| x.1.len()).unwrap_or_default();
-        let (fields, arrays): (SchemaBuilder, Vec<_>) = pair.0.into_iter().unzip();
+        let (fields, arrays): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip();
         let nulls = NullBuffer::new(BooleanBuffer::new(pair.1, 0, len));
-        Self::new(fields.finish().fields, arrays, Some(nulls))
+        Self::new(fields.into(), arrays, Some(nulls))
     }
 }

From 5f5a82cb388094ea2a54f313b5a43249613195aa Mon Sep 17 00:00:00 2001
From: V0ldek
Date: Sun, 28 Jul 2024 12:55:40 +0200
Subject: [PATCH 06/17] Remove automatic buffering in `ipc::reader::FileReader`
 for consistent buffering (#6132)

* change ipc::reader and writer APIs for consistent buffering

Current writer API automatically wraps the supplied std::io::Writer
impl into a BufWriter. It is cleaner and more idiomatic to have the
default use the supplied impl directly, as the user might already have
a BufWriter or an impl that doesn't actually benefit from buffering at
all.

StreamReader does a similar thing, but it also exposes a
`try_new_unbuffered` that bypasses the internal wrap.

Here we propose a consistent, non-buffered-by-default API:
- `try_new` does not wrap the passed reader/writer,
- `try_new_buffered` is a convenience function that does wrap the
  reader/writer into a BufReader/BufWriter,
- all four publicly exposed IPC reader/writers follow the above
  consistently, i.e. `StreamReader`, `FileReader`, `StreamWriter`,
  `FileWriter`.

Those are breaking changes.

An additional tweak: removed the generic type bounds from struct
definitions on the four types, as that is the idiomatic Rust approach
(see e.g. stdlib's HashMap that has no bounds on the struct definition,
only the impl requires Hash + Eq).

See #6099 for the discussion.

* improvements to docs in `arrow::ipc::reader` and `writer`

Applied a few suggestions, made `Error` sections more consistent.
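For illustration, a minimal round-trip with the constructors described above might look as follows; the `roundtrip` helper, the file path, and the error handling are illustrative only and not part of this change:

```rust
use std::fs::File;
use std::io::BufWriter;

use arrow::ipc::reader::FileReader;
use arrow::ipc::writer::FileWriter;
use arrow::record_batch::RecordBatch;

fn roundtrip(batch: &RecordBatch, path: &str) -> Result<(), Box<dyn std::error::Error>> {
    // `try_new` no longer wraps the writer: callers opt in to buffering,
    // either by passing a `BufWriter` themselves...
    let mut writer = FileWriter::try_new(BufWriter::new(File::create(path)?), &batch.schema())?;
    writer.write(batch)?;
    writer.finish()?;

    // ...or by using the `try_new_buffered` convenience constructor,
    // which wraps the supplied reader/writer internally.
    let reader = FileReader::try_new_buffered(File::open(path)?, None)?;
    for maybe_batch in reader {
        let _batch = maybe_batch?;
    }
    Ok(())
}
```

With this layout, buffering is an explicit choice at the call site rather than an implicit wrap inside the constructor.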
--- arrow-ipc/src/reader.rs | 65 +++++++++++++++++++++-------- arrow-ipc/src/writer.rs | 92 ++++++++++++++++++++++++++++++----------- 2 files changed, 115 insertions(+), 42 deletions(-) diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 1f83200d65f8..2b1d09dc9588 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -1010,8 +1010,8 @@ impl FileReaderBuilder { } /// Arrow File reader -pub struct FileReader { - /// Buffered file reader that supports reading and seeking +pub struct FileReader { + /// File reader that supports reading and seeking reader: R, /// The decoder @@ -1032,7 +1032,7 @@ pub struct FileReader { custom_metadata: HashMap, } -impl fmt::Debug for FileReader { +impl fmt::Debug for FileReader { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { f.debug_struct("FileReader") .field("decoder", &self.decoder) @@ -1043,10 +1043,26 @@ impl fmt::Debug for FileReader { } } +impl FileReader> { + /// Try to create a new file reader with the reader wrapped in a BufReader. + /// + /// See [`FileReader::try_new`] for an unbuffered version. + pub fn try_new_buffered(reader: R, projection: Option>) -> Result { + Self::try_new(BufReader::new(reader), projection) + } +} + impl FileReader { - /// Try to create a new file reader + /// Try to create a new file reader. /// - /// Returns errors if the file does not meet the Arrow Format footer requirements + /// There is no internal buffering. If buffered reads are needed you likely want to use + /// [`FileReader::try_new_buffered`] instead. + /// + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if: + /// - the file does not meet the Arrow Format footer requirements, or + /// - file endianness does not match the target endianness. pub fn try_new(reader: R, projection: Option>) -> Result { let builder = FileReaderBuilder { projection, @@ -1129,7 +1145,7 @@ impl RecordBatchReader for FileReader { } /// Arrow Stream reader -pub struct StreamReader { +pub struct StreamReader { /// Stream reader reader: R, @@ -1150,10 +1166,10 @@ pub struct StreamReader { projection: Option<(Vec, Schema)>, } -impl fmt::Debug for StreamReader { +impl fmt::Debug for StreamReader { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> { f.debug_struct("StreamReader") - .field("reader", &"BufReader<..>") + .field("reader", &"R") .field("schema", &self.schema) .field("dictionaries_by_id", &self.dictionaries_by_id) .field("finished", &self.finished) @@ -1163,21 +1179,27 @@ impl fmt::Debug for StreamReader { } impl StreamReader> { - /// Try to create a new stream reader with the reader wrapped in a BufReader + /// Try to create a new stream reader with the reader wrapped in a BufReader. /// - /// The first message in the stream is the schema, the reader will fail if it does not - /// encounter a schema. - /// To check if the reader is done, use `is_finished(self)` - pub fn try_new(reader: R, projection: Option>) -> Result { - Self::try_new_unbuffered(BufReader::new(reader), projection) + /// See [`StreamReader::try_new`] for an unbuffered version. + pub fn try_new_buffered(reader: R, projection: Option>) -> Result { + Self::try_new(BufReader::new(reader), projection) } } impl StreamReader { - /// Try to create a new stream reader but do not wrap the reader in a BufReader. + /// Try to create a new stream reader. /// - /// Unless you need the StreamReader to be unbuffered you likely want to use `StreamReader::try_new` instead. 
- pub fn try_new_unbuffered( + /// To check if the reader is done, use [`is_finished(self)`](StreamReader::is_finished). + /// + /// There is no internal buffering. If buffered reads are needed you likely want to use + /// [`StreamReader::try_new_buffered`] instead. + /// + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if the reader does not encounter a schema + /// as the first message in the stream. + pub fn try_new( mut reader: R, projection: Option>, ) -> Result, ArrowError> { @@ -1224,6 +1246,15 @@ impl StreamReader { }) } + /// Deprecated, use [`StreamReader::try_new`] instead. + #[deprecated(since = "53.0.0", note = "use `try_new` instead")] + pub fn try_new_unbuffered( + reader: R, + projection: Option>, + ) -> Result { + Self::try_new(reader, projection) + } + /// Return the schema of the stream pub fn schema(&self) -> SchemaRef { self.schema.clone() diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 5a8adb31b038..ade902f7cafd 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -821,9 +821,9 @@ impl DictionaryTracker { } /// Writer for an IPC file -pub struct FileWriter { +pub struct FileWriter { /// The object to write to - writer: BufWriter, + writer: W, /// IPC write options write_options: IpcWriteOptions, /// A reference to the schema, used in validating record batches @@ -844,21 +844,41 @@ pub struct FileWriter { data_gen: IpcDataGenerator, } +impl FileWriter> { + /// Try to create a new file writer with the writer wrapped in a BufWriter. + /// + /// See [`FileWriter::try_new`] for an unbuffered version. + pub fn try_new_buffered(writer: W, schema: &Schema) -> Result { + Self::try_new(BufWriter::new(writer), schema) + } +} + impl FileWriter { /// Try to create a new writer, with the schema written as part of the header + /// + /// Note the created writer is not buffered. See [`FileWriter::try_new_buffered`] for details. + /// + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if writing the header to the writer fails. pub fn try_new(writer: W, schema: &Schema) -> Result { let write_options = IpcWriteOptions::default(); Self::try_new_with_options(writer, schema, write_options) } /// Try to create a new writer with IpcWriteOptions + /// + /// Note the created writer is not buffered. See [`FileWriter::try_new_buffered`] for details. + /// + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if writing the header to the writer fails. pub fn try_new_with_options( - writer: W, + mut writer: W, schema: &Schema, write_options: IpcWriteOptions, ) -> Result { let data_gen = IpcDataGenerator::default(); - let mut writer = BufWriter::new(writer); // write magic to header aligned on alignment boundary let pad_len = pad_to_alignment(write_options.alignment, super::ARROW_MAGIC.len()); let header_size = super::ARROW_MAGIC.len() + pad_len; @@ -972,14 +992,14 @@ impl FileWriter { /// Gets a reference to the underlying writer. pub fn get_ref(&self) -> &W { - self.writer.get_ref() + &self.writer } /// Gets a mutable reference to the underlying writer. /// /// It is inadvisable to directly write to the underlying writer. pub fn get_mut(&mut self) -> &mut W { - self.writer.get_mut() + &mut self.writer } /// Flush the underlying writer. @@ -990,16 +1010,20 @@ impl FileWriter { Ok(()) } - /// Unwraps the BufWriter housed in FileWriter.writer, returning the underlying - /// writer + /// Unwraps the the underlying writer. + /// + /// The writer is flushed and the FileWriter is finished before returning. 
/// - /// The buffer is flushed and the FileWriter is finished before returning the - /// writer. + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if an error occurs while finishing the StreamWriter + /// or while flushing the writer. pub fn into_inner(mut self) -> Result { if !self.finished { + // `finish` flushes the writer. self.finish()?; } - self.writer.into_inner().map_err(ArrowError::from) + Ok(self.writer) } } @@ -1014,9 +1038,9 @@ impl RecordBatchWriter for FileWriter { } /// Writer for an IPC stream -pub struct StreamWriter { +pub struct StreamWriter { /// The object to write to - writer: BufWriter, + writer: W, /// IPC write options write_options: IpcWriteOptions, /// Whether the writer footer has been written, and the writer is finished @@ -1027,20 +1051,39 @@ pub struct StreamWriter { data_gen: IpcDataGenerator, } +impl StreamWriter> { + /// Try to create a new stream writer with the writer wrapped in a BufWriter. + /// + /// See [`StreamWriter::try_new`] for an unbuffered version. + pub fn try_new_buffered(writer: W, schema: &Schema) -> Result { + Self::try_new(BufWriter::new(writer), schema) + } +} + impl StreamWriter { - /// Try to create a new writer, with the schema written as part of the header + /// Try to create a new writer, with the schema written as part of the header. + /// + /// Note that there is no internal buffering. See also [`StreamWriter::try_new_buffered`]. + /// + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if writing the header to the writer fails. pub fn try_new(writer: W, schema: &Schema) -> Result { let write_options = IpcWriteOptions::default(); Self::try_new_with_options(writer, schema, write_options) } + /// Try to create a new writer with [`IpcWriteOptions`]. + /// + /// # Errors + /// + /// An ['Err'](Result::Err) may be returned if writing the header to the writer fails. pub fn try_new_with_options( - writer: W, + mut writer: W, schema: &Schema, write_options: IpcWriteOptions, ) -> Result { let data_gen = IpcDataGenerator::default(); - let mut writer = BufWriter::new(writer); // write the schema, set the written bytes to the schema let encoded_message = data_gen.schema_to_bytes(schema, &write_options); write_message(&mut writer, encoded_message, &write_options)?; @@ -1095,14 +1138,14 @@ impl StreamWriter { /// Gets a reference to the underlying writer. pub fn get_ref(&self) -> &W { - self.writer.get_ref() + &self.writer } /// Gets a mutable reference to the underlying writer. /// /// It is inadvisable to directly write to the underlying writer. pub fn get_mut(&mut self) -> &mut W { - self.writer.get_mut() + &mut self.writer } /// Flush the underlying writer. @@ -1113,16 +1156,14 @@ impl StreamWriter { Ok(()) } - /// Unwraps the BufWriter housed in StreamWriter.writer, returning the underlying - /// writer + /// Unwraps the the underlying writer. /// - /// The buffer is flushed and the StreamWriter is finished before returning the - /// writer. + /// The writer is flushed and the StreamWriter is finished before returning. /// /// # Errors /// - /// An ['Err'] may be returned if an error occurs while finishing the StreamWriter - /// or while flushing the buffer. + /// An ['Err'](Result::Err) may be returned if an error occurs while finishing the StreamWriter + /// or while flushing the writer. /// /// # Example /// @@ -1154,9 +1195,10 @@ impl StreamWriter { /// ``` pub fn into_inner(mut self) -> Result { if !self.finished { + // `finish` flushes. 
self.finish()?; } - self.writer.into_inner().map_err(ArrowError::from) + Ok(self.writer) } } From 80ed7128510bac114c6feec08c34ef3beed3a44a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Mon, 29 Jul 2024 03:32:41 -0700 Subject: [PATCH 07/17] Use `LevelHistogram` in `PageIndex` (#6135) * use LevelHistogram in PageIndex and ColumnIndexBuilder * revert changes to OffsetIndexBuilder --- parquet/src/file/metadata/mod.rs | 2 +- parquet/src/file/page_index/index.rs | 28 ++++++++++++++++------------ parquet/src/file/writer.rs | 6 +++--- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index cd3555de828c..d99cd951037c 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -569,7 +569,7 @@ pub struct ColumnChunkMetaData { /// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the /// number of rows with level 1, and so on. /// -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)] pub struct LevelHistogram { inner: Vec, } diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 68412572b5f2..cebb602b31a1 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -21,6 +21,7 @@ use crate::basic::Type; use crate::data_type::private::ParquetValueType; use crate::data_type::{AsBytes, ByteArray, FixedLenByteArray, Int96}; use crate::errors::ParquetError; +use crate::file::metadata::LevelHistogram; use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; use std::fmt::Debug; @@ -40,13 +41,13 @@ pub struct PageIndex { /// /// `repetition_level_histogram[i]` is a count of how many values are at repetition level `i`. /// For example, `repetition_level_histogram[0]` indicates how many rows the page contains. - pub repetition_level_histogram: Option>, + pub repetition_level_histogram: Option, /// Definition level histogram for the page /// /// `definition_level_histogram[i]` is a count of how many values are at definition level `i`. /// For example, `definition_level_histogram[max_definition_level]` indicates how many /// non-null values are present in the page. 
- pub definition_level_histogram: Option>, + pub definition_level_histogram: Option, } impl PageIndex { @@ -59,10 +60,10 @@ impl PageIndex { pub fn null_count(&self) -> Option { self.null_count } - pub fn repetition_level_histogram(&self) -> Option<&Vec> { + pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> { self.repetition_level_histogram.as_ref() } - pub fn definition_level_histogram(&self) -> Option<&Vec> { + pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> { self.definition_level_histogram.as_ref() } } @@ -175,7 +176,7 @@ impl NativeIndex { for i in 0..len { let page_idx = i * num_levels; let page_hist = hist[page_idx..page_idx + num_levels].to_vec(); - res.push(Some(page_hist)); + res.push(Some(LevelHistogram::from(page_hist))); } res } else { @@ -183,9 +184,9 @@ impl NativeIndex { } }; - let rep_hists: Vec>> = + let rep_hists: Vec> = to_page_histograms(index.repetition_level_histograms); - let def_hists: Vec>> = + let def_hists: Vec> = to_page_histograms(index.definition_level_histograms); let indexes = index @@ -236,8 +237,8 @@ mod tests { min: Some(-123), max: Some(234), null_count: Some(0), - repetition_level_histogram: Some(vec![1, 2]), - definition_level_histogram: Some(vec![1, 2, 3]), + repetition_level_histogram: Some(LevelHistogram::from(vec![1, 2])), + definition_level_histogram: Some(LevelHistogram::from(vec![1, 2, 3])), }; assert_eq!(page_index.min().unwrap(), &-123); @@ -245,10 +246,13 @@ mod tests { assert_eq!(page_index.min_bytes().unwrap(), (-123).as_bytes()); assert_eq!(page_index.max_bytes().unwrap(), 234.as_bytes()); assert_eq!(page_index.null_count().unwrap(), 0); - assert_eq!(page_index.repetition_level_histogram(), Some(&vec![1, 2])); assert_eq!( - page_index.definition_level_histogram(), - Some(&vec![1, 2, 3]) + page_index.repetition_level_histogram().unwrap().values(), + &vec![1, 2] + ); + assert_eq!( + page_index.definition_level_histogram().unwrap().values(), + &vec![1, 2, 3] ); } diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index f2e8f74a378c..89aaf028d1b9 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1951,7 +1951,7 @@ mod tests { assert!(col_idx.repetition_level_histogram().is_none()); assert!(col_idx.definition_level_histogram().is_some()); - check_def_hist(col_idx.definition_level_histogram().unwrap()); + check_def_hist(col_idx.definition_level_histogram().unwrap().values()); assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); @@ -2066,8 +2066,8 @@ mod tests { unreachable!() }; - check_def_hist(col_idx.definition_level_histogram().unwrap()); - check_rep_hist(col_idx.repetition_level_histogram().unwrap()); + check_def_hist(col_idx.definition_level_histogram().unwrap().values()); + check_rep_hist(col_idx.repetition_level_histogram().unwrap().values()); assert!(reader.metadata().offset_index().is_some()); let offset_index = reader.metadata().offset_index().unwrap(); From 11f2bb80764a3980196acd243c8d03597980dafb Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 29 Jul 2024 17:41:47 +0100 Subject: [PATCH 08/17] Fix comparison kernel benchmarks (#6147) * fix comparison kernel benchmarks * add comment as suggested by @alamb --- arrow/benches/comparison_kernels.rs | 20 ++++++++++---------- arrow/src/util/bench_util.rs | 29 ++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 
5b3d700d6030..5d18a62d13a1 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -215,11 +215,11 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("like_utf8 scalar ends with", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx%")) + b.iter(|| bench_like_utf8_scalar(&arr_string, "%xxxx")) }); c.bench_function("like_utf8 scalar starts with", |b| { - b.iter(|| bench_like_utf8_scalar(&arr_string, "%xxxx")) + b.iter(|| bench_like_utf8_scalar(&arr_string, "xxxx%")) }); c.bench_function("like_utf8 scalar complex", |b| { @@ -237,11 +237,11 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("like_utf8view scalar ends with", |b| { - b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxx%")) + b.iter(|| bench_like_utf8view_scalar(&string_view_left, "%xxxx")) }); c.bench_function("like_utf8view scalar starts with", |b| { - b.iter(|| bench_like_utf8view_scalar(&string_view_left, "%xxxx")) + b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxx%")) }); c.bench_function("like_utf8view scalar complex", |b| { @@ -259,11 +259,11 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("nlike_utf8 scalar ends with", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "xxxx%")) + b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xxxx")) }); c.bench_function("nlike_utf8 scalar starts with", |b| { - b.iter(|| bench_nlike_utf8_scalar(&arr_string, "%xxxx")) + b.iter(|| bench_nlike_utf8_scalar(&arr_string, "xxxx%")) }); c.bench_function("nlike_utf8 scalar complex", |b| { @@ -281,11 +281,11 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("ilike_utf8 scalar ends with", |b| { - b.iter(|| bench_ilike_utf8_scalar(&arr_string, "xXXx%")) + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%xXXx")) }); c.bench_function("ilike_utf8 scalar starts with", |b| { - b.iter(|| bench_ilike_utf8_scalar(&arr_string, "%XXXx")) + b.iter(|| bench_ilike_utf8_scalar(&arr_string, "XXXx%")) }); c.bench_function("ilike_utf8 scalar complex", |b| { @@ -303,11 +303,11 @@ fn add_benchmark(c: &mut Criterion) { }); c.bench_function("nilike_utf8 scalar ends with", |b| { - b.iter(|| bench_nilike_utf8_scalar(&arr_string, "xXXx%")) + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%xXXx")) }); c.bench_function("nilike_utf8 scalar starts with", |b| { - b.iter(|| bench_nilike_utf8_scalar(&arr_string, "%XXXx")) + b.iter(|| bench_nilike_utf8_scalar(&arr_string, "XXXx%")) }); c.bench_function("nilike_utf8 scalar complex", |b| { diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 9fae8e6bab38..ac7f86d561d5 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -90,7 +90,7 @@ pub fn create_month_day_nano_array_with_seed( .collect() } -/// Creates an random (but fixed-seeded) array of a given size and null density +/// Creates a random (but fixed-seeded) array of a given size and null density pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray where Standard: Distribution, @@ -108,12 +108,35 @@ where .collect() } -/// Creates an random (but fixed-seeded) array of a given size and null density +/// Creates a random (but fixed-seeded) string array of a given size and null density, strings have a random length +/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths, +/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex. 
pub fn create_string_array( size: usize, null_density: f32, ) -> GenericStringArray { - create_string_array_with_len(size, null_density, 4) + create_string_array_with_max_len(size, null_density, 400) +} + +/// Creates a random (but fixed-seeded) array of rand size with a given max size, null density and length +fn create_string_array_with_max_len( + size: usize, + null_density: f32, + max_str_len: usize, +) -> GenericStringArray { + let rng = &mut seedable_rng(); + (0..size) + .map(|_| { + if rng.gen::() < null_density { + None + } else { + let str_len = rng.gen_range(0..max_str_len); + let value = rng.sample_iter(&Alphanumeric).take(str_len).collect(); + let value = String::from_utf8(value).unwrap(); + Some(value) + } + }) + .collect() } /// Creates a random (but fixed-seeded) array of a given size, null density and length From bd1e76b0857fc1c4fcbf8ba51aa55698a0f527ab Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 29 Jul 2024 14:22:38 -0400 Subject: [PATCH 09/17] Implement exponential block size growing strategy for `StringViewBuilder` (#6136) * new block size growing strategy * Update arrow-array/src/builder/generic_bytes_view_builder.rs Co-authored-by: Andrew Lamb * update function name, deprecate old function * update comments --------- Co-authored-by: Andrew Lamb --- arrow-array/src/array/byte_view_array.rs | 4 +- .../src/builder/generic_bytes_view_builder.rs | 104 +++++++++++++++++- arrow-cast/src/cast/mod.rs | 8 +- 3 files changed, 104 insertions(+), 12 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 63b9fe30ed42..a9aed95318f7 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -757,7 +757,7 @@ mod tests { fn test_in_progress_recreation() { let array = { // make a builder with small block size. 
- let mut builder = StringViewBuilder::new().with_block_size(14); + let mut builder = StringViewBuilder::new().with_fixed_block_size(14); builder.append_value("large payload over 12 bytes"); builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created")); builder.finish() @@ -848,7 +848,7 @@ mod tests { ]; let array = { - let mut builder = StringViewBuilder::new().with_block_size(8); // create multiple buffers + let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers test_data.into_iter().for_each(|v| builder.append_option(v)); builder.finish() }; diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 7726ee35240f..4f19204b86ef 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -30,7 +30,30 @@ use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{ArrayRef, GenericByteViewArray}; -const DEFAULT_BLOCK_SIZE: u32 = 8 * 1024; +const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB +const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB + +enum BlockSizeGrowthStrategy { + Fixed { size: u32 }, + Exponential { current_size: u32 }, +} + +impl BlockSizeGrowthStrategy { + fn next_size(&mut self) -> u32 { + match self { + Self::Fixed { size } => *size, + Self::Exponential { current_size } => { + if *current_size < MAX_BLOCK_SIZE { + // we have fixed start/end block sizes, so we can't overflow + *current_size = current_size.saturating_mul(2); + *current_size + } else { + MAX_BLOCK_SIZE + } + } + } + } +} /// A builder for [`GenericByteViewArray`] /// @@ -58,7 +81,7 @@ pub struct GenericByteViewBuilder { null_buffer_builder: NullBufferBuilder, completed: Vec, in_progress: Vec, - block_size: u32, + block_size: BlockSizeGrowthStrategy, /// Some if deduplicating strings /// map ` -> ` string_tracker: Option<(HashTable, ahash::RandomState)>, @@ -78,15 +101,42 @@ impl GenericByteViewBuilder { null_buffer_builder: NullBufferBuilder::new(capacity), completed: vec![], in_progress: vec![], - block_size: DEFAULT_BLOCK_SIZE, + block_size: BlockSizeGrowthStrategy::Exponential { + current_size: STARTING_BLOCK_SIZE, + }, string_tracker: None, phantom: Default::default(), } } + /// Set a fixed buffer size for variable length strings + /// + /// The block size is the size of the buffer used to store values greater + /// than 12 bytes. The builder allocates new buffers when the current + /// buffer is full. + /// + /// By default the builder balances buffer size and buffer count by + /// growing buffer size exponentially from 8KB up to 2MB. The + /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. + /// + /// If this method is used, any new buffers allocated are + /// exactly this size. This can be useful for advanced users + /// that want to control the memory usage and buffer count. + /// + /// See for more details on the implications. + pub fn with_fixed_block_size(self, block_size: u32) -> Self { + debug_assert!(block_size > 0, "Block size must be greater than 0"); + Self { + block_size: BlockSizeGrowthStrategy::Fixed { size: block_size }, + ..self + } + } + /// Override the size of buffers to allocate for holding string data + /// Use `with_fixed_block_size` instead. 
+ #[deprecated(note = "Use `with_fixed_block_size` instead")] pub fn with_block_size(self, block_size: u32) -> Self { - Self { block_size, ..self } + self.with_fixed_block_size(block_size) } /// Deduplicate strings while building the array @@ -277,7 +327,7 @@ impl GenericByteViewBuilder { let required_cap = self.in_progress.len() + v.len(); if self.in_progress.capacity() < required_cap { self.flush_in_progress(); - let to_reserve = v.len().max(self.block_size as usize); + let to_reserve = v.len().max(self.block_size.next_size() as usize); self.in_progress.reserve(to_reserve); }; let offset = self.in_progress.len() as u32; @@ -478,7 +528,7 @@ mod tests { let mut builder = StringViewBuilder::new() .with_deduplicate_strings() - .with_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers + .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers let values = vec![ Some(value_1), @@ -585,4 +635,46 @@ mod tests { "Invalid argument error: No block found with index 5" ); } + + #[test] + fn test_string_view_with_block_size_growth() { + let mut exp_builder = StringViewBuilder::new(); + let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE); + + let long_string = String::from_utf8(vec![b'a'; STARTING_BLOCK_SIZE as usize]).unwrap(); + + for i in 0..9 { + // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M + for _ in 0..(2_u32.pow(i)) { + exp_builder.append_value(&long_string); + fixed_builder.append_value(&long_string); + } + exp_builder.flush_in_progress(); + fixed_builder.flush_in_progress(); + + // Every step only add one buffer, but the buffer size is much larger + assert_eq!(exp_builder.completed.len(), i as usize + 1); + assert_eq!( + exp_builder.completed[i as usize].len(), + STARTING_BLOCK_SIZE as usize * 2_usize.pow(i) + ); + + // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1 + assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1); + + // Every buffer is fixed size + assert!(fixed_builder + .completed + .iter() + .all(|b| b.len() == STARTING_BLOCK_SIZE as usize)); + } + + // Add one more value, and the buffer stop growing. + exp_builder.append_value(&long_string); + exp_builder.flush_in_progress(); + assert_eq!( + exp_builder.completed.last().unwrap().capacity(), + MAX_BLOCK_SIZE as usize + ); + } } diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 5f72debcdad2..f6103cb84136 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -5321,7 +5321,7 @@ mod tests { let typed_dict = string_dict_array.downcast_dict::().unwrap(); let string_view_array = { - let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers. + let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers. for v in typed_dict.into_iter() { builder.append_option(v); } @@ -5338,7 +5338,7 @@ mod tests { let typed_binary_dict = binary_dict_array.downcast_dict::().unwrap(); let binary_view_array = { - let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers. + let mut builder = BinaryViewBuilder::new().with_fixed_block_size(8); // multiple buffers. for v in typed_binary_dict.into_iter() { builder.append_option(v); } @@ -5381,7 +5381,7 @@ mod tests { O: OffsetSizeTrait, { let view_array = { - let mut builder = StringViewBuilder::new().with_block_size(8); // multiple buffers. + let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // multiple buffers. 
for s in VIEW_TEST_DATA.iter() { builder.append_option(*s); } @@ -5410,7 +5410,7 @@ mod tests { O: OffsetSizeTrait, { let view_array = { - let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers. + let mut builder = BinaryViewBuilder::new().with_fixed_block_size(8); // multiple buffers. for s in VIEW_TEST_DATA.iter() { builder.append_option(*s); } From 0e99e3a64532665218bcb0d048c4e9961e39a913 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 29 Jul 2024 19:45:11 +0100 Subject: [PATCH 10/17] improve LIKE regex (#6145) --- arrow-string/src/predicate.rs | 83 ++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 01e3710a6d0a..c7ccffb3ada4 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -140,39 +140,54 @@ fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool { /// Transforms a like `pattern` to a regex compatible pattern. To achieve that, it does: /// -/// 1. Replace like wildcards for regex expressions as the pattern will be evaluated using regex match: `%` => `.*` and `_` => `.` -/// 2. Escape regex meta characters to match them and not be evaluated as regex special chars. For example: `.` => `\\.` -/// 3. Replace escaped like wildcards removing the escape characters to be able to match it as a regex. For example: `\\%` => `%` +/// 1. Replace `LIKE` multi-character wildcards `%` => `.*` (unless they're at the start or end of the pattern, +/// where the regex is just truncated - e.g. `%foo%` => `foo` rather than `^.*foo.*$`) +/// 2. Replace `LIKE` single-character wildcards `_` => `.` +/// 3. Escape regex meta characters to match them and not be evaluated as regex special chars. e.g. `.` => `\\.` +/// 4. Replace escaped `LIKE` wildcards removing the escape characters to be able to match it as a regex. e.g. 
`\\%` => `%` fn regex_like(pattern: &str, case_insensitive: bool) -> Result { let mut result = String::with_capacity(pattern.len() * 2); - result.push('^'); let mut chars_iter = pattern.chars().peekable(); + match chars_iter.peek() { + // if the pattern starts with `%`, we avoid starting the regex with a slow but meaningless `^.*` + Some('%') => { + chars_iter.next(); + } + _ => result.push('^'), + }; + while let Some(c) = chars_iter.next() { - if c == '\\' { - let next = chars_iter.peek(); - match next { - Some(next) if is_like_pattern(*next) => { - result.push(*next); - // Skipping the next char as it is already appended - chars_iter.next(); + match c { + '\\' => { + match chars_iter.peek() { + Some(next) if is_like_pattern(*next) => { + result.push(*next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + _ => { + result.push('\\'); + result.push('\\'); + } } - _ => { - result.push('\\'); + } + '%' => result.push_str(".*"), + '_' => result.push('.'), + c => { + if regex_syntax::is_meta_character(c) { result.push('\\'); } + result.push(c); } - } else if regex_syntax::is_meta_character(c) { - result.push('\\'); - result.push(c); - } else if c == '%' { - result.push_str(".*"); - } else if c == '_' { - result.push('.'); - } else { - result.push(c); } } - result.push('$'); + // instead of ending the regex with `.*$` and making it needlessly slow, we just end the regex + if result.ends_with(".*") { + result.pop(); + result.pop(); + } else { + result.push('$'); + } RegexBuilder::new(&result) .case_insensitive(case_insensitive) .dot_matches_new_line(true) @@ -197,9 +212,25 @@ mod tests { use super::*; #[test] - fn test_replace_like_wildcards() { - let a_eq = "_%"; - let expected = "^..*$"; + fn test_replace_start_end_percent() { + let a_eq = "%foobar%"; + let expected = "foobar"; + let r = regex_like(a_eq, false).unwrap(); + assert_eq!(r.to_string(), expected); + } + + #[test] + fn test_replace_middle_percent() { + let a_eq = "foo%bar"; + let expected = "^foo.*bar$"; + let r = regex_like(a_eq, false).unwrap(); + assert_eq!(r.to_string(), expected); + } + + #[test] + fn test_replace_underscore() { + let a_eq = "foo_bar"; + let expected = "^foo.bar$"; let r = regex_like(a_eq, false).unwrap(); assert_eq!(r.to_string(), expected); } From bf9ce475df82d362631099d491d3454d64d50217 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 29 Jul 2024 22:08:30 +0100 Subject: [PATCH 11/17] Improve `LIKE` performance for "contains" style queries (#6128) * improve "contains" performance * add tests * cargo fmt :disappointed: --------- Co-authored-by: Andrew Lamb --- arrow-string/Cargo.toml | 2 +- arrow-string/src/like.rs | 8 ++++++-- arrow-string/src/predicate.rs | 35 +++++++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/arrow-string/Cargo.toml b/arrow-string/Cargo.toml index bdfa681113b4..0757067dc898 100644 --- a/arrow-string/Cargo.toml +++ b/arrow-string/Cargo.toml @@ -42,4 +42,4 @@ arrow-select = { workspace = true } regex = { version = "1.7.0", default-features = false, features = ["std", "unicode", "perf"] } regex-syntax = { version = "0.8.0", default-features = false, features = ["unicode"] } num = { version = "0.4", default-features = false, features = ["std"] } -memchr = "2.7.1" +memchr = "2.7.4" diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index 49831092ffcd..d23040b592da 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -239,7 +239,7 @@ fn op_scalar<'a, T: StringArrayType<'a>>( let 
r = match op { Op::Like(neg) => Predicate::like(r)?.evaluate_array(l, neg), Op::ILike(neg) => Predicate::ilike(r, l.is_ascii())?.evaluate_array(l, neg), - Op::Contains => Predicate::Contains(r).evaluate_array(l, false), + Op::Contains => Predicate::contains(r).evaluate_array(l, false), Op::StartsWith => Predicate::StartsWith(r).evaluate_array(l, false), Op::EndsWith => Predicate::EndsWith(r).evaluate_array(l, false), }; @@ -273,12 +273,16 @@ fn op_binary<'a>( match op { Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like), Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)), - Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(l?.contains(r?))).collect()), + Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(l?, r?))).collect()), Op::StartsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.starts_with(r?))).collect()), Op::EndsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.ends_with(r?))).collect()), } } +fn str_contains(haystack: &str, needle: &str) -> bool { + memchr::memmem::find(haystack.as_bytes(), needle.as_bytes()).is_some() +} + fn binary_predicate<'a>( l: impl Iterator>, r: impl Iterator>, diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index c7ccffb3ada4..0769da3fa0da 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -18,12 +18,13 @@ use arrow_array::{ArrayAccessor, BooleanArray}; use arrow_schema::ArrowError; use memchr::memchr2; +use memchr::memmem::Finder; use regex::{Regex, RegexBuilder}; /// A string based predicate pub enum Predicate<'a> { Eq(&'a str), - Contains(&'a str), + Contains(Finder<'a>), StartsWith(&'a str), EndsWith(&'a str), @@ -54,12 +55,16 @@ impl<'a> Predicate<'a> { && !pattern.ends_with("\\%") && !contains_like_pattern(&pattern[1..pattern.len() - 1]) { - Ok(Self::Contains(&pattern[1..pattern.len() - 1])) + Ok(Self::contains(&pattern[1..pattern.len() - 1])) } else { Ok(Self::Regex(regex_like(pattern, false)?)) } } + pub fn contains(needle: &'a str) -> Self { + Self::Contains(Finder::new(needle.as_bytes())) + } + /// Create a predicate for the given ilike pattern pub fn ilike(pattern: &'a str, is_ascii: bool) -> Result { if is_ascii && pattern.is_ascii() { @@ -82,7 +87,7 @@ impl<'a> Predicate<'a> { match self { Predicate::Eq(v) => *v == haystack, Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), - Predicate::Contains(v) => haystack.contains(v), + Predicate::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), Predicate::StartsWith(v) => haystack.starts_with(v), Predicate::IStartsWithAscii(v) => starts_with_ignore_ascii_case(haystack, v), Predicate::EndsWith(v) => haystack.ends_with(v), @@ -106,9 +111,9 @@ impl<'a> Predicate<'a> { Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { haystack.eq_ignore_ascii_case(v) != negate }), - Predicate::Contains(v) => { - BooleanArray::from_unary(array, |haystack| haystack.contains(v) != negate) - } + Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }), Predicate::StartsWith(v) => { BooleanArray::from_unary(array, |haystack| haystack.starts_with(v) != negate) } @@ -258,4 +263,22 @@ mod tests { let r = regex_like(a_eq, false).unwrap(); assert_eq!(r.to_string(), expected); } + #[test] + fn test_contains() { + assert!(Predicate::contains("hay").evaluate("haystack")); + assert!(Predicate::contains("haystack").evaluate("haystack")); + assert!(Predicate::contains("h").evaluate("haystack")); + 
assert!(Predicate::contains("k").evaluate("haystack")); + assert!(Predicate::contains("stack").evaluate("haystack")); + assert!(Predicate::contains("sta").evaluate("haystack")); + assert!(Predicate::contains("stack").evaluate("hay£stack")); + assert!(Predicate::contains("y£s").evaluate("hay£stack")); + assert!(Predicate::contains("£").evaluate("hay£stack")); + assert!(Predicate::contains("a").evaluate("a")); + // not matching + assert!(!Predicate::contains("hy").evaluate("haystack")); + assert!(!Predicate::contains("stackx").evaluate("haystack")); + assert!(!Predicate::contains("x").evaluate("haystack")); + assert!(!Predicate::contains("haystack haystack").evaluate("haystack")); + } } From bf0ea9129e617e4a3cf915a900b747cc5485315f Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Tue, 30 Jul 2024 20:49:44 +0100 Subject: [PATCH 12/17] improvements to `(i)starts_with` and `(i)ends_with` performance (#6118) * improvements to "starts_with" and "ends_with" * add tests and refactor slightly * add comments --- arrow-string/src/like.rs | 10 ++- arrow-string/src/predicate.rs | 139 +++++++++++++++++++++++++++++----- 2 files changed, 129 insertions(+), 20 deletions(-) diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index d23040b592da..e878e241853a 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -274,8 +274,14 @@ fn op_binary<'a>( Op::Like(neg) => binary_predicate(l, r, neg, Predicate::like), Op::ILike(neg) => binary_predicate(l, r, neg, |s| Predicate::ilike(s, false)), Op::Contains => Ok(l.zip(r).map(|(l, r)| Some(str_contains(l?, r?))).collect()), - Op::StartsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.starts_with(r?))).collect()), - Op::EndsWith => Ok(l.zip(r).map(|(l, r)| Some(l?.ends_with(r?))).collect()), + Op::StartsWith => Ok(l + .zip(r) + .map(|(l, r)| Some(Predicate::StartsWith(r?).evaluate(l?))) + .collect()), + Op::EndsWith => Ok(l + .zip(r) + .map(|(l, r)| Some(Predicate::EndsWith(r?).evaluate(l?))) + .collect()), } } diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 0769da3fa0da..ec0c4827830c 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -20,6 +20,7 @@ use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; use regex::{Regex, RegexBuilder}; +use std::iter::zip; /// A string based predicate pub enum Predicate<'a> { @@ -88,10 +89,12 @@ impl<'a> Predicate<'a> { Predicate::Eq(v) => *v == haystack, Predicate::IEqAscii(v) => haystack.eq_ignore_ascii_case(v), Predicate::Contains(finder) => finder.find(haystack.as_bytes()).is_some(), - Predicate::StartsWith(v) => haystack.starts_with(v), - Predicate::IStartsWithAscii(v) => starts_with_ignore_ascii_case(haystack, v), - Predicate::EndsWith(v) => haystack.ends_with(v), - Predicate::IEndsWithAscii(v) => ends_with_ignore_ascii_case(haystack, v), + Predicate::StartsWith(v) => starts_with(haystack, v, equals_kernel), + Predicate::IStartsWithAscii(v) => { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) + } + Predicate::EndsWith(v) => ends_with(haystack, v, equals_kernel), + Predicate::IEndsWithAscii(v) => ends_with(haystack, v, equals_ignore_ascii_case_kernel), Predicate::Regex(v) => v.is_match(haystack), } } @@ -114,17 +117,17 @@ impl<'a> Predicate<'a> { Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { finder.find(haystack.as_bytes()).is_some() != negate }), - Predicate::StartsWith(v) => { - BooleanArray::from_unary(array, |haystack| haystack.starts_with(v) != negate) - } + 
Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_kernel) != negate + }), Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { - starts_with_ignore_ascii_case(haystack, v) != negate + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }), + Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_kernel) != negate }), - Predicate::EndsWith(v) => { - BooleanArray::from_unary(array, |haystack| haystack.ends_with(v) != negate) - } Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { - ends_with_ignore_ascii_case(haystack, v) != negate + ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate }), Predicate::Regex(v) => { BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) @@ -133,14 +136,36 @@ impl<'a> Predicate<'a> { } } -fn starts_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool { - let end = haystack.len().min(needle.len()); - haystack.is_char_boundary(end) && needle.eq_ignore_ascii_case(&haystack[..end]) +/// This is faster than `str::starts_with` for small strings. +/// See for more details. +fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { + if needle.len() > haystack.len() { + false + } else { + zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) + } } -fn ends_with_ignore_ascii_case(haystack: &str, needle: &str) -> bool { - let start = haystack.len().saturating_sub(needle.len()); - haystack.is_char_boundary(start) && needle.eq_ignore_ascii_case(&haystack[start..]) +/// This is faster than `str::ends_with` for small strings. +/// See for more details. +fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { + if needle.len() > haystack.len() { + false + } else { + zip( + haystack.as_bytes().iter().rev(), + needle.as_bytes().iter().rev(), + ) + .all(byte_eq_kernel) + } +} + +fn equals_kernel((n, h): (&u8, &u8)) -> bool { + n == h +} + +fn equals_ignore_ascii_case_kernel((n, h): (&u8, &u8)) -> bool { + n.to_ascii_lowercase() == h.to_ascii_lowercase() } /// Transforms a like `pattern` to a regex compatible pattern. 
To achieve that, it does: @@ -263,6 +288,7 @@ mod tests { let r = regex_like(a_eq, false).unwrap(); assert_eq!(r.to_string(), expected); } + #[test] fn test_contains() { assert!(Predicate::contains("hay").evaluate("haystack")); @@ -281,4 +307,81 @@ mod tests { assert!(!Predicate::contains("x").evaluate("haystack")); assert!(!Predicate::contains("haystack haystack").evaluate("haystack")); } + + #[test] + fn test_starts_with() { + assert!(Predicate::StartsWith("hay").evaluate("haystack")); + assert!(Predicate::StartsWith("h£ay").evaluate("h£aystack")); + assert!(Predicate::StartsWith("haystack").evaluate("haystack")); + assert!(Predicate::StartsWith("ha").evaluate("haystack")); + assert!(Predicate::StartsWith("h").evaluate("haystack")); + assert!(Predicate::StartsWith("").evaluate("haystack")); + + assert!(!Predicate::StartsWith("stack").evaluate("haystack")); + assert!(!Predicate::StartsWith("haystacks").evaluate("haystack")); + assert!(!Predicate::StartsWith("HAY").evaluate("haystack")); + assert!(!Predicate::StartsWith("h£ay").evaluate("haystack")); + assert!(!Predicate::StartsWith("hay").evaluate("h£aystack")); + } + + #[test] + fn test_ends_with() { + assert!(Predicate::EndsWith("stack").evaluate("haystack")); + assert!(Predicate::EndsWith("st£ack").evaluate("hayst£ack")); + assert!(Predicate::EndsWith("haystack").evaluate("haystack")); + assert!(Predicate::EndsWith("ck").evaluate("haystack")); + assert!(Predicate::EndsWith("k").evaluate("haystack")); + assert!(Predicate::EndsWith("").evaluate("haystack")); + + assert!(!Predicate::EndsWith("hay").evaluate("haystack")); + assert!(!Predicate::EndsWith("STACK").evaluate("haystack")); + assert!(!Predicate::EndsWith("haystacks").evaluate("haystack")); + assert!(!Predicate::EndsWith("xhaystack").evaluate("haystack")); + assert!(!Predicate::EndsWith("st£ack").evaluate("haystack")); + assert!(!Predicate::EndsWith("stack").evaluate("hayst£ack")); + } + + #[test] + fn test_istarts_with() { + assert!(Predicate::IStartsWithAscii("hay").evaluate("haystack")); + assert!(Predicate::IStartsWithAscii("hay").evaluate("HAYSTACK")); + assert!(Predicate::IStartsWithAscii("HAY").evaluate("haystack")); + assert!(Predicate::IStartsWithAscii("HaY").evaluate("haystack")); + assert!(Predicate::IStartsWithAscii("hay").evaluate("HaYsTaCk")); + assert!(Predicate::IStartsWithAscii("HAY").evaluate("HaYsTaCk")); + assert!(Predicate::IStartsWithAscii("haystack").evaluate("HaYsTaCk")); + assert!(Predicate::IStartsWithAscii("HaYsTaCk").evaluate("HaYsTaCk")); + assert!(Predicate::IStartsWithAscii("").evaluate("HaYsTaCk")); + + assert!(!Predicate::IStartsWithAscii("stack").evaluate("haystack")); + assert!(!Predicate::IStartsWithAscii("haystacks").evaluate("haystack")); + assert!(!Predicate::IStartsWithAscii("h.ay").evaluate("haystack")); + assert!(!Predicate::IStartsWithAscii("hay").evaluate("h£aystack")); + } + + #[test] + fn test_iends_with() { + assert!(Predicate::IEndsWithAscii("stack").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("STACK").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("StAcK").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("stack").evaluate("HAYSTACK")); + assert!(Predicate::IEndsWithAscii("STACK").evaluate("HAYSTACK")); + assert!(Predicate::IEndsWithAscii("StAcK").evaluate("HAYSTACK")); + assert!(Predicate::IEndsWithAscii("stack").evaluate("HAYsTaCk")); + assert!(Predicate::IEndsWithAscii("STACK").evaluate("HAYsTaCk")); + assert!(Predicate::IEndsWithAscii("StAcK").evaluate("HAYsTaCk")); + 
assert!(Predicate::IEndsWithAscii("haystack").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("HAYSTACK").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("haystack").evaluate("HAYSTACK")); + assert!(Predicate::IEndsWithAscii("ck").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("cK").evaluate("haystack")); + assert!(Predicate::IEndsWithAscii("ck").evaluate("haystacK")); + assert!(Predicate::IEndsWithAscii("").evaluate("haystack")); + + assert!(!Predicate::IEndsWithAscii("hay").evaluate("haystack")); + assert!(!Predicate::IEndsWithAscii("stac").evaluate("HAYSTACK")); + assert!(!Predicate::IEndsWithAscii("haystacks").evaluate("haystack")); + assert!(!Predicate::IEndsWithAscii("stack").evaluate("haystac£k")); + assert!(!Predicate::IEndsWithAscii("xhaystack").evaluate("haystack")); + } } From 6e893b5d41f6fcdbffd0353abaf5868754d7d0ab Mon Sep 17 00:00:00 2001 From: pn <13125187405@163.com> Date: Wed, 31 Jul 2024 03:51:48 +0800 Subject: [PATCH 13/17] Add `BooleanArray::new_from_packed` and `BooleanArray::new_from_u8` (#6127) * Support construct BooleanArray from &[u8] * fix doc * add new_from_packed and new_from_u8; delete impl From<&[u8]> for BooleanArray and BooleanBuffer --- arrow-array/src/array/boolean_array.rs | 59 +++++++++++++++++++++++++- arrow-buffer/src/buffer/boolean.rs | 11 +++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index fe374d965714..2bf8129fd007 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -19,7 +19,7 @@ use crate::array::print_long_array; use crate::builder::BooleanBuilder; use crate::iterator::BooleanIter; use crate::{Array, ArrayAccessor, ArrayRef, Scalar}; -use arrow_buffer::{bit_util, BooleanBuffer, MutableBuffer, NullBuffer}; +use arrow_buffer::{bit_util, BooleanBuffer, Buffer, MutableBuffer, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::DataType; use std::any::Any; @@ -110,6 +110,24 @@ impl BooleanArray { Scalar::new(Self::new(values, None)) } + /// Create a new [`BooleanArray`] from a [`Buffer`] specified by `offset` and `len`, the `offset` and `len` in bits + /// Logically convert each bit in [`Buffer`] to boolean and use it to build [`BooleanArray`]. + /// using this method will make the following points self-evident: + /// * there is no `null` in the constructed [`BooleanArray`]; + /// * without considering `buffer.into()`, this method is efficient because there is no need to perform pack and unpack operations on boolean; + pub fn new_from_packed(buffer: impl Into, offset: usize, len: usize) -> Self { + BooleanBuffer::new(buffer.into(), offset, len).into() + } + + /// Create a new [`BooleanArray`] from `&[u8]` + /// This method uses `new_from_packed` and constructs a [`Buffer`] using `value`, and offset is set to 0 and len is set to `value.len() * 8` + /// using this method will make the following points self-evident: + /// * there is no `null` in the constructed [`BooleanArray`]; + /// * the length of the constructed [`BooleanArray`] is always a multiple of 8; + pub fn new_from_u8(value: &[u8]) -> Self { + BooleanBuffer::new(Buffer::from(value), 0, value.len() * 8).into() + } + /// Returns the length of this array. 
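// A brief usage sketch (not part of the patch) of the two constructors added
// above, assuming `arrow_array` and its `Array` trait are in scope; the input
// bytes mirror the tests below, and bits are read least-significant-bit first
// within each byte.
use arrow_array::{Array, BooleanArray};

fn main() {
    // Bytes [1, 2] are 0b0000_0001 and 0b0000_0010: 16 bits, true at positions 0 and 9.
    let packed = BooleanArray::new_from_packed([1_u8, 2_u8], 0, 16);
    assert_eq!(packed.len(), 16);
    assert_eq!(packed.null_count(), 0);
    assert!(packed.value(0));
    assert!(packed.value(9));
    assert!(!packed.value(1));

    // `new_from_u8` always yields a multiple of 8 values and no nulls.
    let unpacked = BooleanArray::new_from_u8(&[0xFF]);
    assert_eq!(unpacked.len(), 8);
    assert!(unpacked.value(7));
}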
pub fn len(&self) -> usize { self.values.len() @@ -509,6 +527,45 @@ mod tests { } } + #[test] + fn test_boolean_array_from_packed() { + let v = [1_u8, 2_u8, 3_u8]; + let arr = BooleanArray::new_from_packed(v, 0, 24); + assert_eq!(24, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + assert!(arr.nulls.is_none()); + for i in 0..24 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!( + i == 0 || i == 9 || i == 16 || i == 17, + arr.value(i), + "failed t {i}" + ) + } + } + + #[test] + fn test_boolean_array_from_slice_u8() { + let v: Vec = vec![1, 2, 3]; + let slice = &v[..]; + let arr = BooleanArray::new_from_u8(slice); + assert_eq!(24, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(0, arr.null_count()); + assert!(arr.nulls().is_none()); + for i in 0..24 { + assert!(!arr.is_null(i)); + assert!(arr.is_valid(i)); + assert_eq!( + i == 0 || i == 9 || i == 16 || i == 17, + arr.value(i), + "failed t {i}" + ) + } + } + #[test] fn test_boolean_array_from_iter() { let v = vec![Some(false), Some(true), Some(false), Some(true)]; diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 1589cc5b102b..49a75b468dbe 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -21,6 +21,7 @@ use crate::{ bit_util, buffer_bin_and, buffer_bin_or, buffer_bin_xor, buffer_unary_not, BooleanBufferBuilder, Buffer, MutableBuffer, }; + use std::ops::{BitAnd, BitOr, BitXor, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans @@ -414,4 +415,14 @@ mod tests { let expected = BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 255]), offset, len); assert_eq!(!boolean_buf, expected); } + + #[test] + fn test_boolean_from_slice_bool() { + let v = [true, false, false]; + let buf = BooleanBuffer::from(&v[..]); + assert_eq!(buf.offset(), 0); + assert_eq!(buf.len(), 3); + assert_eq!(buf.values().len(), 1); + assert!(buf.value(0)); + } } From 2905ce6796cad396241fc50164970dbf1237440a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 31 Jul 2024 12:39:36 -0400 Subject: [PATCH 14/17] Update object store MSRV to `1.64` (#6123) * Update MSRV to 1.64 * Revert "clippy ignore" This reverts commit 7a4b760bfb2a63c7778b20a4710c2828224f9565. --- object_store/Cargo.toml | 2 +- object_store/src/client/mock_server.rs | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 4b11661bfb2a..4e845e5ca2d0 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -24,7 +24,7 @@ readme = "README.md" description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = ["object", "storage", "cloud"] repository = "https://github.com/apache/arrow-rs/tree/master/object_store" -rust-version = "1.62.1" +rust-version = "1.64.0" [package.metadata.docs.rs] all-features = true diff --git a/object_store/src/client/mock_server.rs b/object_store/src/client/mock_server.rs index 0f8e8bf6466f..aa5a9e0ab4dd 100644 --- a/object_store/src/client/mock_server.rs +++ b/object_store/src/client/mock_server.rs @@ -60,8 +60,6 @@ impl MockServer { let mut set = JoinSet::new(); loop { - // https://github.com/apache/arrow-rs/issues/6122 - #[allow(clippy::incompatible_msrv)] let (stream, _) = tokio::select! 
{ conn = listener.accept() => conn.unwrap(), _ = &mut rx => break, From 4d1651cb95dfbe7daba1406b471182e6753d0dde Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 31 Jul 2024 12:47:39 -0700 Subject: [PATCH 15/17] separate tests that require arrow into a separate module --- parquet/src/file/writer.rs | 455 +++++++++++++++++++------------------ 1 file changed, 234 insertions(+), 221 deletions(-) diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 3eb2f8882708..45bca7eb5e82 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -1015,12 +1015,9 @@ impl<'a, W: Write> ParquetMetadataWriter<'a, W> { mod tests { use super::*; - use arrow_array::{ArrayRef, Int32Array, RecordBatch}; - use arrow_schema::{DataType as ArrowDataType, Field, Schema}; - use bytes::{BufMut, Bytes, BytesMut}; + use bytes::Bytes; use std::fs::File; - use crate::arrow::ArrowWriter; use crate::basic::{ ColumnOrder, Compression, ConvertedType, Encoding, LogicalType, Repetition, SortOrder, Type, }; @@ -1028,7 +1025,6 @@ mod tests { use crate::column::reader::get_typed_column_reader; use crate::compression::{create_codec, Codec, CodecOptionsBuilder}; use crate::data_type::{BoolType, ByteArrayType, Int32Type}; - use crate::file::footer::parse_metadata; use crate::file::page_index::index::Index; use crate::file::properties::EnabledStatistics; use crate::file::serialized_reader::ReadOptionsBuilder; @@ -2054,222 +2050,6 @@ mod tests { assert!(matches!(b_idx, Index::NONE), "{b_idx:?}"); } - struct TestMetadata { - #[allow(dead_code)] - file_size: usize, - metadata: ParquetMetaData, - } - - fn get_test_metadata(write_page_index: bool, read_page_index: bool) -> TestMetadata { - let mut buf = BytesMut::new().writer(); - let schema: Arc = Arc::new(Schema::new(vec![Field::new( - "a", - ArrowDataType::Int32, - true, - )])); - - let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(2)])); - - let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); - - let writer_props = match write_page_index { - true => WriterProperties::builder() - .set_statistics_enabled(EnabledStatistics::Page) - .build(), - false => WriterProperties::builder() - .set_statistics_enabled(EnabledStatistics::Chunk) - .build(), - }; - - let mut writer = ArrowWriter::try_new(&mut buf, schema, Some(writer_props)).unwrap(); - writer.write(&batch).unwrap(); - writer.close().unwrap(); - - let data = buf.into_inner().freeze(); - - let reader_opts = match read_page_index { - true => ReadOptionsBuilder::new().with_page_index().build(), - false => ReadOptionsBuilder::new().build(), - }; - let reader = SerializedFileReader::new_with_options(data.clone(), reader_opts).unwrap(); - let metadata = reader.metadata().clone(); - TestMetadata { - file_size: data.len(), - metadata, - } - } - - fn has_page_index(metadata: &ParquetMetaData) -> bool { - match metadata.column_index() { - Some(column_index) => column_index - .iter() - .any(|rg_idx| rg_idx.iter().all(|col_idx| !matches!(col_idx, Index::NONE))), - None => false, - } - } - - #[test] - fn test_roundtrip_parquet_metadata_without_page_index() { - // We currently don't have an ad-hoc ParquetMetadata loader that can load page indexes so - // we at least test round trip without them - let metadata = get_test_metadata(false, false); - assert!(!has_page_index(&metadata.metadata)); - - let mut buf = BytesMut::new().writer(); - { - let mut writer = ParquetMetadataWriter::new(&mut buf, &metadata.metadata); - writer.finish().unwrap(); - } - - let data = 
buf.into_inner().freeze(); - - let decoded_metadata = parse_metadata(&data).unwrap(); - assert!(!has_page_index(&metadata.metadata)); - - assert_eq!(metadata.metadata, decoded_metadata); - } - - /// Temporary function so we can test loading metadata with page indexes - /// while we haven't fully figured out how to load it cleanly - #[cfg(feature = "async")] - async fn load_metadata_from_bytes(file_size: usize, data: Bytes) -> ParquetMetaData { - use crate::arrow::async_reader::{MetadataFetch, MetadataLoader}; - use crate::errors::Result as ParquetResult; - use bytes::Bytes; - use futures::future::BoxFuture; - use futures::FutureExt; - use std::ops::Range; - - /// Adapt a `Bytes` to a `MetadataFetch` implementation. - struct AsyncBytes { - data: Bytes, - } - - impl AsyncBytes { - fn new(data: Bytes) -> Self { - Self { data } - } - } - - impl MetadataFetch for AsyncBytes { - fn fetch(&mut self, range: Range) -> BoxFuture<'_, ParquetResult> { - async move { Ok(self.data.slice(range.start..range.end)) }.boxed() - } - } - - /// A `MetadataFetch` implementation that reads from a subset of the full data - /// while accepting ranges that address the full data. - struct MaskedBytes { - inner: Box, - inner_range: Range, - } - - impl MaskedBytes { - fn new(inner: Box, inner_range: Range) -> Self { - Self { inner, inner_range } - } - } - - impl MetadataFetch for &mut MaskedBytes { - fn fetch(&mut self, range: Range) -> BoxFuture<'_, ParquetResult> { - let inner_range = self.inner_range.clone(); - println!("inner_range: {:?}", inner_range); - println!("range: {:?}", range); - assert!(inner_range.start <= range.start && inner_range.end >= range.end); - let range = - range.start - self.inner_range.start..range.end - self.inner_range.start; - self.inner.fetch(range) - } - } - - let metadata_length = data.len(); - let mut reader = MaskedBytes::new( - Box::new(AsyncBytes::new(data)), - file_size - metadata_length..file_size, - ); - let metadata = MetadataLoader::load(&mut reader, file_size, None) - .await - .unwrap(); - let loaded_metadata = metadata.finish(); - let mut metadata = MetadataLoader::new(&mut reader, loaded_metadata); - metadata.load_page_index(true, true).await.unwrap(); - metadata.finish() - } - - fn check_columns_are_equivalent(left: &ColumnChunkMetaData, right: &ColumnChunkMetaData) { - assert_eq!(left.column_descr(), right.column_descr()); - assert_eq!(left.encodings(), right.encodings()); - assert_eq!(left.num_values(), right.num_values()); - assert_eq!(left.compressed_size(), right.compressed_size()); - assert_eq!(left.data_page_offset(), right.data_page_offset()); - assert_eq!(left.statistics(), right.statistics()); - assert_eq!(left.offset_index_length(), right.offset_index_length()); - assert_eq!(left.column_index_length(), right.column_index_length()); - assert_eq!( - left.unencoded_byte_array_data_bytes(), - right.unencoded_byte_array_data_bytes() - ); - } - - fn check_row_groups_are_equivalent(left: &RowGroupMetaData, right: &RowGroupMetaData) { - assert_eq!(left.num_rows(), right.num_rows()); - assert_eq!(left.file_offset(), right.file_offset()); - assert_eq!(left.total_byte_size(), right.total_byte_size()); - assert_eq!(left.schema_descr(), right.schema_descr()); - assert_eq!(left.num_columns(), right.num_columns()); - left.columns() - .iter() - .zip(right.columns().iter()) - .for_each(|(lc, rc)| { - check_columns_are_equivalent(lc, rc); - }); - } - - #[tokio::test] - #[cfg(feature = "async")] - async fn test_encode_parquet_metadata_with_page_index() { - // Create a 
ParquetMetadata with page index information - let metadata = get_test_metadata(true, true); - assert!(has_page_index(&metadata.metadata)); - - let mut buf = BytesMut::new().writer(); - { - let mut writer = ParquetMetadataWriter::new(&mut buf, &metadata.metadata); - writer.finish().unwrap(); - } - - let data = buf.into_inner().freeze(); - - let decoded_metadata = load_metadata_from_bytes(data.len(), data).await; - - // Because the page index offsets will differ, compare invariant parts of the metadata - assert_eq!( - metadata.metadata.file_metadata(), - decoded_metadata.file_metadata() - ); - assert_eq!( - metadata.metadata.column_index(), - decoded_metadata.column_index() - ); - assert_eq!( - metadata.metadata.offset_index(), - decoded_metadata.offset_index() - ); - assert_eq!( - metadata.metadata.num_row_groups(), - decoded_metadata.num_row_groups() - ); - - metadata - .metadata - .row_groups() - .iter() - .zip(decoded_metadata.row_groups().iter()) - .for_each(|(left, right)| { - check_row_groups_are_equivalent(left, right); - }); - } - #[test] fn test_byte_array_size_statistics() { let message_type = " @@ -2348,4 +2128,237 @@ mod tests { assert_eq!(page_sizes.len(), 1); assert_eq!(page_sizes[0], unenc_size); } + + #[cfg(feature = "async")] + mod async_tests { + use std::sync::Arc; + + use crate::file::footer::parse_metadata; + use crate::file::properties::{EnabledStatistics, WriterProperties}; + use crate::file::reader::{FileReader, SerializedFileReader}; + use crate::file::writer::ParquetMetadataWriter; + use crate::{ + arrow::ArrowWriter, + file::{page_index::index::Index, serialized_reader::ReadOptionsBuilder}, + }; + use arrow_array::{ArrayRef, Int32Array, RecordBatch}; + use arrow_schema::{DataType as ArrowDataType, Field, Schema}; + use bytes::{BufMut, Bytes, BytesMut}; + + use super::{ColumnChunkMetaData, ParquetMetaData, RowGroupMetaData}; + + struct TestMetadata { + #[allow(dead_code)] + file_size: usize, + metadata: ParquetMetaData, + } + + fn has_page_index(metadata: &ParquetMetaData) -> bool { + match metadata.column_index() { + Some(column_index) => column_index + .iter() + .any(|rg_idx| rg_idx.iter().all(|col_idx| !matches!(col_idx, Index::NONE))), + None => false, + } + } + + #[test] + fn test_roundtrip_parquet_metadata_without_page_index() { + // We currently don't have an ad-hoc ParquetMetadata loader that can load page indexes so + // we at least test round trip without them + let metadata = get_test_metadata(false, false); + assert!(!has_page_index(&metadata.metadata)); + + let mut buf = BytesMut::new().writer(); + { + let mut writer = ParquetMetadataWriter::new(&mut buf, &metadata.metadata); + writer.finish().unwrap(); + } + + let data = buf.into_inner().freeze(); + + let decoded_metadata = parse_metadata(&data).unwrap(); + assert!(!has_page_index(&metadata.metadata)); + + assert_eq!(metadata.metadata, decoded_metadata); + } + + fn get_test_metadata(write_page_index: bool, read_page_index: bool) -> TestMetadata { + let mut buf = BytesMut::new().writer(); + let schema: Arc = Arc::new(Schema::new(vec![Field::new( + "a", + ArrowDataType::Int32, + true, + )])); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(2)])); + + let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); + + let writer_props = match write_page_index { + true => WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(), + false => WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) + .build(), + }; + + let mut 
writer = ArrowWriter::try_new(&mut buf, schema, Some(writer_props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let data = buf.into_inner().freeze(); + + let reader_opts = match read_page_index { + true => ReadOptionsBuilder::new().with_page_index().build(), + false => ReadOptionsBuilder::new().build(), + }; + let reader = SerializedFileReader::new_with_options(data.clone(), reader_opts).unwrap(); + let metadata = reader.metadata().clone(); + TestMetadata { + file_size: data.len(), + metadata, + } + } + + /// Temporary function so we can test loading metadata with page indexes + /// while we haven't fully figured out how to load it cleanly + async fn load_metadata_from_bytes(file_size: usize, data: Bytes) -> ParquetMetaData { + use crate::arrow::async_reader::{MetadataFetch, MetadataLoader}; + use crate::errors::Result as ParquetResult; + use bytes::Bytes; + use futures::future::BoxFuture; + use futures::FutureExt; + use std::ops::Range; + + /// Adapt a `Bytes` to a `MetadataFetch` implementation. + struct AsyncBytes { + data: Bytes, + } + + impl AsyncBytes { + fn new(data: Bytes) -> Self { + Self { data } + } + } + + impl MetadataFetch for AsyncBytes { + fn fetch(&mut self, range: Range) -> BoxFuture<'_, ParquetResult> { + async move { Ok(self.data.slice(range.start..range.end)) }.boxed() + } + } + + /// A `MetadataFetch` implementation that reads from a subset of the full data + /// while accepting ranges that address the full data. + struct MaskedBytes { + inner: Box, + inner_range: Range, + } + + impl MaskedBytes { + fn new(inner: Box, inner_range: Range) -> Self { + Self { inner, inner_range } + } + } + + impl MetadataFetch for &mut MaskedBytes { + fn fetch(&mut self, range: Range) -> BoxFuture<'_, ParquetResult> { + let inner_range = self.inner_range.clone(); + println!("inner_range: {:?}", inner_range); + println!("range: {:?}", range); + assert!(inner_range.start <= range.start && inner_range.end >= range.end); + let range = + range.start - self.inner_range.start..range.end - self.inner_range.start; + self.inner.fetch(range) + } + } + + let metadata_length = data.len(); + let mut reader = MaskedBytes::new( + Box::new(AsyncBytes::new(data)), + file_size - metadata_length..file_size, + ); + let metadata = MetadataLoader::load(&mut reader, file_size, None) + .await + .unwrap(); + let loaded_metadata = metadata.finish(); + let mut metadata = MetadataLoader::new(&mut reader, loaded_metadata); + metadata.load_page_index(true, true).await.unwrap(); + metadata.finish() + } + + fn check_columns_are_equivalent(left: &ColumnChunkMetaData, right: &ColumnChunkMetaData) { + assert_eq!(left.column_descr(), right.column_descr()); + assert_eq!(left.encodings(), right.encodings()); + assert_eq!(left.num_values(), right.num_values()); + assert_eq!(left.compressed_size(), right.compressed_size()); + assert_eq!(left.data_page_offset(), right.data_page_offset()); + assert_eq!(left.statistics(), right.statistics()); + assert_eq!(left.offset_index_length(), right.offset_index_length()); + assert_eq!(left.column_index_length(), right.column_index_length()); + assert_eq!( + left.unencoded_byte_array_data_bytes(), + right.unencoded_byte_array_data_bytes() + ); + } + + fn check_row_groups_are_equivalent(left: &RowGroupMetaData, right: &RowGroupMetaData) { + assert_eq!(left.num_rows(), right.num_rows()); + assert_eq!(left.file_offset(), right.file_offset()); + assert_eq!(left.total_byte_size(), right.total_byte_size()); + assert_eq!(left.schema_descr(), right.schema_descr()); + 
assert_eq!(left.num_columns(), right.num_columns()); + left.columns() + .iter() + .zip(right.columns().iter()) + .for_each(|(lc, rc)| { + check_columns_are_equivalent(lc, rc); + }); + } + + #[tokio::test] + async fn test_encode_parquet_metadata_with_page_index() { + // Create a ParquetMetadata with page index information + let metadata = get_test_metadata(true, true); + assert!(has_page_index(&metadata.metadata)); + + let mut buf = BytesMut::new().writer(); + { + let mut writer = ParquetMetadataWriter::new(&mut buf, &metadata.metadata); + writer.finish().unwrap(); + } + + let data = buf.into_inner().freeze(); + + let decoded_metadata = load_metadata_from_bytes(data.len(), data).await; + + // Because the page index offsets will differ, compare invariant parts of the metadata + assert_eq!( + metadata.metadata.file_metadata(), + decoded_metadata.file_metadata() + ); + assert_eq!( + metadata.metadata.column_index(), + decoded_metadata.column_index() + ); + assert_eq!( + metadata.metadata.offset_index(), + decoded_metadata.offset_index() + ); + assert_eq!( + metadata.metadata.num_row_groups(), + decoded_metadata.num_row_groups() + ); + + metadata + .metadata + .row_groups() + .iter() + .zip(decoded_metadata.row_groups().iter()) + .for_each(|(left, right)| { + check_row_groups_are_equivalent(left, right); + }); + } + } } From c14ade2b917889a55d4b098b4a48b36ddb2c48c7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 1 Aug 2024 07:05:10 -0400 Subject: [PATCH 16/17] Upgrade protobuf definitions to flightsql 17.0 (#6133) (#6169) * Update FlightSql.proto to version 17.0 Adds new message CommandStatementIngest and removes `experimental` from other messages. * Regenerate flight sql protocol This upgrades the file to version 17.0 of the protobuf definition. Co-authored-by: Douglas Anderson --- .../src/sql/arrow.flight.protocol.sql.rs | 177 +++++++++++++++++- format/FlightSql.proto | 112 +++++++---- 2 files changed, 252 insertions(+), 37 deletions(-) diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index 5e6f198df75c..3eeed6ff4b12 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -798,9 +798,157 @@ pub struct CommandPreparedStatementUpdate { pub prepared_statement_handle: ::prost::bytes::Bytes, } /// -/// Returned from the RPC call DoPut when a CommandStatementUpdate -/// CommandPreparedStatementUpdate was in the request, containing -/// results from the update. +/// Represents a bulk ingestion request. Used in the command member of FlightDescriptor +/// for the the RPC call DoPut to cause the server load the contents of the stream's +/// FlightData into the target destination. +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct CommandStatementIngest { + /// The behavior for handling the table definition. + #[prost(message, optional, tag = "1")] + pub table_definition_options: ::core::option::Option< + command_statement_ingest::TableDefinitionOptions, + >, + /// The table to load data into. + #[prost(string, tag = "2")] + pub table: ::prost::alloc::string::String, + /// The db_schema of the destination table to load data into. If unset, a backend-specific default may be used. + #[prost(string, optional, tag = "3")] + pub schema: ::core::option::Option<::prost::alloc::string::String>, + /// The catalog of the destination table to load data into. If unset, a backend-specific default may be used. 
+ #[prost(string, optional, tag = "4")] + pub catalog: ::core::option::Option<::prost::alloc::string::String>, + /// + /// Store ingested data in a temporary table. + /// The effect of setting temporary is to place the table in a backend-defined namespace, and to drop the table at the end of the session. + /// The namespacing may make use of a backend-specific schema and/or catalog. + /// The server should return an error if an explicit choice of schema or catalog is incompatible with the server's namespacing decision. + #[prost(bool, tag = "5")] + pub temporary: bool, + /// Perform the ingestion as part of this transaction. If specified, results should not be committed in the event of an error/cancellation. + #[prost(bytes = "bytes", optional, tag = "6")] + pub transaction_id: ::core::option::Option<::prost::bytes::Bytes>, + /// Backend-specific options. + #[prost(map = "string, string", tag = "1000")] + pub options: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, +} +/// Nested message and enum types in `CommandStatementIngest`. +pub mod command_statement_ingest { + /// Options for table definition behavior + #[allow(clippy::derive_partial_eq_without_eq)] + #[derive(Clone, Copy, PartialEq, ::prost::Message)] + pub struct TableDefinitionOptions { + #[prost( + enumeration = "table_definition_options::TableNotExistOption", + tag = "1" + )] + pub if_not_exist: i32, + #[prost(enumeration = "table_definition_options::TableExistsOption", tag = "2")] + pub if_exists: i32, + } + /// Nested message and enum types in `TableDefinitionOptions`. + pub mod table_definition_options { + /// The action to take if the target table does not exist + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum TableNotExistOption { + /// Do not use. Servers should error if this is specified by a client. + Unspecified = 0, + /// Create the table if it does not exist + Create = 1, + /// Fail if the table does not exist + Fail = 2, + } + impl TableNotExistOption { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + TableNotExistOption::Unspecified => { + "TABLE_NOT_EXIST_OPTION_UNSPECIFIED" + } + TableNotExistOption::Create => "TABLE_NOT_EXIST_OPTION_CREATE", + TableNotExistOption::Fail => "TABLE_NOT_EXIST_OPTION_FAIL", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "TABLE_NOT_EXIST_OPTION_UNSPECIFIED" => Some(Self::Unspecified), + "TABLE_NOT_EXIST_OPTION_CREATE" => Some(Self::Create), + "TABLE_NOT_EXIST_OPTION_FAIL" => Some(Self::Fail), + _ => None, + } + } + } + /// The action to take if the target table already exists + #[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + PartialOrd, + Ord, + ::prost::Enumeration + )] + #[repr(i32)] + pub enum TableExistsOption { + /// Do not use. Servers should error if this is specified by a client. 
+ Unspecified = 0, + /// Fail if the table already exists + Fail = 1, + /// Append to the table if it already exists + Append = 2, + /// Drop and recreate the table if it already exists + Replace = 3, + } + impl TableExistsOption { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + TableExistsOption::Unspecified => "TABLE_EXISTS_OPTION_UNSPECIFIED", + TableExistsOption::Fail => "TABLE_EXISTS_OPTION_FAIL", + TableExistsOption::Append => "TABLE_EXISTS_OPTION_APPEND", + TableExistsOption::Replace => "TABLE_EXISTS_OPTION_REPLACE", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "TABLE_EXISTS_OPTION_UNSPECIFIED" => Some(Self::Unspecified), + "TABLE_EXISTS_OPTION_FAIL" => Some(Self::Fail), + "TABLE_EXISTS_OPTION_APPEND" => Some(Self::Append), + "TABLE_EXISTS_OPTION_REPLACE" => Some(Self::Replace), + _ => None, + } + } + } + } +} +/// +/// Returned from the RPC call DoPut when a CommandStatementUpdate, +/// CommandPreparedStatementUpdate, or CommandStatementIngest was +/// in the request, containing results from the update. #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct DoPutUpdateResult { @@ -972,6 +1120,19 @@ pub enum SqlInfo { /// query cancellation (the CancelQuery action). FlightSqlServerCancel = 9, /// + /// Retrieves a boolean value indicating whether the Flight SQL Server supports executing + /// bulk ingestion. + FlightSqlServerBulkIngestion = 10, + /// + /// Retrieves a boolean value indicating whether transactions are supported for bulk ingestion. If not, invoking + /// the method commit in the context of a bulk ingestion is a noop, and the isolation level is + /// `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + /// + /// Returns: + /// - false: if bulk ingestion transactions are unsupported; + /// - true: if bulk ingestion transactions are supported. + FlightSqlServerIngestTransactionsSupported = 11, + /// /// Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. /// /// If 0, there is no timeout. Servers should reset the timeout when the handle is used in a command. 
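The new `CommandStatementIngest` message above is what a Flight SQL client packs into the `FlightDescriptor` command for a DoPut-based bulk ingest. A hedged construction sketch using the generated types (the `arrow_flight::sql` module paths are assumed here, the table name is made up, and enum-typed fields are stored as `i32` per prost convention):

use arrow_flight::sql::command_statement_ingest::{
    table_definition_options::{TableExistsOption, TableNotExistOption},
    TableDefinitionOptions,
};
use arrow_flight::sql::CommandStatementIngest;
use std::collections::HashMap;

fn build_ingest_command() -> CommandStatementIngest {
    CommandStatementIngest {
        // Create the target table if missing, append if it already exists.
        table_definition_options: Some(TableDefinitionOptions {
            if_not_exist: TableNotExistOption::Create as i32,
            if_exists: TableExistsOption::Append as i32,
        }),
        table: "events".to_string(),
        // Leave schema/catalog unset to accept backend-specific defaults.
        schema: None,
        catalog: None,
        // Not a temporary table, and not tied to a transaction.
        temporary: false,
        transaction_id: None,
        // No backend-specific options.
        options: HashMap::new(),
    }
}

A server that advertises FLIGHT_SQL_SERVER_BULK_INGESTION (SqlInfo value 10 above) is expected to accept such a command on DoPut.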
@@ -1542,6 +1703,10 @@ impl SqlInfo { } SqlInfo::FlightSqlServerTransaction => "FLIGHT_SQL_SERVER_TRANSACTION", SqlInfo::FlightSqlServerCancel => "FLIGHT_SQL_SERVER_CANCEL", + SqlInfo::FlightSqlServerBulkIngestion => "FLIGHT_SQL_SERVER_BULK_INGESTION", + SqlInfo::FlightSqlServerIngestTransactionsSupported => { + "FLIGHT_SQL_SERVER_INGEST_TRANSACTIONS_SUPPORTED" + } SqlInfo::FlightSqlServerStatementTimeout => { "FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT" } @@ -1674,6 +1839,12 @@ impl SqlInfo { } "FLIGHT_SQL_SERVER_TRANSACTION" => Some(Self::FlightSqlServerTransaction), "FLIGHT_SQL_SERVER_CANCEL" => Some(Self::FlightSqlServerCancel), + "FLIGHT_SQL_SERVER_BULK_INGESTION" => { + Some(Self::FlightSqlServerBulkIngestion) + } + "FLIGHT_SQL_SERVER_INGEST_TRANSACTIONS_SUPPORTED" => { + Some(Self::FlightSqlServerIngestTransactionsSupported) + } "FLIGHT_SQL_SERVER_STATEMENT_TIMEOUT" => { Some(Self::FlightSqlServerStatementTimeout) } diff --git a/format/FlightSql.proto b/format/FlightSql.proto index 4fc68f2a5db0..8f9e1c8d829b 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -43,7 +43,6 @@ * where there is one row per requested piece of metadata information. */ message CommandGetSqlInfo { - option (experimental) = true; /* * Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide @@ -135,6 +134,23 @@ */ FLIGHT_SQL_SERVER_CANCEL = 9; + /* + * Retrieves a boolean value indicating whether the Flight SQL Server supports executing + * bulk ingestion. + */ + FLIGHT_SQL_SERVER_BULK_INGESTION = 10; + + /* + * Retrieves a boolean value indicating whether transactions are supported for bulk ingestion. If not, invoking + * the method commit in the context of a bulk ingestion is a noop, and the isolation level is + * `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + * + * Returns: + * - false: if bulk ingestion transactions are unsupported; + * - true: if bulk ingestion transactions are supported. + */ + FLIGHT_SQL_SERVER_INGEST_TRANSACTIONS_SUPPORTED = 11; + /* * Retrieves an int32 indicating the timeout (in milliseconds) for prepared statement handles. * @@ -1114,7 +1130,6 @@ * The returned data should be ordered by data_type and then by type_name. */ message CommandGetXdbcTypeInfo { - option (experimental) = true; /* * Specifies the data type to search for the info. @@ -1136,7 +1151,6 @@ * The returned data should be ordered by catalog_name. */ message CommandGetCatalogs { - option (experimental) = true; } /* @@ -1154,7 +1168,6 @@ * The returned data should be ordered by catalog_name, then db_schema_name. */ message CommandGetDbSchemas { - option (experimental) = true; /* * Specifies the Catalog to search for the tables. @@ -1202,7 +1215,6 @@ * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. */ message CommandGetTables { - option (experimental) = true; /* * Specifies the Catalog to search for the tables. @@ -1255,7 +1267,6 @@ * The returned data should be ordered by table_type. */ message CommandGetTableTypes { - option (experimental) = true; } /* @@ -1276,7 +1287,6 @@ * The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. */ message CommandGetPrimaryKeys { - option (experimental) = true; /* * Specifies the catalog to search for the table. @@ -1331,7 +1341,6 @@ * update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. 
*/ message CommandGetExportedKeys { - option (experimental) = true; /* * Specifies the catalog to search for the foreign key table. @@ -1382,7 +1391,6 @@ * - 4 = SET DEFAULT */ message CommandGetImportedKeys { - option (experimental) = true; /* * Specifies the catalog to search for the primary key table. @@ -1435,7 +1443,6 @@ * - 4 = SET DEFAULT */ message CommandGetCrossReference { - option (experimental) = true; /** * The catalog name where the parent table is. @@ -1482,7 +1489,6 @@ * Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. */ message ActionCreatePreparedStatementRequest { - option (experimental) = true; // The valid SQL string to create a prepared statement for. string query = 1; @@ -1495,7 +1501,6 @@ * An embedded message describing a Substrait plan to execute. */ message SubstraitPlan { - option (experimental) = true; // The serialized substrait.Plan to create a prepared statement for. // XXX(ARROW-16902): this is bytes instead of an embedded message @@ -1512,7 +1517,6 @@ * Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. */ message ActionCreatePreparedSubstraitPlanRequest { - option (experimental) = true; // The serialized substrait.Plan to create a prepared statement for. SubstraitPlan plan = 1; @@ -1531,7 +1535,6 @@ * The result should be wrapped in a google.protobuf.Any message. */ message ActionCreatePreparedStatementResult { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1553,7 +1556,6 @@ * Closes server resources associated with the prepared statement handle. */ message ActionClosePreparedStatementRequest { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1564,7 +1566,6 @@ * Begins a transaction. */ message ActionBeginTransactionRequest { - option (experimental) = true; } /* @@ -1575,7 +1576,6 @@ * FLIGHT_SQL_TRANSACTION_SUPPORT_SAVEPOINT. */ message ActionBeginSavepointRequest { - option (experimental) = true; // The transaction to which a savepoint belongs. bytes transaction_id = 1; @@ -1593,7 +1593,6 @@ * The result should be wrapped in a google.protobuf.Any message. */ message ActionBeginTransactionResult { - option (experimental) = true; // Opaque handle for the transaction on the server. bytes transaction_id = 1; @@ -1609,7 +1608,6 @@ * The result should be wrapped in a google.protobuf.Any message. */ message ActionBeginSavepointResult { - option (experimental) = true; // Opaque handle for the savepoint on the server. bytes savepoint_id = 1; @@ -1624,7 +1622,6 @@ * invalidated, as are all associated savepoints. */ message ActionEndTransactionRequest { - option (experimental) = true; enum EndTransaction { END_TRANSACTION_UNSPECIFIED = 0; @@ -1650,7 +1647,6 @@ * savepoints created after the current savepoint. */ message ActionEndSavepointRequest { - option (experimental) = true; enum EndSavepoint { END_SAVEPOINT_UNSPECIFIED = 0; @@ -1685,7 +1681,6 @@ * - GetFlightInfo: execute the query. */ message CommandStatementQuery { - option (experimental) = true; // The SQL syntax. string query = 1; @@ -1712,7 +1707,6 @@ * - DoPut: execute the query. */ message CommandStatementSubstraitPlan { - option (experimental) = true; // A serialized substrait.Plan SubstraitPlan plan = 1; @@ -1725,7 +1719,6 @@ * This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. 
*/ message TicketStatementQuery { - option (experimental) = true; // Unique identifier for the instance of the statement to execute. bytes statement_handle = 1; @@ -1753,7 +1746,6 @@ * - GetFlightInfo: execute the prepared statement instance. */ message CommandPreparedStatementQuery { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1764,7 +1756,6 @@ * for the RPC call DoPut to cause the server to execute the included SQL update. */ message CommandStatementUpdate { - option (experimental) = true; // The SQL syntax. string query = 1; @@ -1778,19 +1769,75 @@ * prepared statement handle as an update. */ message CommandPreparedStatementUpdate { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; } /* - * Returned from the RPC call DoPut when a CommandStatementUpdate - * CommandPreparedStatementUpdate was in the request, containing - * results from the update. + * Represents a bulk ingestion request. Used in the command member of FlightDescriptor + * for the the RPC call DoPut to cause the server load the contents of the stream's + * FlightData into the target destination. + */ +message CommandStatementIngest { + + // Options for table definition behavior + message TableDefinitionOptions { + // The action to take if the target table does not exist + enum TableNotExistOption { + // Do not use. Servers should error if this is specified by a client. + TABLE_NOT_EXIST_OPTION_UNSPECIFIED = 0; + // Create the table if it does not exist + TABLE_NOT_EXIST_OPTION_CREATE = 1; + // Fail if the table does not exist + TABLE_NOT_EXIST_OPTION_FAIL = 2; + } + // The action to take if the target table already exists + enum TableExistsOption { + // Do not use. Servers should error if this is specified by a client. + TABLE_EXISTS_OPTION_UNSPECIFIED = 0; + // Fail if the table already exists + TABLE_EXISTS_OPTION_FAIL = 1; + // Append to the table if it already exists + TABLE_EXISTS_OPTION_APPEND = 2; + // Drop and recreate the table if it already exists + TABLE_EXISTS_OPTION_REPLACE = 3; + } + + TableNotExistOption if_not_exist = 1; + TableExistsOption if_exists = 2; + } + + // The behavior for handling the table definition. + TableDefinitionOptions table_definition_options = 1; + // The table to load data into. + string table = 2; + // The db_schema of the destination table to load data into. If unset, a backend-specific default may be used. + optional string schema = 3; + // The catalog of the destination table to load data into. If unset, a backend-specific default may be used. + optional string catalog = 4; + /* + * Store ingested data in a temporary table. + * The effect of setting temporary is to place the table in a backend-defined namespace, and to drop the table at the end of the session. + * The namespacing may make use of a backend-specific schema and/or catalog. + * The server should return an error if an explicit choice of schema or catalog is incompatible with the server's namespacing decision. + */ + bool temporary = 5; + // Perform the ingestion as part of this transaction. If specified, results should not be committed in the event of an error/cancellation. + optional bytes transaction_id = 6; + + // Future extensions to the parameters of CommandStatementIngest should be added here, at a lower index than the generic 'options' parameter. + + // Backend-specific options. 
+ map options = 1000; +} + +/* + * Returned from the RPC call DoPut when a CommandStatementUpdate, + * CommandPreparedStatementUpdate, or CommandStatementIngest was + * in the request, containing results from the update. */ message DoPutUpdateResult { - option (experimental) = true; // The number of records updated. A return value of -1 represents // an unknown updated record count. @@ -1804,7 +1851,6 @@ * can continue as though the fields in this message were not provided or set to sensible default values. */ message DoPutPreparedStatementResult { - option (experimental) = true; // Represents a (potentially updated) opaque handle for the prepared statement on the server. // Because the handle could potentially be updated, any previous handles for this prepared @@ -1836,7 +1882,6 @@ */ message ActionCancelQueryRequest { option deprecated = true; - option (experimental) = true; // The result of the GetFlightInfo RPC that initiated the query. // XXX(ARROW-16902): this must be a serialized FlightInfo, but is @@ -1855,7 +1900,6 @@ */ message ActionCancelQueryResult { option deprecated = true; - option (experimental) = true; enum CancelResult { // The cancellation status is unknown. Servers should avoid using From 241ee0288f00cce04e7a9019d6846d1ad08cbd2a Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 1 Aug 2024 08:00:58 -0700 Subject: [PATCH 17/17] add histograms to to_thrift() --- parquet/src/file/page_index/index.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index b990444c7928..0c23e4aa38b8 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -247,14 +247,29 @@ impl NativeIndex { .map(|x| x.null_count()) .collect::>>(); + // Concatenate page histograms into a single Option + let repetition_level_histograms = self + .indexes + .iter() + .map(|x| x.repetition_level_histogram().map(|v| v.values())) + .collect::>>() + .map(|hists| hists.concat()); + + let definition_level_histograms = self + .indexes + .iter() + .map(|x| x.definition_level_histogram().map(|v| v.values())) + .collect::>>() + .map(|hists| hists.concat()); + ColumnIndex::new( self.indexes.iter().map(|x| x.min().is_none()).collect(), min_values, max_values, self.boundary_order, null_counts, - None, - None, + repetition_level_histograms, + definition_level_histograms, ) } }
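The final hunk above flattens the per-page repetition and definition level histograms into the single vectors that the thrift `ColumnIndex` expects, yielding `None` when any page lacks a histogram. The core idiom is collecting an iterator of `Option`s into an `Option<Vec<_>>` and then concatenating; a small std-only illustration with made-up page data:

fn flatten_histograms(pages: &[Option<Vec<i64>>]) -> Option<Vec<i64>> {
    pages
        .iter()
        // Borrow each page's histogram as a slice, keeping the Option.
        .map(|page| page.as_deref())
        // Any None page makes the whole collection None.
        .collect::<Option<Vec<&[i64]>>>()
        // Otherwise concatenate the per-page histograms into one vector.
        .map(|hists| hists.concat())
}

fn main() {
    let complete = [Some(vec![3, 1]), Some(vec![2, 2])];
    assert_eq!(flatten_histograms(&complete), Some(vec![3, 1, 2, 2]));

    let with_gap = [Some(vec![3, 1]), None];
    assert_eq!(flatten_histograms(&with_gap), None);
}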