diff --git a/benches/avro_read.rs b/benches/avro_read.rs
index 029d3ab7d6b..9143db9e7e0 100644
--- a/benches/avro_read.rs
+++ b/benches/avro_read.rs
@@ -24,12 +24,11 @@ fn schema() -> AvroSchema {
 fn write(size: usize, has_codec: bool) -> Result<Vec<u8>> {
     let avro = schema();
     // a writer needs a schema and something to write to
-    let mut writer: Writer<Vec<u8>>;
-    if has_codec {
-        writer = Writer::with_codec(&avro, Vec::new(), Codec::Deflate);
+    let mut writer = if has_codec {
+        Writer::with_codec(&avro, Vec::new(), Codec::Deflate)
     } else {
-        writer = Writer::new(&avro, Vec::new());
-    }
+        Writer::new(&avro, Vec::new())
+    };
 
     (0..size).for_each(|_| {
         let mut record = Record::new(writer.schema()).unwrap();
diff --git a/benches/write_parquet.rs b/benches/write_parquet.rs
index 3709aedba65..32b264bfe53 100644
--- a/benches/write_parquet.rs
+++ b/benches/write_parquet.rs
@@ -1,4 +1,3 @@
-use std::io::Cursor;
 use std::sync::Arc;
 
 use criterion::{criterion_group, criterion_main, Criterion};
@@ -29,7 +28,7 @@ fn write(array: &dyn Array, encoding: Encoding) -> Result<()> {
         vec![encoding],
     )?;
 
-    let mut writer = vec![];
+    let writer = vec![];
 
     let mut writer = FileWriter::try_new(writer, schema, options)?;
diff --git a/examples/ffi.rs b/examples/ffi.rs
index 842161eb96e..d2f3c68fc28 100644
--- a/examples/ffi.rs
+++ b/examples/ffi.rs
@@ -9,6 +9,7 @@ unsafe fn export(
     array_ptr: *mut ffi::ArrowArray,
     schema_ptr: *mut ffi::ArrowSchema,
 ) {
+    // exporting an array requires an associated field so that the consumer knows its datatype
     let field = Field::new("a", array.data_type().clone(), true);
     ffi::export_array_to_c(array, array_ptr);
     ffi::export_field_to_c(&field, schema_ptr);
@@ -25,23 +26,13 @@ fn main() -> Result<()> {
     // the goal is to export this array and import it back via FFI.
     // to import, we initialize the structs that will receive the data
-    let array_ptr = Box::new(ffi::ArrowArray::empty());
-    let schema_ptr = Box::new(ffi::ArrowSchema::empty());
-
-    // since FFIs work in raw pointers, let's temporarily relinquish ownership so that producers
-    // can write into it in a thread-safe manner
-    let array_ptr = Box::into_raw(array_ptr);
-    let schema_ptr = Box::into_raw(schema_ptr);
+    let mut array_ptr = Box::new(ffi::ArrowArray::empty());
+    let mut schema_ptr = Box::new(ffi::ArrowSchema::empty());
 
     // this is where a producer (in this case also us ^_^) writes to the pointers' location.
     // `array` here could be anything or not even be available, if this was e.g. from Python.
-    // Safety: we just allocated the pointers correctly.
-    unsafe { export(array.clone(), array_ptr, schema_ptr) };
-
-    // we can now take ownership back, since we are responsible for deallocating this memory.
-    // Safety: we just into_raw them.
-    let array_ptr = unsafe { Box::from_raw(array_ptr) };
-    let schema_ptr = unsafe { Box::from_raw(schema_ptr) };
+    // Safety: we just allocated the pointers
+    unsafe { export(array.clone(), &mut *array_ptr, &mut *schema_ptr) };
 
     // and finally interpret the written memory into a new array.
     // Safety: we used `export`, which is a valid exporter to the C data interface
diff --git a/examples/parquet_read_async.rs b/examples/parquet_read_async.rs
index e9ac530bb89..65a802b0008 100644
--- a/examples/parquet_read_async.rs
+++ b/examples/parquet_read_async.rs
@@ -2,8 +2,6 @@ use std::sync::Arc;
 use std::time::SystemTime;
 
 use futures::future::BoxFuture;
-use futures::FutureExt;
-use tokio;
 use tokio::fs::File;
 use tokio::io::BufReader;
 use tokio_util::compat::*;
diff --git a/src/bitmap/bitmap_ops.rs b/src/bitmap/bitmap_ops.rs
index bcc4e433ee8..282a7bca3e4 100644
--- a/src/bitmap/bitmap_ops.rs
+++ b/src/bitmap/bitmap_ops.rs
@@ -7,6 +7,7 @@ use super::{
     Bitmap,
 };
 
+/// Creates a [`Vec`] from an [`Iterator`] of [`BitChunk`].
 /// # Safety
 /// The iterator must be [`TrustedLen`].
 pub unsafe fn from_chunk_iter_unchecked<T: BitChunk, I: Iterator<Item = T>>(
@@ -35,7 +36,7 @@ pub unsafe fn from_chunk_iter_unchecked<T: BitChunk, I: Iterator<Item = T>>(
     buffer
 }
 
-/// Creates a Vec from a [`TrustedLen`] of [`BitChunk`],
+/// Creates a [`Vec`] from a [`TrustedLen`] of [`BitChunk`].
 pub fn chunk_iter_to_vec<T: BitChunk, I: TrustedLen<Item = T>>(iter: I) -> Vec<u8> {
     unsafe { from_chunk_iter_unchecked(iter) }
 }
diff --git a/src/bitmap/mutable.rs b/src/bitmap/mutable.rs
index 1868d9b1854..5d0bbbb3849 100644
--- a/src/bitmap/mutable.rs
+++ b/src/bitmap/mutable.rs
@@ -58,7 +58,7 @@ impl MutableBitmap {
         }
     }
 
-    /// Initializes an a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
+    /// Initializes a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
     #[inline]
     pub fn with_capacity(capacity: usize) -> Self {
         Self {
@@ -67,7 +67,7 @@ impl MutableBitmap {
         }
     }
 
-    /// Initializes an a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
+    /// Reserves `additional` bits in the [`MutableBitmap`], potentially re-allocating its buffer.
     #[inline(always)]
     pub fn reserve(&mut self, additional: usize) {
         self.buffer
diff --git a/src/compute/like.rs b/src/compute/like.rs
index 334c1fe74ca..3c262585672 100644
--- a/src/compute/like.rs
+++ b/src/compute/like.rs
@@ -17,6 +17,10 @@ fn is_like_pattern(c: char) -> bool {
     c == '%' || c == '_'
 }
 
+fn replace_pattern(pattern: &str) -> String {
+    pattern.replace('%', ".*").replace('_', ".")
+}
+
 #[inline]
 fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
     lhs: &Utf8Array<O>,
@@ -40,7 +44,7 @@ fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
         let pattern = if let Some(pattern) = map.get(pattern) {
             pattern
         } else {
-            let re_pattern = pattern.replace("%", ".*").replace("_", ".");
+            let re_pattern = replace_pattern(pattern);
             let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
                 ArrowError::InvalidArgumentError(format!(
                     "Unable to build regex from LIKE pattern: {}",
@@ -113,7 +117,7 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
     } else {
-        let re_pattern = rhs.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(rhs);
        let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -187,10 +191,8 @@ fn a_like_binary<O: Offset, F: Fn(bool) -> bool>(
         let pattern = if let Some(pattern) = map.get(pattern) {
             pattern
         } else {
-            let re_pattern = simdutf8::basic::from_utf8(pattern)
-                .unwrap()
-                .replace("%", ".*")
-                .replace("_", ".");
+            let re_pattern = simdutf8::basic::from_utf8(pattern).unwrap();
+            let re_pattern = replace_pattern(re_pattern);
             let re = BytesRegex::new(&format!("^{}$", re_pattern)).map_err(|e| {
                 ArrowError::InvalidArgumentError(format!(
                     "Unable to build regex from LIKE pattern: {}",
@@ -270,7 +272,7 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
     } else {
-        let re_pattern = pattern.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(pattern);
         let re = BytesRegex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
diff --git a/src/compute/sort/mod.rs b/src/compute/sort/mod.rs
index 8463e2f61a9..767cde4cf32 100644
--- a/src/compute/sort/mod.rs
+++ b/src/compute/sort/mod.rs
@@ -355,7 +355,7 @@ where
         values.chain(null_indices.into_iter()).collect::<Vec<I>>()
     };
 
-    values.truncate(limit.unwrap_or_else(|| values.len()));
+    values.truncate(limit.unwrap_or(values.len()));
 
     let data_type = I::PRIMITIVE.into();
     PrimitiveArray::<I>::from_data(data_type, values.into(), None)
diff --git a/src/io/ipc/read/common.rs b/src/io/ipc/read/common.rs
index 94090b7a1a1..97f1f4e03fd 100644
--- a/src/io/ipc/read/common.rs
+++ b/src/io/ipc/read/common.rs
@@ -121,8 +121,7 @@ pub fn read_record_batch(
                 Ok(None)
             }
         })
-        .map(|x| x.transpose())
-        .flatten()
+        .filter_map(|x| x.transpose())
         .collect::<Result<Vec<_>>>()?
     } else {
         fields
diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs
index 81f48335d01..b8fb01038c9 100644
--- a/src/io/json/read/infer_schema.rs
+++ b/src/io/json/read/infer_schema.rs
@@ -125,8 +125,7 @@ pub fn infer_rows(rows: &[Value]) -> Result<DataType> {
     // discard None values and deduplicate entries
     let types = types
         .into_iter()
-        .map(|x| x.transpose())
-        .flatten()
+        .filter_map(|x| x.transpose())
         .collect::<Result<HashSet<_>>>()?;
 
     Ok(if !types.is_empty() {
diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs
index d8210c89dfc..b73223b7488 100644
--- a/src/io/json_integration/read/array.rs
+++ b/src/io/json_integration/read/array.rs
@@ -170,8 +170,7 @@ fn to_binary<O: Offset>(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc<dyn Array> {
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
-        .flatten()
+        .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
         .collect();
     Arc::new(BinaryArray::from_data(data_type, offsets, values, validity))
 }
@@ -184,8 +183,7 @@ fn to_utf8<O: Offset>(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc<dyn Array> {
diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs
--- a/src/io/parquet/read/schema/convert.rs
+++ b/src/io/parquet/read/schema/convert.rs
@@ … @@
 pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
-    fields.iter().map(to_field).flatten().collect::<Vec<_>>()
+    fields.iter().filter_map(to_field).collect::<Vec<_>>()
 }
 
 fn from_int32(
@@ -224,11 +224,7 @@ fn non_repeated_group(
 /// Converts a parquet group type to an arrow [`DataType::Struct`].
 /// Returns [`None`] if all its fields are empty
 fn to_struct(fields: &[ParquetType]) -> Option<DataType> {
-    let fields = fields
-        .iter()
-        .map(to_field)
-        .flatten()
-        .collect::<Vec<_>>();
+    let fields = fields.iter().filter_map(to_field).collect::<Vec<_>>();
     if fields.is_empty() {
         None
     } else {
diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs
index dcee10a2126..d42a2ad16e6 100644
--- a/src/io/parquet/read/statistics/mod.rs
+++ b/src/io/parquet/read/statistics/mod.rs
@@ -101,7 +101,7 @@ fn get_fields(field: &Field) -> Vec<&Field> {
     match field.data_type.to_logical_type() {
         DataType::List(inner) => get_fields(inner),
         DataType::LargeList(inner) => get_fields(inner),
-        DataType::Struct(fields) => fields.iter().map(get_fields).flatten().collect(),
+        DataType::Struct(fields) => fields.iter().flat_map(get_fields).collect(),
         _ => vec![field],
     }
 }
diff --git a/src/io/parquet/write/dictionary.rs b/src/io/parquet/write/dictionary.rs
index a18b654e036..521c863aac7 100644
--- a/src/io/parquet/write/dictionary.rs
+++ b/src/io/parquet/write/dictionary.rs
@@ -60,19 +60,15 @@ fn encode_keys(
     // encode indices
     // compute the required number of bits
     if let Some(validity) = validity {
-        let keys = array
-            .iter()
-            .flatten()
-            .map(|x| {
-                let index = x.to_usize().unwrap();
-                // discard indices whose values are null, since they are part of the def levels.
-                if validity.get_bit(index) {
-                    Some(index as u32)
-                } else {
-                    None
-                }
-            })
-            .flatten();
+        let keys = array.iter().flatten().filter_map(|x| {
+            let index = x.to_usize().unwrap();
+            // discard indices whose values are null, since they are part of the def levels.
+            if validity.get_bit(index) {
+                Some(index as u32)
+            } else {
+                None
+            }
+        });
         let num_bits = utils::get_bit_width(keys.clone().max().unwrap_or(0) as u64) as u8;
 
         let keys = utils::ExactSizedIter::new(keys, array.len() - null_count);
diff --git a/src/types/bit_chunk.rs b/src/types/bit_chunk.rs
index eba57e5e484..ee78ac7c436 100644
--- a/src/types/bit_chunk.rs
+++ b/src/types/bit_chunk.rs
@@ -31,93 +31,36 @@ pub trait BitChunk:
     fn from_ne_bytes(v: Self::Bytes) -> Self;
 }
 
-impl BitChunk for u8 {
-    #[inline(always)]
-    fn zero() -> Self {
-        0
-    }
-
-    #[inline(always)]
-    fn to_ne_bytes(self) -> Self::Bytes {
-        self.to_ne_bytes()
-    }
-
-    #[inline(always)]
-    fn from_ne_bytes(v: Self::Bytes) -> Self {
-        Self::from_ne_bytes(v)
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        1
-    }
-}
-
-impl BitChunk for u16 {
-    #[inline(always)]
-    fn zero() -> Self {
-        0
-    }
-
-    #[inline(always)]
-    fn to_ne_bytes(self) -> Self::Bytes {
-        self.to_ne_bytes()
-    }
-
-    #[inline(always)]
-    fn from_ne_bytes(v: Self::Bytes) -> Self {
-        Self::from_ne_bytes(v)
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        1
-    }
-}
-
-impl BitChunk for u32 {
-    #[inline(always)]
-    fn zero() -> Self {
-        0
-    }
-
-    #[inline(always)]
-    fn from_ne_bytes(v: Self::Bytes) -> Self {
-        Self::from_ne_bytes(v)
-    }
-
-    #[inline(always)]
-    fn to_ne_bytes(self) -> Self::Bytes {
-        self.to_ne_bytes()
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        1
-    }
+macro_rules! bit_chunk {
+    ($ty:ty) => {
+        impl BitChunk for $ty {
+            #[inline(always)]
+            fn zero() -> Self {
+                0
+            }
+
+            #[inline(always)]
+            fn to_ne_bytes(self) -> Self::Bytes {
+                self.to_ne_bytes()
+            }
+
+            #[inline(always)]
+            fn from_ne_bytes(v: Self::Bytes) -> Self {
+                Self::from_ne_bytes(v)
+            }
+
+            #[inline(always)]
+            fn one() -> Self {
+                1
+            }
+        }
+    };
 }
 
-impl BitChunk for u64 {
-    #[inline(always)]
-    fn zero() -> Self {
-        0
-    }
-
-    #[inline(always)]
-    fn to_ne_bytes(self) -> Self::Bytes {
-        self.to_ne_bytes()
-    }
-
-    #[inline(always)]
-    fn from_ne_bytes(v: Self::Bytes) -> Self {
-        Self::from_ne_bytes(v)
-    }
-
-    #[inline(always)]
-    fn one() -> Self {
-        1
-    }
-}
+bit_chunk!(u8);
+bit_chunk!(u16);
+bit_chunk!(u32);
+bit_chunk!(u64);
 
 /// An [`Iterator`] over a [`BitChunk`]. This iterator is often
 /// compiled to SIMD.
diff --git a/tests/it/compute/sort/mod.rs b/tests/it/compute/sort/mod.rs
index a29d99e2986..7618c041352 100644
--- a/tests/it/compute/sort/mod.rs
+++ b/tests/it/compute/sort/mod.rs
@@ -27,15 +27,15 @@ fn primitive_arrays(
 }
 
 fn to_indices_string_arrays(data: &[Option<&str>], options: SortOptions, expected_data: &[i32]) {
-    let input = Utf8Array::<i32>::from(&data.to_vec());
+    let input = Utf8Array::<i32>::from(data);
     let expected = Int32Array::from_slice(expected_data);
     let output = sort_to_indices(&input, &options, None).unwrap();
     assert_eq!(output, expected)
 }
 
 fn string_arrays(data: &[Option<&str>], options: SortOptions, expected_data: &[Option<&str>]) {
-    let input = Utf8Array::<i32>::from(&data.to_vec());
-    let expected = Utf8Array::<i32>::from(&expected_data.to_vec());
+    let input = Utf8Array::<i32>::from(data);
+    let expected = Utf8Array::<i32>::from(expected_data);
     let output = sort(&input, &options, None).unwrap();
     assert_eq!(expected, output.as_ref())
 }