This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved documentation #860

Merged: 1 commit, merged on Feb 23, 2022
9 changes: 4 additions & 5 deletions benches/avro_read.rs
@@ -24,12 +24,11 @@ fn schema() -> AvroSchema {
 fn write(size: usize, has_codec: bool) -> Result<Vec<u8>> {
     let avro = schema();
     // a writer needs a schema and something to write to
-    let mut writer: Writer<Vec<u8>>;
-    if has_codec {
-        writer = Writer::with_codec(&avro, Vec::new(), Codec::Deflate);
+    let mut writer = if has_codec {
+        Writer::with_codec(&avro, Vec::new(), Codec::Deflate)
     } else {
-        writer = Writer::new(&avro, Vec::new());
-    }
+        Writer::new(&avro, Vec::new())
+    };
 
     (0..size).for_each(|_| {
         let mut record = Record::new(writer.schema()).unwrap();
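
The refactor replaces a forward-declared binding assigned in each branch with a single `if` expression, which guarantees initialization at the type level. A minimal standalone sketch of the pattern, with hypothetical names (not from this PR):

    fn make_greeting(compressed: bool) -> String {
        // the whole `if` is an expression; both branches must yield the same type,
        // so the binding is provably initialized on every path
        let greeting = if compressed {
            String::from("hi")
        } else {
            String::from("hello, world")
        };
        greeting
    }

    fn main() {
        println!("{}", make_greeting(true));
    }
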
3 changes: 1 addition & 2 deletions benches/write_parquet.rs
@@ -1,4 +1,3 @@
-use std::io::Cursor;
 use std::sync::Arc;
 
 use criterion::{criterion_group, criterion_main, Criterion};
@@ -29,7 +28,7 @@ fn write(array: &dyn Array, encoding: Encoding) -> Result<()> {
         vec![encoding],
     )?;
 
-    let mut writer = vec![];
+    let writer = vec![];
 
     let mut writer = FileWriter::try_new(writer, schema, options)?;
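
The first `writer` binding needed no `mut`: the `Vec` is only moved into `FileWriter::try_new`, which shadows the name. A small sketch of the move-then-shadow idiom, with hypothetical names standing in for the parquet types:

    // hypothetical helper standing in for `FileWriter::try_new`
    fn wrap(bytes: Vec<u8>) -> String {
        String::from_utf8(bytes).unwrap()
    }

    fn main() {
        let buffer = Vec::new();       // no `mut`: never mutated through this name
        let mut buffer = wrap(buffer); // shadowed; mutation happens via the wrapper
        buffer.push_str("hello");
        println!("{}", buffer);
    }
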
19 changes: 5 additions & 14 deletions examples/ffi.rs
@@ -9,6 +9,7 @@ unsafe fn export(
     array_ptr: *mut ffi::ArrowArray,
     schema_ptr: *mut ffi::ArrowSchema,
 ) {
+    // exporting an array requires an associated field so that the consumer knows its datatype
     let field = Field::new("a", array.data_type().clone(), true);
     ffi::export_array_to_c(array, array_ptr);
     ffi::export_field_to_c(&field, schema_ptr);
@@ -25,23 +26,13 @@ fn main() -> Result<()> {
 
     // the goal is to export this array and import it back via FFI.
     // to import, we initialize the structs that will receive the data
-    let array_ptr = Box::new(ffi::ArrowArray::empty());
-    let schema_ptr = Box::new(ffi::ArrowSchema::empty());
-
-    // since FFIs work in raw pointers, let's temporarily relinquish ownership so that producers
-    // can write into it in a thread-safe manner
-    let array_ptr = Box::into_raw(array_ptr);
-    let schema_ptr = Box::into_raw(schema_ptr);
+    let mut array_ptr = Box::new(ffi::ArrowArray::empty());
+    let mut schema_ptr = Box::new(ffi::ArrowSchema::empty());
 
     // this is where a producer (in this case also us ^_^) writes to the pointers' location.
     // `array` here could be anything or not even be available, if this was e.g. from Python.
-    // Safety: we just allocated the pointers correctly.
-    unsafe { export(array.clone(), array_ptr, schema_ptr) };
-
-    // we can now take ownership back, since we are responsible for deallocating this memory.
-    // Safety: we just into_raw them.
-    let array_ptr = unsafe { Box::from_raw(array_ptr) };
-    let schema_ptr = unsafe { Box::from_raw(schema_ptr) };
+    // Safety: we just allocated the pointers
+    unsafe { export(array.clone(), &mut *array_ptr, &mut *schema_ptr) };
 
     // and finally interpret the written memory into a new array.
     // Safety: we used `export`, which is a valid exporter to the C data interface
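
The change drops the `Box::into_raw`/`Box::from_raw` round-trip: a `&mut T` borrowed from the `Box` coerces to `*mut T` at the call site, so the `Box` keeps ownership (and its destructor) throughout. A minimal sketch of that coercion, independent of arrow2:

    // a mutable reference coerces to a raw pointer at an FFI-style boundary,
    // so no ownership transfer via Box::into_raw is needed
    unsafe fn write_through(ptr: *mut u64) {
        // Safety: caller guarantees `ptr` is valid and exclusive
        *ptr = 42;
    }

    fn main() {
        let mut value = Box::new(0u64);
        // `&mut *value` reborrows the boxed value and coerces to `*mut u64`
        unsafe { write_through(&mut *value) };
        assert_eq!(*value, 42); // the Box still owns and frees the allocation
    }
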
2 changes: 0 additions & 2 deletions examples/parquet_read_async.rs
@@ -2,8 +2,6 @@ use std::sync::Arc;
 use std::time::SystemTime;
 
 use futures::future::BoxFuture;
-use futures::FutureExt;
-use tokio;
 use tokio::fs::File;
 use tokio::io::BufReader;
 use tokio_util::compat::*;
3 changes: 2 additions & 1 deletion src/bitmap/bitmap_ops.rs
@@ -7,6 +7,7 @@ use super::{
     Bitmap,
 };
 
+/// Creates a [Vec<u8>] from an [`Iterator`] of [`BitChunk`].
 /// # Safety
 /// The iterator must be [`TrustedLen`].
 pub unsafe fn from_chunk_iter_unchecked<T: BitChunk, I: Iterator<Item = T>>(
@@ -35,7 +36,7 @@ pub unsafe fn from_chunk_iter_unchecked<T: BitChunk, I: Iterator<Item = T>>(
     buffer
 }
 
-/// Creates a Vec<u8> from a [`TrustedLen`] of [`BitChunk`],
+/// Creates a [`Vec<u8>`] from a [`TrustedLen`] of [`BitChunk`].
 pub fn chunk_iter_to_vec<T: BitChunk, I: TrustedLen<Item = T>>(iter: I) -> Vec<u8> {
     unsafe { from_chunk_iter_unchecked(iter) }
 }
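
As an aside on the doc-comment syntax above: Rust doc comments are Markdown, square brackets create intra-doc links, and backticks inside the brackets render the linked path as code. A minimal sketch with a hypothetical function (not from this crate):

    /// Packs booleans into a [`Vec<u8>`], one bit per flag (least significant first).
    pub fn flags_to_bytes(flags: &[bool]) -> Vec<u8> {
        flags
            .chunks(8)
            .map(|chunk| {
                chunk
                    .iter()
                    .enumerate()
                    .fold(0u8, |byte, (i, &bit)| byte | ((bit as u8) << i))
            })
            .collect()
    }

    fn main() {
        assert_eq!(flags_to_bytes(&[true, false, true]), vec![0b101]);
    }
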
4 changes: 2 additions & 2 deletions src/bitmap/mutable.rs
@@ -58,7 +58,7 @@ impl MutableBitmap {
         }
     }
 
-    /// Initializes an a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
+    /// Initializes a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
     #[inline]
     pub fn with_capacity(capacity: usize) -> Self {
         Self {
@@ -67,7 +67,7 @@ impl MutableBitmap {
         }
     }
 
-    /// Initializes an a pre-allocated [`MutableBitmap`] with capacity for `capacity` bits.
+    /// Reserves `additional` bits in the [`MutableBitmap`], potentially re-allocating its buffer.
     #[inline(always)]
     pub fn reserve(&mut self, additional: usize) {
         self.buffer
16 changes: 9 additions & 7 deletions src/compute/like.rs
@@ -17,6 +17,10 @@ fn is_like_pattern(c: char) -> bool {
     c == '%' || c == '_'
 }
 
+fn replace_pattern(pattern: &str) -> String {
+    pattern.replace('%', ".*").replace('_', ".")
+}
+
 #[inline]
 fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
     lhs: &Utf8Array<O>,
@@ -40,7 +44,7 @@ fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
     let pattern = if let Some(pattern) = map.get(pattern) {
         pattern
     } else {
-        let re_pattern = pattern.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(pattern);
         let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -113,7 +117,7 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
     } else {
-        let re_pattern = rhs.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(rhs);
         let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -187,10 +191,8 @@ fn a_like_binary<O: Offset, F: Fn(bool) -> bool>(
     let pattern = if let Some(pattern) = map.get(pattern) {
         pattern
     } else {
-        let re_pattern = simdutf8::basic::from_utf8(pattern)
-            .unwrap()
-            .replace("%", ".*")
-            .replace("_", ".");
+        let re_pattern = simdutf8::basic::from_utf8(pattern).unwrap();
+        let re_pattern = replace_pattern(re_pattern);
         let re = BytesRegex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
@@ -270,7 +272,7 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(
         let ends_with = &rhs[1..];
         Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.ends_with(ends_with))))
     } else {
-        let re_pattern = pattern.replace("%", ".*").replace("_", ".");
+        let re_pattern = replace_pattern(pattern);
         let re = BytesRegex::new(&format!("^{}$", re_pattern)).map_err(|e| {
             ArrowError::InvalidArgumentError(format!(
                 "Unable to build regex from LIKE pattern: {}",
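
The new `replace_pattern` helper centralizes the SQL LIKE to regex translation that was previously duplicated across four call sites: `%` (any run of characters) becomes `.*` and `_` (any single character) becomes `.`. A minimal sketch of the same translation in isolation, assuming the `regex` crate is available (note it does not escape regex metacharacters in the input, matching the shape of the code above):

    use regex::Regex;

    /// Translates a SQL LIKE pattern into a regex pattern: `%` -> `.*`, `_` -> `.`.
    fn replace_pattern(pattern: &str) -> String {
        pattern.replace('%', ".*").replace('_', ".")
    }

    fn main() {
        // anchor with ^...$ so the whole string must match, as the kernel does
        let re = Regex::new(&format!("^{}$", replace_pattern("ba_a%"))).unwrap();
        assert!(re.is_match("banana"));
        assert!(!re.is_match("abanana"));
    }
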
2 changes: 1 addition & 1 deletion src/compute/sort/mod.rs
@@ -355,7 +355,7 @@ where
         values.chain(null_indices.into_iter()).collect::<Vec<I>>()
     };
 
-    values.truncate(limit.unwrap_or_else(|| values.len()));
+    values.truncate(limit.unwrap_or(values.len()));
 
     let data_type = I::PRIMITIVE.into();
     PrimitiveArray::<I>::from_data(data_type, values.into(), None)
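
`unwrap_or` takes a plain default value, while `unwrap_or_else` takes a closure; clippy's `unnecessary_lazy_evaluations` lint prefers the eager form when the default is cheap to compute, as a `Vec::len` call is. A small sketch of the difference, with hypothetical data:

    fn main() {
        let limit: Option<usize> = None;
        let mut values = vec![3, 1, 2, 5, 4];

        // cheap default: eager `unwrap_or` is clearer
        let n = limit.unwrap_or(values.len());

        // expensive default: `unwrap_or_else` skips the work when `limit` is Some
        let _m = limit.unwrap_or_else(|| values.iter().copied().max().unwrap_or(0));

        values.truncate(n);
        assert_eq!(values.len(), 5);
    }
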
3 changes: 1 addition & 2 deletions src/io/ipc/read/common.rs
@@ -121,8 +121,7 @@ pub fn read_record_batch<R: Read + Seek>(
                 Ok(None)
             }
         })
-        .map(|x| x.transpose())
-        .flatten()
+        .filter_map(|x| x.transpose())
        .collect::<Result<Vec<_>>>()?
     } else {
         fields
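
`filter_map(|x| x.transpose())` fuses the `map` + `flatten` pair flagged by clippy's `map_flatten` lint: each `Result<Option<T>>` is transposed into `Option<Result<T>>`, the `None`s are dropped, and collecting into `Result<Vec<_>>` stops at the first error. A standalone sketch of the pattern, with hypothetical inputs:

    fn main() {
        // each item: Ok(Some(value)) = keep, Ok(None) = skip, Err = fail
        let items: Vec<Result<Option<i32>, String>> =
            vec![Ok(Some(1)), Ok(None), Ok(Some(2))];

        let kept = items
            .into_iter()
            // Result<Option<T>, E> -> Option<Result<T, E>>; None disappears here
            .filter_map(|x| x.transpose())
            // Option<Result> items collect into a single Result<Vec<_>, _>
            .collect::<Result<Vec<_>, String>>();

        assert_eq!(kept, Ok(vec![1, 2]));
    }
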
3 changes: 1 addition & 2 deletions src/io/json/read/infer_schema.rs
@@ -125,8 +125,7 @@ pub fn infer_rows(rows: &[Value]) -> Result<DataType> {
     // discard None values and deduplicate entries
     let types = types
         .into_iter()
-        .map(|x| x.transpose())
-        .flatten()
+        .filter_map(|x| x.transpose())
         .collect::<Result<HashSet<_>>>()?;
 
     Ok(if !types.is_empty() {
9 changes: 3 additions & 6 deletions src/io/json_integration/read/array.rs
@@ -170,8 +170,7 @@ fn to_binary<O: Offset>(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc<
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
-        .flatten()
+        .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
         .collect();
     Arc::new(BinaryArray::from_data(data_type, offsets, values, validity))
 }
@@ -184,8 +183,7 @@ fn to_utf8<O: Offset>(json_col: &ArrowJsonColumn, data_type: DataType) -> Arc<dy
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().unwrap().as_bytes().to_vec())
-        .flatten()
+        .flat_map(|value| value.as_str().unwrap().as_bytes().to_vec())
         .collect();
     Arc::new(Utf8Array::from_data(data_type, offsets, values, validity))
 }
@@ -309,8 +307,7 @@ pub fn to_array(
         .as_ref()
         .unwrap()
         .iter()
-        .map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
-        .flatten()
+        .flat_map(|value| value.as_str().map(|x| hex::decode(x).unwrap()).unwrap())
         .collect();
     Ok(Arc::new(FixedSizeBinaryArray::from_data(
         data_type, values, validity,
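
`flat_map` replaces the `map` + `flatten` pair here as well: each JSON string is decoded to a `Vec<u8>`, and the byte vectors are concatenated into one flat values buffer. A standalone sketch of the idea, using plain byte conversion instead of `hex::decode` to stay dependency-free:

    fn main() {
        let column = vec!["ab", "cd", "ef"];

        // flat_map yields each string's bytes and concatenates them,
        // producing the flat values buffer backing a variable-length array
        let values: Vec<u8> = column
            .iter()
            .flat_map(|s| s.as_bytes().to_vec())
            .collect();

        assert_eq!(values, b"abcdef".to_vec());
    }
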
8 changes: 2 additions & 6 deletions src/io/parquet/read/schema/convert.rs
@@ -12,7 +12,7 @@ use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
 /// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain
 /// any physical column.
 pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
-    fields.iter().map(to_field).flatten().collect::<Vec<_>>()
+    fields.iter().filter_map(to_field).collect::<Vec<_>>()
 }
 
 fn from_int32(
@@ -224,11 +224,7 @@ fn non_repeated_group(
 /// Converts a parquet group type to an arrow [`DataType::Struct`].
 /// Returns [`None`] if all its fields are empty
 fn to_struct(fields: &[ParquetType]) -> Option<DataType> {
-    let fields = fields
-        .iter()
-        .map(to_field)
-        .flatten()
-        .collect::<Vec<Field>>();
+    let fields = fields.iter().filter_map(to_field).collect::<Vec<Field>>();
     if fields.is_empty() {
         None
     } else {
2 changes: 1 addition & 1 deletion src/io/parquet/read/statistics/mod.rs
@@ -101,7 +101,7 @@ fn get_fields(field: &Field) -> Vec<&Field> {
     match field.data_type.to_logical_type() {
         DataType::List(inner) => get_fields(inner),
         DataType::LargeList(inner) => get_fields(inner),
-        DataType::Struct(fields) => fields.iter().map(get_fields).flatten().collect(),
+        DataType::Struct(fields) => fields.iter().flat_map(get_fields).collect(),
         _ => vec![field],
     }
 }
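
`get_fields` recursively flattens a nested schema into its leaf fields; with `flat_map`, each child's `Vec` of leaves is spliced directly into the parent's result. A self-contained sketch over a toy tree type (hypothetical, standing in for `Field`/`DataType`):

    // toy stand-in for a nested schema: a node is either a leaf or a struct of children
    enum Node {
        Leaf(&'static str),
        Struct(Vec<Node>),
    }

    // collects all leaf names, depth-first, like `get_fields`
    fn leaves(node: &Node) -> Vec<&'static str> {
        match node {
            Node::Leaf(name) => vec![*name],
            // flat_map splices each child's leaf list into one Vec
            Node::Struct(children) => children.iter().flat_map(leaves).collect(),
        }
    }

    fn main() {
        let schema = Node::Struct(vec![
            Node::Leaf("a"),
            Node::Struct(vec![Node::Leaf("b"), Node::Leaf("c")]),
        ]);
        assert_eq!(leaves(&schema), vec!["a", "b", "c"]);
    }
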
22 changes: 9 additions & 13 deletions src/io/parquet/write/dictionary.rs
@@ -60,19 +60,15 @@ fn encode_keys<K: DictionaryKey>(
     // encode indices
     // compute the required number of bits
     if let Some(validity) = validity {
-        let keys = array
-            .iter()
-            .flatten()
-            .map(|x| {
-                let index = x.to_usize().unwrap();
-                // discard indices whose values are null, since they are part of the def levels.
-                if validity.get_bit(index) {
-                    Some(index as u32)
-                } else {
-                    None
-                }
-            })
-            .flatten();
+        let keys = array.iter().flatten().filter_map(|x| {
+            let index = x.to_usize().unwrap();
+            // discard indices whose values are null, since they are part of the def levels.
+            if validity.get_bit(index) {
+                Some(index as u32)
+            } else {
+                None
+            }
+        });
         let num_bits = utils::get_bit_width(keys.clone().max().unwrap_or(0) as u64) as u8;
 
         let keys = utils::ExactSizedIter::new(keys, array.len() - null_count);
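
Note that `keys` stays lazy: cloning the iterator is cheap (it only copies the adapter state), which allows one pass to find the maximum for the bit width and a second pass to encode. A minimal sketch of the two-pass idiom, with hypothetical data:

    fn main() {
        let data = vec![Some(3u32), None, Some(1), Some(7)];

        // a lazy pipeline; nothing is computed yet
        let keys = data.iter().filter_map(|x| x.as_ref().copied());

        // first pass on a clone: find the max to size the encoding
        let max = keys.clone().max().unwrap_or(0);
        let num_bits = 32 - max.leading_zeros(); // bits needed to represent `max`

        // second pass: consume the original iterator
        let encoded: Vec<u32> = keys.collect();

        assert_eq!((num_bits, encoded), (3, vec![3, 1, 7]));
    }
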