Fix Clippy for the Rust 1.80 release (#6116)
* Fix clippy lints in arrow-data

* Fix clippy errors in arrow-array

* Fix clippy in concat

* Fix clippy in arrow-string

* Remove unnecessary feature in arrow-array

* Fix clippy in arrow-cast

* Fix clippy in parquet crate

* Fix clippy in arrow-flight
alamb authored Jul 25, 2024
1 parent fa2fbfd commit 3ebb033
Showing 21 changed files with 108 additions and 147 deletions.
1 change: 1 addition & 0 deletions arrow-array/Cargo.toml
@@ -52,6 +52,7 @@ hashbrown = { version = "0.14.2", default-features = false }

[features]
ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
force_validate = []

[dev-dependencies]
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
2 changes: 1 addition & 1 deletion arrow-array/src/array/byte_view_array.rs
@@ -66,7 +66,7 @@ use super::ByteArrayType;
/// * Strings with length <= 12 are stored directly in the view.
///
/// * Strings with length > 12: The first four bytes are stored inline in the
-/// view and the entire string is stored in one of the buffers.
+///   view and the entire string is stored in one of the buffers.
///
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
/// than that they must point into a valid buffer. However, they can be out of order,
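As an editor's aside, the view layout described above can be exercised through `StringViewArray` (the string instantiation of this layout); a minimal sketch with hypothetical values:

```rust
use arrow_array::{Array, StringViewArray};

fn main() {
    let array = StringViewArray::from(vec![
        "short str",                      // <= 12 bytes: stored inline in the view
        "a string longer than 12 bytes",  // > 12 bytes: 4-byte prefix + buffer offset
    ]);

    // Both values round-trip regardless of where they are stored
    assert_eq!(array.value(0), "short str");
    assert_eq!(array.value(1), "a string longer than 12 bytes");
    assert_eq!(array.len(), 2);
}
```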
64 changes: 0 additions & 64 deletions arrow-array/src/array/string_array.rs
@@ -376,70 +376,6 @@ mod tests {
.expect("All null array has valid array data");
}

#[cfg(feature = "test_utils")]
#[test]
fn bad_size_collect_string() {
use crate::util::test_util::BadIterator;
let data = vec![Some("foo"), None, Some("bar")];
let expected: StringArray = data.clone().into_iter().collect();

// Iterator reports too many items
let arr: StringArray = BadIterator::new(3, 10, data.clone()).collect();
assert_eq!(expected, arr);

// Iterator reports too few items
let arr: StringArray = BadIterator::new(3, 1, data.clone()).collect();
assert_eq!(expected, arr);
}

#[cfg(feature = "test_utils")]
#[test]
fn bad_size_collect_large_string() {
use crate::util::test_util::BadIterator;
let data = vec![Some("foo"), None, Some("bar")];
let expected: LargeStringArray = data.clone().into_iter().collect();

// Iterator reports too many items
let arr: LargeStringArray = BadIterator::new(3, 10, data.clone()).collect();
assert_eq!(expected, arr);

// Iterator reports too few items
let arr: LargeStringArray = BadIterator::new(3, 1, data.clone()).collect();
assert_eq!(expected, arr);
}

#[cfg(feature = "test_utils")]
#[test]
fn bad_size_iter_values_string() {
use crate::util::test_util::BadIterator;
let data = vec!["foo", "bar", "baz"];
let expected: StringArray = data.clone().into_iter().map(Some).collect();

// Iterator reports too many items
let arr = StringArray::from_iter_values(BadIterator::new(3, 10, data.clone()));
assert_eq!(expected, arr);

// Iterator reports too few items
let arr = StringArray::from_iter_values(BadIterator::new(3, 1, data.clone()));
assert_eq!(expected, arr);
}

#[cfg(feature = "test_utils")]
#[test]
fn bad_size_iter_values_large_string() {
use crate::util::test_util::BadIterator;
let data = vec!["foo", "bar", "baz"];
let expected: LargeStringArray = data.clone().into_iter().map(Some).collect();

// Iterator reports too many items
let arr = LargeStringArray::from_iter_values(BadIterator::new(3, 10, data.clone()));
assert_eq!(expected, arr);

// Iterator reports too few items
let arr = LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone()));
assert_eq!(expected, arr);
}

fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
let values = b"HelloArrowAndParquet";
// "ArrowAndParquet"
1 change: 1 addition & 0 deletions arrow-cast/Cargo.toml
@@ -38,6 +38,7 @@ features = ["prettyprint"]

[features]
prettyprint = ["comfy-table"]
force_validate = []

[dependencies]
arrow-array = { workspace = true }
33 changes: 16 additions & 17 deletions arrow-cast/src/cast/mod.rs
@@ -568,28 +568,27 @@ fn timestamp_to_date32<T: ArrowTimestampType>(
/// Accepts [`CastOptions`] to specify cast behavior. See also [`cast()`].
///
/// # Behavior
-/// * Boolean to Utf8: `true` => '1', `false` => `0`
-/// * Utf8 to boolean: `true`, `yes`, `on`, `1` => `true`, `false`, `no`, `off`, `0` => `false`,
+/// * `Boolean` to `Utf8`: `true` => '1', `false` => `0`
+/// * `Utf8` to `Boolean`: `true`, `yes`, `on`, `1` => `true`, `false`, `no`, `off`, `0` => `false`,
/// short variants are accepted, other strings return null or error
-/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings
+/// * `Utf8` to Numeric: strings that can't be parsed to numbers return null, float strings
/// in integer casts return null
-/// * Numeric to boolean: 0 returns `false`, any other value returns `true`
-/// * List to List: the underlying data type is cast
-/// * List to FixedSizeList: the underlying data type is cast. If safe is true and a list element
-/// has the wrong length it will be replaced with NULL, otherwise an error will be returned
-/// * Primitive to List: a list array with 1 value per slot is created
-/// * Date32 and Date64: precision lost when going to higher interval
-/// * Time32 and Time64: precision lost when going to higher interval
-/// * Timestamp and Date{32|64}: precision lost when going to higher interval
-/// * Temporal to/from backing primitive: zero-copy with data type change
-/// * Casting from `float32/float64` to `Decimal(precision, scale)` rounds to the `scale` decimals
-/// (i.e. casting `6.4999` to Decimal(10, 1) becomes `6.5`). Prior to version `26.0.0`,
-/// casting would truncate instead (i.e. outputs `6.4` instead)
+/// * Numeric to `Boolean`: 0 returns `false`, any other value returns `true`
+/// * `List` to `List`: the underlying data type is cast
+/// * `List` to `FixedSizeList`: the underlying data type is cast. If safe is true and a list element
+///   has the wrong length it will be replaced with NULL, otherwise an error will be returned
+/// * Primitive to `List`: a list array with 1 value per slot is created
+/// * `Date32` and `Date64`: precision lost when going to higher interval
+/// * `Time32` and `Time64`: precision lost when going to higher interval
+/// * `Timestamp` and `Date{32|64}`: precision lost when going to higher interval
+/// * Temporal to/from backing Primitive: zero-copy with data type change
+/// * `Float32/Float64` to `Decimal(precision, scale)` rounds to the `scale` decimals
+///   (i.e. casting `6.4999` to `Decimal(10, 1)` becomes `6.5`).
///
/// Unsupported Casts (check with `can_cast_types` before calling):
/// * To or from `StructArray`
-/// * List to primitive
-/// * Interval and duration
+/// * `List` to `Primitive`
+/// * `Interval` and `Duration`
///
/// # Timestamps and Timezones
///
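To make the `Utf8` to Numeric rule concrete, a short sketch (not part of the diff) using the crate's public `cast_with_options` and `CastOptions`:

```rust
use std::sync::Arc;
use arrow_array::{cast::AsArray, types::Int32Type, ArrayRef, StringArray};
use arrow_cast::cast::{cast_with_options, CastOptions};
use arrow_schema::DataType;

fn main() {
    let input: ArrayRef = Arc::new(StringArray::from(vec!["1", "abc", "3"]));

    // CastOptions defaults to safe casting, so unparseable strings
    // become null instead of returning an error.
    let options = CastOptions::default();
    let result = cast_with_options(&input, &DataType::Int32, &options).unwrap();

    let ints = result.as_primitive::<Int32Type>();
    assert_eq!(ints.value(0), 1);
    assert!(ints.is_null(1)); // "abc" could not be parsed
    assert_eq!(ints.value(2), 3);
}
```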
6 changes: 3 additions & 3 deletions arrow-cast/src/parse.rs
@@ -285,12 +285,12 @@ fn to_timestamp_nanos(dt: NaiveDateTime) -> Result<i64, ArrowError> {
/// variants and converts it to nanoseconds since midnight.
///
/// Examples of accepted inputs:
///
/// * `09:26:56.123 AM`
/// * `23:59:59`
/// * `6:00 pm`
-//
-/// Internally, this function uses the `chrono` library for the
-/// time parsing
+///
+/// Internally, this function uses the `chrono` library for the time parsing
///
/// ## Timezone / Offset Handling
///
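For illustration, the accepted forms can be parsed through the crate's public `Parser` trait (assumed here to expose this logic for `Time64NanosecondType`):

```rust
use arrow_array::types::Time64NanosecondType;
use arrow_cast::parse::Parser;

fn main() {
    // Result is nanoseconds since midnight
    let nanos = Time64NanosecondType::parse("23:59:59").unwrap();
    assert_eq!(nanos, (23 * 3600 + 59 * 60 + 59) * 1_000_000_000);

    // 12-hour clock and fractional seconds are accepted too
    assert!(Time64NanosecondType::parse("6:00 pm").is_some());
    assert!(Time64NanosecondType::parse("09:26:56.123 AM").is_some());

    // Unparseable input yields None rather than panicking
    assert!(Time64NanosecondType::parse("not a time").is_none());
}
```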
6 changes: 4 additions & 2 deletions arrow-data/src/data.rs
@@ -1179,8 +1179,10 @@ impl ArrayData {
///
/// Does not (yet) check
/// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
-/// Validates the the null count is correct and that any
-/// nullability requirements of its children are correct
+/// 2. the null count is correct and that any
+/// 3. nullability requirements of its children are correct
+///
+/// [#85]: https://github.com/apache/arrow-rs/issues/85
pub fn validate_nulls(&self) -> Result<(), ArrowError> {
if let Some(nulls) = &self.nulls {
let actual = nulls.len() - nulls.inner().count_set_bits();
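As a sketch of what this validation enforces (illustrative only, building the `ArrayData` via `arrow-array`):

```rust
use arrow_array::{Array, Int32Array};

fn main() {
    let array = Int32Array::from(vec![Some(1), None, Some(3)]);
    let data = array.to_data();

    // The recorded null count must match the unset bits in the
    // validity bitmap; children are checked for their own requirements.
    assert_eq!(data.null_count(), 1);
    assert!(data.validate_nulls().is_ok());
}
```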
13 changes: 10 additions & 3 deletions arrow-data/src/equal/mod.rs
@@ -138,17 +138,24 @@ fn equal_range(
}

/// Logically compares two [ArrayData].
+///
/// Two arrays are logically equal if and only if:
/// * their data types are equal
/// * their lengths are equal
/// * their null counts are equal
/// * their null bitmaps are equal
/// * each of their items are equal
-/// two items are equal when their in-memory representation is physically equal (i.e. same bit content).
+///
+/// Two items are equal when their in-memory representation is physically equal
+/// (i.e. has the same bit content).
+///
/// The physical comparison depends on the data type.
+///
/// # Panics
-/// This function may panic whenever any of the [ArrayData] does not follow the Arrow specification.
-/// (e.g. wrong number of buffers, buffer `len` does not correspond to the declared `len`)
+///
+/// This function may panic whenever any of the [ArrayData] does not follow the
+/// Arrow specification. (e.g. wrong number of buffers, buffer `len` does not
+/// correspond to the declared `len`)
pub fn equal(lhs: &ArrayData, rhs: &ArrayData) -> bool {
utils::base_equal(lhs, rhs)
&& lhs.null_count() == rhs.null_count()
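A sketch of this contract (assuming `equal` is re-exported at the `arrow_data` crate root; the exact path may differ by version):

```rust
use arrow_array::{Array, Int32Array};

fn main() {
    let a = Int32Array::from(vec![Some(1), None, Some(3)]);
    let b = Int32Array::from(vec![Some(1), None, Some(3)]);
    let c = Int32Array::from(vec![Some(1), None, Some(4)]);

    // Same data type, length, null count, null bitmap and values
    assert!(arrow_data::equal(&a.to_data(), &b.to_data()));

    // A single differing value breaks logical equality
    assert!(!arrow_data::equal(&a.to_data(), &c.to_data()));
}
```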
4 changes: 2 additions & 2 deletions arrow-flight/src/decode.rs
@@ -225,8 +225,8 @@ impl futures::Stream for FlightRecordBatchStream {
/// Example use cases
///
/// 1. Using this low level stream it is possible to receive a stream
-/// of RecordBatches in FlightData that have different schemas by
-/// handling multiple schema messages separately.
+///    of RecordBatches in FlightData that have different schemas by
+///    handling multiple schema messages separately.
pub struct FlightDataDecoder {
/// Underlying data stream
response: BoxStream<'static, Result<FlightData>>,
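A sketch of that use case (illustrative; assumes a `FlightDataDecoder` already constructed from some `FlightData` stream):

```rust
use arrow_flight::decode::{DecodedPayload, FlightDataDecoder};
use arrow_flight::error::FlightError;
use futures::StreamExt;

// Handle each low-level message explicitly, so a new schema arriving
// mid-stream can be processed instead of treated as an error.
async fn inspect(mut decoder: FlightDataDecoder) -> Result<(), FlightError> {
    while let Some(msg) = decoder.next().await {
        match msg?.payload {
            DecodedPayload::Schema(schema) => println!("new schema: {schema:?}"),
            DecodedPayload::RecordBatch(batch) => println!("{} rows", batch.num_rows()),
            DecodedPayload::None => {} // e.g. metadata-only message
        }
    }
    Ok(())
}
```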
13 changes: 8 additions & 5 deletions arrow-flight/src/encode.rs
@@ -38,11 +38,14 @@ use futures::{ready, stream::BoxStream, Stream, StreamExt};
/// several have already been successfully produced.
///
/// # Caveats
-/// 1. When [`DictionaryHandling`] is [`DictionaryHandling::Hydrate`], [`DictionaryArray`](arrow_array::array::DictionaryArray)s
-/// are converted to their underlying types prior to transport.
-/// When [`DictionaryHandling`] is [`DictionaryHandling::Resend`], Dictionary [`FlightData`] is sent with every
-/// [`RecordBatch`] that contains a [`DictionaryArray`](arrow_array::array::DictionaryArray).
-/// See <https://github.com/apache/arrow-rs/issues/3389>.
+/// 1. When [`DictionaryHandling`] is [`DictionaryHandling::Hydrate`],
+///    [`DictionaryArray`]s are converted to their underlying types prior to
+///    transport.
+///    When [`DictionaryHandling`] is [`DictionaryHandling::Resend`], Dictionary [`FlightData`] is sent with every
+///    [`RecordBatch`] that contains a [`DictionaryArray`](arrow_array::array::DictionaryArray).
+///    See <https://github.com/apache/arrow-rs/issues/3389>.
+///
+/// [`DictionaryArray`]: arrow_array::array::DictionaryArray
///
/// # Example
/// ```no_run
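Configuring this behavior might look like the following sketch (the `with_dictionary_handling` builder method is assumed from the current encoder API):

```rust
use std::sync::Arc;
use arrow_array::{types::Int32Type, ArrayRef, DictionaryArray, RecordBatch};
use arrow_flight::encode::{DictionaryHandling, FlightDataEncoderBuilder};
use arrow_flight::error::FlightError;
use futures::stream;

fn main() {
    // A batch containing a dictionary-encoded column
    let dict: DictionaryArray<Int32Type> = vec!["a", "b", "a"].into_iter().collect();
    let batch = RecordBatch::try_from_iter([("tags", Arc::new(dict) as ArrayRef)]).unwrap();

    // Resend dictionary FlightData with every batch rather than hydrating;
    // the resulting stream is then polled by the transport layer.
    let _flight_data = FlightDataEncoderBuilder::new()
        .with_dictionary_handling(DictionaryHandling::Resend)
        .build(stream::iter([Ok::<_, FlightError>(batch)]));
}
```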
8 changes: 4 additions & 4 deletions arrow-flight/src/lib.rs
@@ -26,14 +26,14 @@
//! This crate contains:
//!
//! 1. Low level [prost] generated structs
-//! for Flight gRPC protobuf messages, such as [`FlightData`], [`FlightInfo`],
-//! [`Location`] and [`Ticket`].
+//!    for Flight gRPC protobuf messages, such as [`FlightData`], [`FlightInfo`],
+//!    [`Location`] and [`Ticket`].
//!
//! 2. Low level [tonic] generated [`flight_service_client`] and
-//! [`flight_service_server`].
+//!    [`flight_service_server`].
//!
//! 3. Experimental support for [Flight SQL] in [`sql`]. Requires the
-//! `flight-sql-experimental` feature of this crate to be activated.
+//!    `flight-sql-experimental` feature of this crate to be activated.
//!
//! [Flight SQL]: https://arrow.apache.org/docs/format/FlightSql.html
#![allow(rustdoc::invalid_html_tags)]
4 changes: 2 additions & 2 deletions arrow-select/src/concat.rs
@@ -612,13 +612,13 @@ mod tests {
fn test_string_dictionary_merge() {
let mut builder = StringDictionaryBuilder::<Int32Type>::new();
for i in 0..20 {
-builder.append(&i.to_string()).unwrap();
+builder.append(i.to_string()).unwrap();
}
let input_1 = builder.finish();

let mut builder = StringDictionaryBuilder::<Int32Type>::new();
for i in 0..30 {
-builder.append(&i.to_string()).unwrap();
+builder.append(i.to_string()).unwrap();
}
let input_2 = builder.finish();

34 changes: 20 additions & 14 deletions arrow-string/src/substring.rs
@@ -34,11 +34,11 @@ use std::sync::Arc;
/// # Arguments
///
/// * `start` - The start index of all substrings.
-/// If `start >= 0`, then count from the start of the string,
-/// otherwise count from the end of the string.
+///   If `start >= 0`, then count from the start of the string,
+///   otherwise count from the end of the string.
///
/// * `length`(option) - The length of all substrings.
-/// If `length` is [None], then the substring is from `start` to the end of the string.
+///   If `length` is [None], then the substring is from `start` to the end of the string.
///
/// Attention: Both `start` and `length` are counted by byte, not by char.
///
@@ -53,10 +53,13 @@ use std::sync::Arc;
/// ```
///
/// # Error
-/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`]
-/// or [`DictionaryArray`] with supported array type as its value type.
-/// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array).
-/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars.
+/// - The function errors when the passed array is not a [`GenericStringArray`],
+///   [`GenericBinaryArray`], [`FixedSizeBinaryArray`] or [`DictionaryArray`]
+///   with supported array type as its value type.
+/// - The function errors if the offset of a substring in the input array is
+///   at invalid char boundary (only for \[Large\]String array).
+///   It is recommended to use [`substring_by_char`] if the input array may
+///   contain non-ASCII chars.
///
/// ## Example of trying to get an invalid UTF-8 substring
/// ```
@@ -155,22 +158,25 @@ pub fn substring(
}
}
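Ahead of the per-character variant documented below, a combined sketch of both kernels on hypothetical data:

```rust
use arrow_array::{cast::AsArray, StringArray};
use arrow_string::substring::{substring, substring_by_char};

fn main() {
    // Byte-based: 3 bytes starting at byte offset 1 (safe here: ASCII only)
    let array = StringArray::from(vec!["arrow", "parquet"]);
    let by_byte = substring(&array, 1, Some(3)).unwrap();
    assert_eq!(by_byte.as_string::<i32>().value(0), "rro");

    // Char-based: slower, but correct for non-ASCII values
    let utf8 = StringArray::from(vec!["héllo"]);
    let by_char = substring_by_char(&utf8, 1, Some(3)).unwrap();
    assert_eq!(by_char.value(0), "éll");
}
```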

/// Substrings based on character index
///
/// # Arguments
/// * `array` - The input string array
///
/// * `start` - The start index of all substrings.
-/// If `start >= 0`, then count from the start of the string,
-/// otherwise count from the end of the string.
+///   If `start >= 0`, then count from the start of the string,
+///   otherwise count from the end of the string.
///
/// * `length`(option) - The length of all substrings.
-/// If `length` is `None`, then the substring is from `start` to the end of the string.
+///   If `length` is `None`, then the substring is from `start` to the end of the string.
///
/// Attention: Both `start` and `length` are counted by char.
///
/// # Performance
-/// This function is slower than [substring].
-/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer.
-/// It is recommended to use [substring] if the input array only contains ASCII chars.
+///
+/// This function is slower than [substring]. Theoretically, the time complexity
+/// is `O(n)` where `n` is the length of the value buffer. It is recommended to
+/// use [substring] if the input array only contains ASCII chars.
///
/// # Basic usage
/// ```
@@ -396,7 +402,7 @@ mod tests {
/// A helper macro to test the substring functions.
/// # Arguments
/// * `cases` - The test cases, a vector of `(input, start, len, result)`.
-/// Please look at [`gen_test_cases`] to find how to generate it.
+///   Please look at [`gen_test_cases`] to find how to generate it.
/// * `array_ty` - The array type.
/// * `substring_fn` - Either [`substring`] or [`substring_by_char`].
macro_rules! do_test {
2 changes: 1 addition & 1 deletion arrow/Cargo.toml
@@ -75,7 +75,7 @@ pyarrow = ["pyo3", "ffi"]
# force_validate runs full data validation for all arrays that are created
# this is not enabled by default as it is too computationally expensive
# but is run as part of our CI checks
force_validate = ["arrow-data/force_validate"]
force_validate = ["arrow-array/force_validate", "arrow-data/force_validate"]
# Enable ffi support
ffi = ["arrow-schema/ffi", "arrow-data/ffi", "arrow-array/ffi"]
chrono-tz = ["arrow-array/chrono-tz"]
10 changes: 5 additions & 5 deletions parquet/src/arrow/arrow_reader/mod.rs
@@ -353,11 +353,11 @@ impl ArrowReaderOptions {
/// This structure allows
///
/// 1. Loading metadata for a file once and then using that same metadata to
-/// construct multiple separate readers, for example, to distribute readers
-/// across multiple threads
+///    construct multiple separate readers, for example, to distribute readers
+///    across multiple threads
///
/// 2. Using a cached copy of the [`ParquetMetadata`] rather than reading it
-/// from the file each time a reader is constructed.
+///    from the file each time a reader is constructed.
///
/// [`ParquetMetadata`]: crate::file::metadata::ParquetMetaData
#[derive(Debug, Clone)]
@@ -553,10 +553,10 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
/// This interface allows:
///
/// 1. Loading metadata once and using it to create multiple builders with
-/// potentially different settings or run on different threads
+///    potentially different settings or run on different threads
///
/// 2. Using a cached copy of the metadata rather than re-reading it from the
/// file each time a reader is constructed.
/// file each time a reader is constructed.
///
/// See the docs on [`ArrowReaderMetadata`] for more details
///
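A sketch of the load-once pattern described in both docs (the file path is hypothetical):

```rust
use parquet::arrow::arrow_reader::{
    ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder,
};
use std::fs::File;

fn main() {
    // Read and parse the footer once...
    let file = File::open("data.parquet").unwrap();
    let metadata = ArrowReaderMetadata::load(&file, ArrowReaderOptions::new()).unwrap();

    // ...then build any number of readers from cheap clones of it,
    // e.g. one per thread, without touching the footer again.
    for _ in 0..2 {
        let reader = ParquetRecordBatchReaderBuilder::new_with_metadata(
            file.try_clone().unwrap(),
            metadata.clone(),
        )
        .build()
        .unwrap();
        for batch in reader {
            println!("{} rows", batch.unwrap().num_rows());
        }
    }
}
```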