diff --git a/src/array/fixed_size_binary/mutable.rs b/src/array/fixed_size_binary/mutable.rs index 7717e594dda..2e5b728035f 100644 --- a/src/array/fixed_size_binary/mutable.rs +++ b/src/array/fixed_size_binary/mutable.rs @@ -150,7 +150,7 @@ impl MutableFixedSizeBinaryArray { std::slice::from_raw_parts(self.values.as_ptr().add(i * self.size), self.size) } - /// Shrinks the capacity of the [`MutablePrimitive`] to fit its current length. + /// Shrinks the capacity of the [`MutableFixedSizeBinaryArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); if let Some(validity) = &mut self.validity { diff --git a/src/array/fixed_size_list/mutable.rs b/src/array/fixed_size_list/mutable.rs index 13924106f8e..cf0850ba92b 100644 --- a/src/array/fixed_size_list/mutable.rs +++ b/src/array/fixed_size_list/mutable.rs @@ -74,7 +74,7 @@ impl MutableFixedSizeListArray { None => self.init_validity(), } } - /// Shrinks the capacity of the [`MutableFixedSizeList`] to fit its current length. + /// Shrinks the capacity of the [`MutableFixedSizeListArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); if let Some(validity) = &mut self.validity { diff --git a/src/array/growable/fixed_size_list.rs b/src/array/growable/fixed_size_list.rs index cabd0b75928..09b72a88dc5 100644 --- a/src/array/growable/fixed_size_list.rs +++ b/src/array/growable/fixed_size_list.rs @@ -22,7 +22,7 @@ pub struct GrowableFixedSizeList<'a> { } impl<'a> GrowableFixedSizeList<'a> { - /// Creates a new [`GrowableList`] bound to `arrays` with a pre-allocated `capacity`. + /// Creates a new [`GrowableFixedSizeList`] bound to `arrays` with a pre-allocated `capacity`. /// # Panics /// If `arrays` is empty. 
pub fn new( diff --git a/src/array/list/mutable.rs b/src/array/list/mutable.rs index 1892dc50955..fab320655c0 100644 --- a/src/array/list/mutable.rs +++ b/src/array/list/mutable.rs @@ -43,7 +43,7 @@ impl MutableListArray { } } - /// Shrinks the capacity of the [`MutableList`] to fit its current length. + /// Shrinks the capacity of the [`MutableListArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); self.offsets.shrink_to_fit(); diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index e8d764fddc3..7c4f7d7c518 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -238,7 +238,7 @@ impl MutablePrimitiveArray { Arc::new(a) } - /// Shrinks the capacity of the [`MutablePrimitive`] to fit its current length. + /// Shrinks the capacity of the [`MutablePrimitiveArray`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); if let Some(validity) = &mut self.validity { diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index 25d4521d883..bf7c7f9d68a 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -165,7 +165,7 @@ impl MutableUtf8Array { Arc::new(a) } - /// Shrinks the capacity of the [`MutableUtf8`] to fit its current length. + /// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length. pub fn shrink_to_fit(&mut self) { self.values.shrink_to_fit(); self.offsets.shrink_to_fit(); diff --git a/src/io/README.md b/src/io/README.md index 5c97d1a6ee1..26be0b337a6 100644 --- a/src/io/README.md +++ b/src/io/README.md @@ -6,17 +6,19 @@ This document describes the overall design of this module. * Each directory in this module corresponds to a specific format such as `csv` and `json`. * directories that depend on external dependencies MUST be feature gated, with a feature named with a prefix `io_`. 
-* modules MUST re-export any API of external dependencies they require as part of their public API. E.g. +* modules MUST re-export any API of external dependencies they require as part of their public API. + E.g. * if a module as an API `write(writer: &mut csv:Writer, ...)`, it MUST contain `pub use csv::Writer;`. The rational is that adding this crate to `cargo.toml` must be sufficient to use it. -* Each directory SHOULD contain two directories, `read` and `write`, corresponding to functionality about -reading from the format and writing to the format respectively. +* Each directory SHOULD contain two directories, `read` and `write`, corresponding + to functionality about reading from the format and writing to the format respectively. * The base module SHOULD contain `use pub read;` and `use pub write;`. * Implementations SHOULD separate reading of "data" from reading of "metadata". Examples: * schema read or inference SHOULD be a separate function * functions that read "data" SHOULD consume a schema typically pre-read. -* Implementations SHOULD separate IO-bounded operations from CPU-bounded operations. I.e. implementations SHOULD: +* Implementations SHOULD separate IO-bounded operations from CPU-bounded operations. + I.e. implementations SHOULD: * contain functions that consume a `Read` implementor and output a "raw" struct, i.e. a struct that is e.g. compressed and serialized * contain functions that consume a "raw" struct and convert it into Arrow. * offer each of these functions as independent public APIs, so that consumers can decide how to balance CPU-bounds and IO-bounds. diff --git a/src/io/csv/read/deserialize.rs b/src/io/csv/read/deserialize.rs index 14172e09a6b..7a6bcaf2bb9 100644 --- a/src/io/csv/read/deserialize.rs +++ b/src/io/csv/read/deserialize.rs @@ -198,7 +198,7 @@ pub fn deserialize_column( /// Deserializes rows [`ByteRecord`] into a [`RecordBatch`]. 
/// Note that this is a convenience function: column deserialization -///is trivially parallelizable (e.g. rayon). +/// is trivially parallelizable (e.g. rayon). pub fn deserialize_batch( rows: &[ByteRecord], fields: &[Field], diff --git a/src/io/csv/read/infer_schema.rs b/src/io/csv/read/infer_schema.rs index 7618ab7cb27..1b8fd060d0b 100644 --- a/src/io/csv/read/infer_schema.rs +++ b/src/io/csv/read/infer_schema.rs @@ -50,6 +50,16 @@ fn is_datetime(string: &str) -> Option { } /// Infers [`DataType`] from `bytes` +/// # Implementation +/// * case insensitive "true" or "false" are mapped to [`DataType::Boolean`] +/// * parsable to integer is mapped to [`DataType::Int64`] +/// * parsable to float is mapped to [`DataType::Float64`] +/// * parsable to date is mapped to [`DataType::Date32`] +/// * parsable to time is mapped to [`DataType::Time32(TimeUnit::Millisecond)`] +/// * parsable to naive datetime is mapped to [`DataType::Timestamp(TimeUnit::Millisecond, None)`] +/// * parsable to time-aware datetime is mapped to [`DataType::Timestamp`] of milliseconds and parsed offset. +/// * other utf8 is mapped to [`DataType::Utf8`] +/// * invalid utf8 is mapped to [`DataType::Binary`] pub fn infer(bytes: &[u8]) -> DataType { if is_boolean(bytes) { DataType::Boolean @@ -75,9 +85,8 @@ pub fn infer(bytes: &[u8]) -> DataType { } } -/// Infer the schema of a CSV file by reading through the first n records up to `max_rows`. -/// -/// Return infered schema and number of records used for inference. +/// Infers a [`Schema`] of a CSV file by reading through the first n records up to `max_rows`. 
+/// Seeks back to the beginning of the file _after_ the header pub fn infer_schema DataType>( reader: &mut Reader, max_rows: Option, diff --git a/src/io/csv/read/reader.rs b/src/io/csv/read/reader.rs index 14ddd8d07a1..e2b95a8e405 100644 --- a/src/io/csv/read/reader.rs +++ b/src/io/csv/read/reader.rs @@ -20,9 +20,9 @@ pub fn projected_schema(schema: &Schema, projection: Option<&[usize]>) -> Schema } } -/// Reads `len` rows from the CSV into Bytes, skiping `skip` +/// Reads `len` rows from `reader` into `row`, skipping `skip`. /// This operation has minimal CPU work and is thus the fastest way to read through a CSV -/// without deserializing the contents to arrow. +/// without deserializing the contents to Arrow. pub fn read_rows( reader: &mut Reader, skip: usize, diff --git a/src/record_batch.rs b/src/record_batch.rs index b7a53f4e1f3..02182af41df 100644 --- a/src/record_batch.rs +++ b/src/record_batch.rs @@ -51,7 +51,7 @@ impl RecordBatch { /// Creates a [`RecordBatch`] from a schema and columns, with additional options, /// such as whether to strictly validate field names. /// - /// See [`fn@try_new`] for the expected conditions. + /// See [`Self::try_new()`] for the expected conditions. pub fn try_new_with_options( schema: Arc, columns: Vec>, diff --git a/src/scalar/binary.rs b/src/scalar/binary.rs index f847e8f0c88..c5e1a40b978 100644 --- a/src/scalar/binary.rs +++ b/src/scalar/binary.rs @@ -2,6 +2,7 @@ use crate::{array::*, buffer::Buffer, datatypes::DataType}; use super::Scalar; +/// The [`Scalar`] implementation of binary (`Vec`). #[derive(Debug, Clone)] pub struct BinaryScalar { value: Buffer, @@ -16,6 +17,7 @@ impl PartialEq for BinaryScalar { } impl BinaryScalar { + /// Returns a new [`BinaryScalar`]. #[inline] pub fn new>(v: Option

) -> Self { let is_valid = v.is_some(); @@ -28,6 +30,7 @@ impl BinaryScalar { } } + /// The value irrespectively of the validity #[inline] pub fn value(&self) -> &[u8] { self.value.as_slice() diff --git a/src/scalar/boolean.rs b/src/scalar/boolean.rs index 7f0ffdaa0de..74df27534f3 100644 --- a/src/scalar/boolean.rs +++ b/src/scalar/boolean.rs @@ -2,6 +2,7 @@ use crate::datatypes::DataType; use super::Scalar; +/// The [`Scalar`] implementation of a boolean. #[derive(Debug, Clone)] pub struct BooleanScalar { value: bool, @@ -15,6 +16,7 @@ impl PartialEq for BooleanScalar { } impl BooleanScalar { + /// Returns a new [`BooleanScalar`] #[inline] pub fn new(v: Option) -> Self { let is_valid = v.is_some(); @@ -24,6 +26,7 @@ impl BooleanScalar { } } + /// The value irrespectively of the validity #[inline] pub fn value(&self) -> bool { self.value diff --git a/src/scalar/list.rs b/src/scalar/list.rs index 4623711a5b2..c8c505ef503 100644 --- a/src/scalar/list.rs +++ b/src/scalar/list.rs @@ -24,6 +24,7 @@ impl PartialEq for ListScalar { } impl ListScalar { + /// Returns a new [`ListScalar`] /// # Panics /// iff /// * the `data_type` is not `List` or `LargeList` (depending on this scalar's offset `O`) @@ -46,6 +47,7 @@ impl ListScalar { } } + /// The values of the [`ListScalar`] pub fn values(&self) -> &Arc { &self.values } diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs index aa826cab11e..abc5d1b6972 100644 --- a/src/scalar/mod.rs +++ b/src/scalar/mod.rs @@ -1,3 +1,4 @@ +#![warn(missing_docs)] //! contains the [`Scalar`] trait object representing individual items of [`Array`](crate::array::Array)s, //! as well as concrete implementations such as [`BooleanScalar`]. use std::any::Any; @@ -20,12 +21,16 @@ pub use null::*; mod struct_; pub use struct_::*; -/// Trait object declaring an optional value with a logical type. +/// Trait object declaring an optional value with a [`DataType`]. +/// This trait is often used in APIs that accept multiple scalar types.
pub trait Scalar: std::fmt::Debug { + /// Converts itself to a reference of [`Any`], which enables downcasting to concrete types fn as_any(&self) -> &dyn Any; + /// whether it is valid fn is_valid(&self) -> bool; + /// the logical type. fn data_type(&self) -> &DataType; } diff --git a/src/scalar/null.rs b/src/scalar/null.rs index 3751c6cfbd6..8957a150e95 100644 --- a/src/scalar/null.rs +++ b/src/scalar/null.rs @@ -2,10 +2,12 @@ use crate::datatypes::DataType; use super::Scalar; -#[derive(Debug, Clone, PartialEq)] +/// The representation of a single entry of a [`crate::array::NullArray`]. +#[derive(Debug, Clone, PartialEq, Eq)] pub struct NullScalar {} impl NullScalar { + /// A new [`NullScalar`] #[inline] pub fn new() -> Self { Self {} diff --git a/src/scalar/primitive.rs b/src/scalar/primitive.rs index 90265bf7cb0..f5796fffbb7 100644 --- a/src/scalar/primitive.rs +++ b/src/scalar/primitive.rs @@ -1,10 +1,13 @@ use crate::{ datatypes::DataType, types::{NativeType, NaturalDataType}, + error::ArrowError, }; use super::Scalar; +/// The implementation of [`Scalar`] for primitive, semantically equivalent to [`Option`] +/// with [`DataType`]. #[derive(Debug, Clone)] pub struct PrimitiveScalar { // Not Option because this offers a stabler pointer offset on the struct @@ -22,8 +25,17 @@ impl PartialEq for PrimitiveScalar { } impl PrimitiveScalar { + /// Returns a new [`PrimitiveScalar`]. #[inline] pub fn new(data_type: DataType, v: Option) -> Self { + if !T::is_valid(&data_type) { + Err(ArrowError::InvalidArgumentError(format!( + "Type {} does not support logical type {}", + std::any::type_name::(), + data_type + ))) + .unwrap() + } let is_valid = v.is_some(); Self { value: v.unwrap_or_default(), @@ -32,6 +44,7 @@ impl PrimitiveScalar { } } + /// Returns the value irrespectively of the validity.
#[inline] pub fn value(&self) -> T { self.value diff --git a/src/scalar/struct_.rs b/src/scalar/struct_.rs index eab4671f1dc..b7822b3ae02 100644 --- a/src/scalar/struct_.rs +++ b/src/scalar/struct_.rs @@ -4,6 +4,7 @@ use crate::datatypes::DataType; use super::Scalar; +/// A single entry of a [`crate::array::StructArray`]. #[derive(Debug, Clone)] pub struct StructScalar { values: Vec>, @@ -20,6 +21,7 @@ impl PartialEq for StructScalar { } impl StructScalar { + /// Returns a new [`StructScalar`] #[inline] pub fn new(data_type: DataType, values: Option>>) -> Self { let is_valid = values.is_some(); @@ -30,6 +32,7 @@ impl StructScalar { } } + /// Returns the values irrespectively of the validity. #[inline] pub fn values(&self) -> &[Arc] { &self.values diff --git a/src/scalar/utf8.rs b/src/scalar/utf8.rs index 207fe084f0c..60417cb7813 100644 --- a/src/scalar/utf8.rs +++ b/src/scalar/utf8.rs @@ -2,9 +2,10 @@ use crate::{array::*, buffer::Buffer, datatypes::DataType}; use super::Scalar; +/// The implementation of [`Scalar`] for utf8, semantically equivalent to [`Option<&str>`]. #[derive(Debug, Clone)] pub struct Utf8Scalar { - value: Buffer, + value: Buffer, // safety: valid utf8 is_valid: bool, phantom: std::marker::PhantomData, } @@ -16,6 +17,7 @@ impl PartialEq for Utf8Scalar { } impl Utf8Scalar { + /// Returns a new [`Utf8Scalar`] #[inline] pub fn new>(v: Option

) -> Self { let is_valid = v.is_some(); @@ -28,8 +30,10 @@ impl Utf8Scalar { } } + /// Returns the value irrespectively of the validity. #[inline] pub fn value(&self) -> &str { + // Safety: invariant of the struct unsafe { std::str::from_utf8_unchecked(self.value.as_slice()) } } }