From 45eb11c490b2c953f969ba2b00e944c728ef86ca Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 20 Sep 2021 14:48:37 +0000 Subject: [PATCH] Improved docs. --- src/array/mod.rs | 3 +- src/array/primitive/from_natural.rs | 16 +++-- src/array/primitive/mod.rs | 23 +++---- src/datatypes/mod.rs | 6 +- src/datatypes/physical_type.rs | 99 +++++++++++++++-------------- src/types/mod.rs | 5 +- 6 files changed, 80 insertions(+), 72 deletions(-) diff --git a/src/array/mod.rs b/src/array/mod.rs index fa5ea4d0794..5dbadb28846 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -47,8 +47,7 @@ pub trait Array: std::fmt::Debug + Send + Sync { /// When the validity is [`None`], all slots are valid. fn validity(&self) -> &Option; - /// The number of null slots on this [`Array`]. This is usually used to branch - /// implementations to cases where optimizations can be made. + /// The number of null slots on this [`Array`]. /// # Implementation /// This is `O(1)`. #[inline] diff --git a/src/array/primitive/from_natural.rs b/src/array/primitive/from_natural.rs index f07a219a577..93717f4a223 100644 --- a/src/array/primitive/from_natural.rs +++ b/src/array/primitive/from_natural.rs @@ -23,7 +23,9 @@ impl>> FromI } impl PrimitiveArray { - /// Creates a new array out an iterator over values + /// Creates a (non-null) [`PrimitiveArray`] from an iterator of values. + /// # Implementation + /// This does not assume that the iterator has a known length. pub fn from_values>(iter: I) -> Self { Self::from_data( T::DATA_TYPE, @@ -32,14 +34,18 @@ impl PrimitiveArray { ) } - /// Creates a new array out an iterator over values + /// Creates a (non-null) [`PrimitiveArray`] from a slice of values. + /// # Implementation + /// This is essentially a memcopy and is the fastest way to create a [`PrimitiveArray`]. 
pub fn from_slice>(slice: P) -> Self { Self::from_data(T::DATA_TYPE, Buffer::::from(slice), None) } } impl PrimitiveArray { - /// Creates a new array out an iterator over values + /// Creates a (non-null) [`PrimitiveArray`] from a [`TrustedLen`] of values. + /// # Implementation + /// The iterator's reported length is trusted (see [`TrustedLen`]). pub fn from_trusted_len_values_iter>(iter: I) -> Self { MutablePrimitiveArray::::from_trusted_len_values_iter(iter).into() } @@ -52,12 +58,12 @@ impl PrimitiveArray { MutablePrimitiveArray::::from_trusted_len_values_iter_unchecked(iter).into() } - /// Creates a new [`PrimitiveArray`] from an iterator over optional values + /// Creates a [`PrimitiveArray`] from a [`TrustedLen`] of optional values. pub fn from_trusted_len_iter>>(iter: I) -> Self { MutablePrimitiveArray::::from_trusted_len_iter(iter).into() } - /// Creates a new [`PrimitiveArray`] from an iterator over optional values + /// Creates a [`PrimitiveArray`] from an iterator of optional values. /// # Safety /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). /// I.e. that `size_hint().1` correctly reports its length. diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 074047ec35d..f2458f86e97 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -18,15 +18,15 @@ pub use mutable::*; /// A [`PrimitiveArray`] is arrow's equivalent to `Vec>`, i.e. /// an array designed for highly performant operations on optionally nullable slots, -/// backed by a physical type of a physical byte-width, such as `i32` or `f64`. +/// backed by a physical type of a fixed byte-width, such as `i32` or `f64`. /// The size of this struct is `O(1)` as all data is stored behind an [`std::sync::Arc`]. 
/// # Example /// ``` -/// use arrow2::array::PrimitiveArray; +/// use arrow2::{array::{Array, Int32Array}, bitmap::Bitmap}; /// # fn main() { -/// let array = PrimitiveArray::::from([Some(1), None, Some(2)]); -/// assert_eq!(array.value(0), 1); -/// assert_eq!(array.values().as_slice(), &[1, 0, 2]); +/// let array = Int32Array::from([Some(1), None, Some(10)]); +/// assert_eq!(array.values().as_slice(), &[1, 0, 10]); +/// assert_eq!(array.validity(), &Some(Bitmap::from([true, false, true]))); /// # } /// ``` #[derive(Debug, Clone)] @@ -95,7 +95,7 @@ impl PrimitiveArray { } /// Sets the validity bitmap on this [`PrimitiveArray`]. - /// # Panic + /// # Panics /// This function panics iff `validity.len() != self.len()`. pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { @@ -106,21 +106,22 @@ impl PrimitiveArray { arr } - /// The values [`Buffer`]. + /// The values. + /// Values on null slots are undetermined (they can be anything). #[inline] pub fn values(&self) -> &Buffer { &self.values } - /// Safe method to retrieve the value at slot `i`. - /// Equivalent to `self.values()[i]`. + /// Returns the value at slot `i`. Equivalent to `self.values()[i]`. + /// The value on null slots is undetermined (it can be anything). #[inline] pub fn value(&self, i: usize) -> T { self.values()[i] } - /// Returns the element at index `i` as `T` - /// + /// Returns the element at index `i` as `T`. + /// The value on null slots is undetermined (it can be anything). /// # Safety /// Caller must be sure that `i < self.len()` #[inline] diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index de3a3494226..01c62e5c55f 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -11,8 +11,8 @@ pub(crate) use field::{get_extension, Extension, Metadata}; /// The set of supported logical types. /// Each variant uniquely identifies a logical type, which define specific semantics to the data (e.g. how it should be represented). 
-/// A [`DataType`] has an unique corresponding [`PhysicalType`], obtained via [`DataType::to_physical_type`], -/// which uniquely identifies an in-memory representation of data. +/// Each variant has a corresponding [`PhysicalType`], obtained via [`DataType::to_physical_type`], +/// which declares the in-memory representation of data. /// The [`DataType::Extension`] is special in that it augments a [`DataType`] with metadata to support custom types. /// Use `to_logical_type` to desugar such type and return its correspoding logical type. #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -119,7 +119,7 @@ impl std::fmt::Display for DataType { } } -/// Time units defined in Arrow. +/// The time units defined in Arrow. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum TimeUnit { /// Time in seconds. diff --git a/src/datatypes/physical_type.rs b/src/datatypes/physical_type.rs index 11dde7e1829..aee6f99d122 100644 --- a/src/datatypes/physical_type.rs +++ b/src/datatypes/physical_type.rs @@ -1,6 +1,42 @@ -/// the set of valid indices used to index a dictionary-encoded Array. +/// The set of physical types: unique in-memory representations of an Arrow array. +/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and +/// a one-to-one mapping to each struct in this crate that implements [`crate::array::Array`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum DictionaryIndexType { +pub enum PhysicalType { + /// A Null with no allocation. + Null, + /// A boolean represented as a single bit. + Boolean, + /// An array where each slot has a known compile-time size. + Primitive(PrimitiveType), + /// Opaque binary data of variable length. + Binary, + /// Opaque binary data of fixed size. + FixedSizeBinary, + /// Opaque binary data of variable length and 64-bit offsets. + LargeBinary, + /// A variable-length string in Unicode with UTF-8 encoding. 
+ Utf8, + /// A variable-length string in Unicode with UTF-8 encoding and 64-bit offsets. + LargeUtf8, + /// A list of some data type with variable length. + List, + /// A list of some data type with fixed length. + FixedSizeList, + /// A list of some data type with variable length and 64-bit offsets. + LargeList, + /// A nested type that contains an arbitrary number of fields. + Struct, + /// A nested type that represents slots of differing types. + Union, + /// A dictionary encoded array by `DictionaryIndexType`. + Dictionary(DictionaryIndexType), +} + +/// The set of all (physical) primitive types. +/// Each type corresponds to a variant of [`crate::array::PrimitiveArray`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum PrimitiveType { /// A signed 8-bit integer. Int8, /// A signed 16-bit integer. @@ -9,6 +45,8 @@ pub enum DictionaryIndexType { Int32, /// A signed 64-bit integer. Int64, + /// A signed 128-bit integer. + Int128, /// An unsigned 8-bit integer. UInt8, /// An unsigned 16-bit integer. @@ -17,10 +55,20 @@ pub enum DictionaryIndexType { UInt32, /// An unsigned 64-bit integer. UInt64, + /// A 32-bit floating point number. + Float32, + /// A 64-bit floating point number. + Float64, + /// Two i32 representing days and ms + DaysMs, + /// months_days_ns(i32, i32, i64) + MonthDayNano, } +/// The set of valid index types of a dictionary-encoded Array. +/// Each type corresponds to a variant of [`crate::array::DictionaryArray`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum PrimitiveType { +pub enum DictionaryIndexType { /// A signed 8-bit integer. Int8, /// A signed 16-bit integer. @@ -29,8 +77,6 @@ pub enum PrimitiveType { Int32, /// A signed 64-bit integer. Int64, - /// A signed 128-bit integer. - Int128, /// An unsigned 8-bit integer. UInt8, /// An unsigned 16-bit integer. @@ -39,47 +85,4 @@ pub enum PrimitiveType { UInt32, /// An unsigned 64-bit integer. UInt64, - /// A 32-bit floating point number. 
- Float32, - /// A 64-bit floating point number. - Float64, - /// Two i32 representing days and ms - DaysMs, - /// months_days_ns(i32, i32, i64) - MonthDayNano, -} - -/// The set of physical types: unique in-memory representations of an Arrow array. -/// A physical type has a one-to-many relationship with a [`crate::datatypes::DataType`] and -/// a one-to-one mapping with each struct in this crate that implements [`crate::array::Array`]. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum PhysicalType { - /// A Null with no allocation. - Null, - /// A boolean represented as a single bit. - Boolean, - /// An array where each slot has a known compile-time size. - Primitive(PrimitiveType), - /// Opaque binary data of variable length. - Binary, - /// Opaque binary data of fixed size. - FixedSizeBinary, - /// Opaque binary data of variable length and 64-bit offsets. - LargeBinary, - /// A variable-length string in Unicode with UTF-8 encoding. - Utf8, - /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. - LargeUtf8, - /// A list of some data type with variable length. - List, - /// A list of some data type with fixed length. - FixedSizeList, - /// A list of some data type with variable length and 64-bit offsets. - LargeList, - /// A nested type that contains an arbitrary number of fields. - Struct, - /// A nested type that represents slots of differing types. - Union, - /// A dictionary encoded array by `DictionaryIndexType`. - Dictionary(DictionaryIndexType), } diff --git a/src/types/mod.rs b/src/types/mod.rs index 8e120ee274a..27dc0183996 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,7 +1,6 @@ -//! traits to handle _all physical types_ used in this crate. +//! traits to handle _all native types_ used in this crate. //! Most physical types used in this crate are native Rust types, like `i32`. -//! The most important trait is [`NativeType`], implemented for all Arrow types -//! 
with a Rust correspondence (such as `i32` or `f64`). +//! The most important trait is [`NativeType`], the generic trait of [`crate::array::PrimitiveArray`]. //! //! Another important trait is [`BitChunk`], describing types that can be used to //! represent chunks of bits (e.g. `u8`, `u16`), and [`BitChunkIter`], that can be used to