diff --git a/src/array/mod.rs b/src/array/mod.rs
index 7f27fe650cd..ff3a26c735b 100644
--- a/src/array/mod.rs
+++ b/src/array/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! Contains the [`Array`] and [`MutableArray`] trait objects declaring arrays,
 //! as well as concrete arrays (such as [`Utf8Array`] and [`MutableUtf8Array`]).
 //!
diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs
index 78b5da1ece2..1ee5c3f3018 100644
--- a/src/bitmap/mod.rs
+++ b/src/bitmap/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! contains [`Bitmap`] and [`MutableBitmap`], containers of `bool`.
 mod immutable;
 pub use immutable::*;
diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs
index 5841a147fd4..2ae6ff09909 100644
--- a/src/buffer/mod.rs
+++ b/src/buffer/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! Contains [`Buffer`], an immutable container for all Arrow physical types (e.g. i32, f64).

 mod immutable;
diff --git a/src/compute/mod.rs b/src/compute/mod.rs
index 99568b0aaf9..7667b320e53 100644
--- a/src/compute/mod.rs
+++ b/src/compute/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! contains a wide range of compute operations (e.g.
 //! [`arithmetics`], [`aggregate`],
 //! [`filter`], [`comparison`], and [`sort`])
diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs
index 10cdfecf4c0..8af8245d6c4 100644
--- a/src/datatypes/mod.rs
+++ b/src/datatypes/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 #![forbid(unsafe_code)]
 //! Contains all metadata, such as [`PhysicalType`], [`DataType`], [`Field`] and [`Schema`].

diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs
index 802939382d5..bcb20c06331 100644
--- a/src/ffi/mod.rs
+++ b/src/ffi/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! contains FFI bindings to import and export [`Array`](crate::array::Array) via
 //! Arrow's [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
 mod array;
diff --git a/src/io/avro/mod.rs b/src/io/avro/mod.rs
index 5f267abcb60..1ced5eda784 100644
--- a/src/io/avro/mod.rs
+++ b/src/io/avro/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! Read and write from and to Apache Avro

 pub mod read;
diff --git a/src/io/csv/mod.rs b/src/io/csv/mod.rs
index 00b14185051..32539fecdef 100644
--- a/src/io/csv/mod.rs
+++ b/src/io/csv/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! Convert data between the Arrow and CSV (comma-separated values).

 use crate::error::ArrowError;
diff --git a/src/io/flight/mod.rs b/src/io/flight/mod.rs
index 5db4c70770e..f0bf1d73cd1 100644
--- a/src/io/flight/mod.rs
+++ b/src/io/flight/mod.rs
@@ -1,3 +1,4 @@
+//! Serialization and deserialization to Arrow's flight protocol
 use std::sync::Arc;

 use arrow_format::flight::data::{FlightData, SchemaResult};
diff --git a/src/io/ipc/mod.rs b/src/io/ipc/mod.rs
index 64443afc002..e99d1a87ac8 100644
--- a/src/io/ipc/mod.rs
+++ b/src/io/ipc/mod.rs
@@ -91,15 +91,18 @@ const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
 /// to specify the dictionary ids of the IPC fields when writing to IPC.
 #[derive(Debug, Clone, PartialEq, Default)]
 pub struct IpcField {
-    // optional children
+    /// optional children
     pub fields: Vec<IpcField>,
-    // dictionary id
+    /// dictionary id
     pub dictionary_id: Option<i64>,
 }

+/// Struct containing fields and whether the file is written in little or big endian.
 #[derive(Debug, Clone, PartialEq)]
 pub struct IpcSchema {
+    /// The fields in the schema
     pub fields: Vec<IpcField>,
+    /// Endianness of the file
     pub is_little_endian: bool,
 }
diff --git a/src/io/ipc/read/mod.rs b/src/io/ipc/read/mod.rs
index 710376a8d95..22b3a2fe448 100644
--- a/src/io/ipc/read/mod.rs
+++ b/src/io/ipc/read/mod.rs
@@ -22,7 +22,7 @@ pub use reader::{read_file_metadata, FileMetadata, FileReader};
 pub use schema::deserialize_schema;
 pub use stream::{read_stream_metadata, StreamMetadata, StreamReader, StreamState};

-// how dictionaries are tracked in this crate
+/// how dictionaries are tracked in this crate
 pub type Dictionaries = HashMap<i64, Arc<dyn Array>>;

 pub(crate) type Node<'a> = arrow_format::ipc::FieldNodeRef<'a>;
diff --git a/src/io/ipc/read/reader.rs b/src/io/ipc/read/reader.rs
index e109ca9d202..3254eb4a7ef 100644
--- a/src/io/ipc/read/reader.rs
+++ b/src/io/ipc/read/reader.rs
@@ -14,6 +14,7 @@ use super::schema::fb_to_schema;
 use super::Dictionaries;
 use arrow_format::ipc::planus::{ReadAsRoot, Vector};

+/// Metadata of an Arrow IPC file, written in the footer of the file.
 #[derive(Debug, Clone)]
 pub struct FileMetadata {
     /// The schema that is read from the file footer
diff --git a/src/io/ipc/read/stream.rs b/src/io/ipc/read/stream.rs
index 3439afc24ec..370ea9f429d 100644
--- a/src/io/ipc/read/stream.rs
+++ b/src/io/ipc/read/stream.rs
@@ -15,13 +15,16 @@ use super::common::*;
 use super::schema::fb_to_schema;
 use super::Dictionaries;

+/// Metadata of an Arrow IPC stream, written at the start of the stream
 #[derive(Debug, Clone)]
 pub struct StreamMetadata {
     /// The schema that is read from the stream's first message
     pub schema: Schema,

+    /// The IPC version of the stream
     pub version: arrow_format::ipc::MetadataVersion,

+    /// The IPC fields tracking dictionaries
     pub ipc_schema: IpcSchema,
 }
diff --git a/src/io/ipc/write/serialize.rs b/src/io/ipc/write/serialize.rs
index c8f535c850f..bb7b245e974 100644
--- a/src/io/ipc/write/serialize.rs
+++ b/src/io/ipc/write/serialize.rs
@@ -458,6 +458,7 @@ pub fn _write_dictionary(
     }
 }

+/// Writes a dictionary array
 #[allow(clippy::too_many_arguments)]
 pub fn write_dictionary(
     array: &dyn Array,
@@ -488,6 +489,7 @@
     }
 }

+/// Writes an [`Array`] to `arrow_data`
 pub fn write(
     array: &dyn Array,
     buffers: &mut Vec<ipc::Buffer>,
diff --git a/src/io/ipc/write/writer.rs b/src/io/ipc/write/writer.rs
index 19546fc74aa..7c3527931de 100644
--- a/src/io/ipc/write/writer.rs
+++ b/src/io/ipc/write/writer.rs
@@ -74,6 +74,7 @@ impl<W: Write> FileWriter<W> {
         })
     }

+    /// Consumes itself into the inner writer
     pub fn into_inner(self) -> W {
         self.writer
     }
diff --git a/src/io/json/mod.rs b/src/io/json/mod.rs
index e52d8a9f45d..69eb364b65d 100644
--- a/src/io/json/mod.rs
+++ b/src/io/json/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! Convert data between the Arrow memory format and JSON line-delimited records.
 pub mod read;
diff --git a/src/io/json_integration/mod.rs b/src/io/json_integration/mod.rs
index 8c5f3e2556e..279f2ec55f0 100644
--- a/src/io/json_integration/mod.rs
+++ b/src/io/json_integration/mod.rs
@@ -11,8 +11,11 @@ pub mod write;
 /// A struct that represents an Arrow file with a schema and record batches
 #[derive(Deserialize, Serialize, Debug)]
 pub struct ArrowJson {
+    /// The schema
     pub schema: ArrowJsonSchema,
+    /// The batches
     pub batches: Vec<ArrowJsonBatch>,
+    /// The dictionaries
     #[serde(skip_serializing_if = "Option::is_none")]
     pub dictionaries: Option<Vec<ArrowJsonDictionaryBatch>>,
 }
@@ -22,7 +25,9 @@ pub struct ArrowJson {
 /// Fields are left as JSON `Value` as they vary by `DataType`
 #[derive(Deserialize, Serialize, Debug)]
 pub struct ArrowJsonSchema {
+    /// The fields
     pub fields: Vec<ArrowJsonField>,
+    /// The metadata
     #[serde(skip_serializing_if = "Option::is_none")]
     pub metadata: Option<Value>,
 }
@@ -30,31 +35,45 @@ pub struct ArrowJsonSchema {
 /// Fields are left as JSON `Value` as they vary by `DataType`
 #[derive(Deserialize, Serialize, Debug)]
 pub struct ArrowJsonField {
+    /// The name
     pub name: String,
+    /// The type
     #[serde(rename = "type")]
     pub field_type: Value,
+    /// whether it is nullable
     pub nullable: bool,
+    /// the children
     pub children: Vec<ArrowJsonField>,
+    /// the dictionary
     #[serde(skip_serializing_if = "Option::is_none")]
     pub dictionary: Option<ArrowJsonFieldDictionary>,
+    /// the fields' metadata
     #[serde(skip_serializing_if = "Option::is_none")]
     pub metadata: Option<Value>,
 }

+/// Dictionary metadata
 #[derive(Deserialize, Serialize, Debug)]
 pub struct ArrowJsonFieldDictionary {
+    /// the dictionary id
     pub id: i64,
+    /// the index type
     #[serde(rename = "indexType")]
     pub index_type: IntegerType,
+    /// whether it is ordered
     #[serde(rename = "isOrdered")]
     pub is_ordered: bool,
 }

+/// the type of the integer in the dictionary
 #[derive(Deserialize, Serialize, Debug)]
 pub struct IntegerType {
+    /// its name
     pub name: String,
+    /// whether it is signed
     #[serde(rename = "isSigned")]
     pub is_signed: bool,
+    /// the bit width
     #[serde(rename = "bitWidth")]
     pub bit_width: i64,
 }
@@ -63,6 +82,7 @@ pub struct IntegerType {
 #[derive(Deserialize, Serialize, Debug)]
 pub struct ArrowJsonBatch {
     count: usize,
+    /// the columns
     pub columns: Vec<ArrowJsonColumn>,
 }
@@ -70,7 +90,9 @@ pub struct ArrowJsonBatch {
 #[derive(Deserialize, Serialize, Debug)]
 #[allow(non_snake_case)]
 pub struct ArrowJsonDictionaryBatch {
+    /// the id
     pub id: i64,
+    /// the dictionary batch
     pub data: ArrowJsonBatch,
 }
@@ -78,14 +100,20 @@ pub struct ArrowJsonDictionaryBatch {
 #[derive(Deserialize, Serialize, Clone, Debug)]
 pub struct ArrowJsonColumn {
     name: String,
+    /// the number of elements
     pub count: usize,
+    /// the validity bitmap
     #[serde(rename = "VALIDITY")]
     pub validity: Option<Vec<u8>>,
+    /// the data
     #[serde(rename = "DATA")]
     pub data: Option<Vec<Value>>,
+    /// the offsets
     #[serde(rename = "OFFSET")]
     pub offset: Option<Vec<Value>>, // leaving as Value as 64-bit offsets are strings
+    /// the type id for union types
     #[serde(rename = "TYPE_ID")]
-    pub type_id: Option<Vec<Value>>, // for union types
+    pub type_id: Option<Vec<Value>>,
+    /// the children
     pub children: Option<Vec<ArrowJsonColumn>>,
 }
diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs
index 1118bb1978b..d8210c89dfc 100644
--- a/src/io/json_integration/read/array.rs
+++ b/src/io/json_integration/read/array.rs
@@ -410,6 +410,7 @@ pub fn to_array(
     }
 }

+/// Deserializes an [`ArrowJsonBatch`] to a [`Chunk`]
 pub fn deserialize_chunk(
     schema: &Schema,
     ipc_fields: &[IpcField],
diff --git a/src/io/json_integration/read/mod.rs b/src/io/json_integration/read/mod.rs
index 9a4e5318639..55e05752bc3 100644
--- a/src/io/json_integration/read/mod.rs
+++ b/src/io/json_integration/read/mod.rs
@@ -1,3 +1,4 @@
+//! API to read from Arrow JSON integration format
 mod array;
 pub use array::*;
 mod schema;
diff --git a/src/io/json_integration/write/mod.rs b/src/io/json_integration/write/mod.rs
index 9a4e5318639..9b0c22d8df8 100644
--- a/src/io/json_integration/write/mod.rs
+++ b/src/io/json_integration/write/mod.rs
@@ -1,3 +1,4 @@
+//! API to write to Arrow JSON integration format
 mod array;
 pub use array::*;
 mod schema;
diff --git a/src/io/parquet/read/file.rs b/src/io/parquet/read/file.rs
index 420e5fc7158..451131bb6c9 100644
--- a/src/io/parquet/read/file.rs
+++ b/src/io/parquet/read/file.rs
@@ -184,6 +184,7 @@ pub struct RowGroupReader {
 }

 impl<R: Read + Seek> RowGroupReader<R> {
+    /// Returns a new [`RowGroupReader`]
     pub fn new(
         reader: R,
         schema: Schema,
diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs
index 24c86444446..278610e298c 100644
--- a/src/io/parquet/read/mod.rs
+++ b/src/io/parquet/read/mod.rs
@@ -59,6 +59,7 @@ pub use schema::{get_schema, FileMetaData};
 use self::nested_utils::{InitNested, NestedArrayIter, NestedState};
 use deserialize::page_iter_to_arrays;

+/// Trait describing a [`FallibleStreamingIterator`] of [`DataPage`]
 pub trait DataPages:
     FallibleStreamingIterator<Item = DataPage, Error = ParquetError> + Send + Sync
 {
diff --git a/src/io/parquet/read/statistics/binary.rs b/src/io/parquet/read/statistics/binary.rs
index fc6d887a68d..12421994925 100644
--- a/src/io/parquet/read/statistics/binary.rs
+++ b/src/io/parquet/read/statistics/binary.rs
@@ -10,9 +10,13 @@ use crate::error::{ArrowError, Result};
 /// Represents a `Binary` or `LargeBinary`
 #[derive(Debug, Clone, PartialEq)]
 pub struct BinaryStatistics {
+    /// number of nulls
     pub null_count: Option<i64>,
+    /// number of distinct values
     pub distinct_count: Option<i64>,
+    /// Minimum
     pub min_value: Option<Vec<u8>>,
+    /// Maximum
     pub max_value: Option<Vec<u8>>,
 }
@@ -41,11 +45,16 @@ impl From<&ParquetByteArrayStatistics> for BinaryStatistics {
 }

+/// Statistics of a string parquet column
 #[derive(Debug, Clone, PartialEq)]
 pub struct Utf8Statistics {
+    /// number of nulls
     pub null_count: Option<i64>,
+    /// number of distinct values
     pub distinct_count: Option<i64>,
+    /// Minimum
     pub min_value: Option<String>,
+    /// Maximum
     pub max_value: Option<String>,
 }
diff --git a/src/io/parquet/read/statistics/boolean.rs b/src/io/parquet/read/statistics/boolean.rs
index 4817dbde3b3..30c462e1b9f 100644
--- a/src/io/parquet/read/statistics/boolean.rs
+++ b/src/io/parquet/read/statistics/boolean.rs
@@ -4,11 +4,16 @@ use std::any::Any;

 use super::Statistics;

+/// Statistics of a boolean parquet column
 #[derive(Debug, Clone, PartialEq)]
 pub struct BooleanStatistics {
+    /// number of nulls
     pub null_count: Option<i64>,
+    /// number of distinct values
     pub distinct_count: Option<i64>,
+    /// Minimum
     pub min_value: Option<bool>,
+    /// Maximum
     pub max_value: Option<bool>,
 }
diff --git a/src/io/parquet/read/statistics/fixlen.rs b/src/io/parquet/read/statistics/fixlen.rs
index 867c7e84e09..6c8ee9ddb8b 100644
--- a/src/io/parquet/read/statistics/fixlen.rs
+++ b/src/io/parquet/read/statistics/fixlen.rs
@@ -15,12 +15,18 @@ use parquet2::{
 use super::Statistics;

+/// Arrow-deserialized parquet Statistics of a fixed-len binary
 #[derive(Debug, Clone, PartialEq)]
 pub struct FixedLenStatistics {
+    /// number of nulls
     pub null_count: Option<i64>,
+    /// number of distinct values
     pub distinct_count: Option<i64>,
+    /// Minimum
     pub min_value: Option<Vec<u8>>,
+    /// Maximum
     pub max_value: Option<Vec<u8>>,
+    /// data type
     pub data_type: DataType,
 }
diff --git a/src/io/parquet/read/statistics/primitive.rs b/src/io/parquet/read/statistics/primitive.rs
index 0693c6ac9ed..91a630692df 100644
--- a/src/io/parquet/read/statistics/primitive.rs
+++ b/src/io/parquet/read/statistics/primitive.rs
@@ -10,12 +10,18 @@ use std::any::Any;
 use super::Statistics;
 use crate::error::Result;

+/// Arrow-deserialized parquet Statistics of a primitive type
 #[derive(Debug, Clone, PartialEq)]
 pub struct PrimitiveStatistics<T: NativeType> {
+    /// the data type
     pub data_type: DataType,
+    /// number of nulls
     pub null_count: Option<i64>,
+    /// number of distinct values
     pub distinct_count: Option<i64>,
+    /// Minimum
     pub min_value: Option<T>,
+    /// Maximum
     pub max_value: Option<T>,
 }
diff --git a/src/io/parquet/write/file.rs b/src/io/parquet/write/file.rs
index 649d6805334..47f595a1717 100644
--- a/src/io/parquet/write/file.rs
+++ b/src/io/parquet/write/file.rs
@@ -22,6 +22,7 @@ pub fn add_arrow_schema(
         .or_else(|| Some(vec![schema_to_metadata_key(schema)]))
 }

+/// An interface to write a parquet file to a [`Write`]
 pub struct FileWriter<W: Write> {
     writer: parquet2::write::FileWriter<W>,
     schema: Schema,
diff --git a/src/lib.rs b/src/lib.rs
index e00876b8d16..f96f933c39a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 #![doc = include_str!("doc/lib.md")]
+#![deny(missing_docs)]
 // So that we have more control over what is `unsafe` inside an `unsafe` block
 #![allow(unused_unsafe)]
 //
diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs
index 4910bb67c29..44a909c990f 100644
--- a/src/scalar/mod.rs
+++ b/src/scalar/mod.rs
@@ -1,4 +1,3 @@
-#![warn(missing_docs)]
 //! contains the [`Scalar`] trait object representing individual items of [`Array`](crate::array::Array)s,
 //! as well as concrete implementations such as [`BooleanScalar`].
 use std::any::Any;
diff --git a/src/types/mod.rs b/src/types/mod.rs
index 0125f119b08..4ba1584acf9 100644
--- a/src/types/mod.rs
+++ b/src/types/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
 //! Sealed traits and implementations to handle all _physical types_ used in this crate.
 //!
 //! Most physical types used in this crate are native Rust types, such as `i32`.