This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Made crate deny(missing_docs) #808

Merged
merged 1 commit into from Feb 5, 2022
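The whole diff is one mechanical change: `src/lib.rs` gains a crate-level `#![deny(missing_docs)]`, the per-module copies of the attribute become redundant and are deleted, and every public item that the module-level lints had not yet covered gets a doc comment. A minimal sketch of what the crate-level lint enforces, using a hypothetical toy crate rather than arrow2 itself:

```rust
// src/lib.rs of a toy crate.
#![deny(missing_docs)] // one crate-wide attribute instead of one per module

//! Demonstrates how `deny(missing_docs)` propagates to every public item.

/// A documented public function: accepted by the lint.
pub fn documented() {}

pub mod inner {
    //! Submodules inherit the crate-level lint, so no per-module attribute is needed.

    /// A documented public constant.
    pub const ANSWER: i32 = 42;
}

// Uncommenting this undocumented item fails the build with
// `error: missing documentation for a function`:
// pub fn undocumented() {}
```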
1 change: 0 additions & 1 deletion src/array/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! Contains the [`Array`] and [`MutableArray`] trait objects declaring arrays,
//! as well as concrete arrays (such as [`Utf8Array`] and [`MutableUtf8Array`]).
//!
1 change: 0 additions & 1 deletion src/bitmap/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! contains [`Bitmap`] and [`MutableBitmap`], containers of `bool`.
mod immutable;
pub use immutable::*;
1 change: 0 additions & 1 deletion src/buffer/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! Contains [`Buffer`], an immutable container for all Arrow physical types (e.g. i32, f64).

mod immutable;
1 change: 0 additions & 1 deletion src/compute/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! contains a wide range of compute operations (e.g.
//! [`arithmetics`], [`aggregate`],
//! [`filter`], [`comparison`], and [`sort`])
1 change: 0 additions & 1 deletion src/datatypes/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
#![forbid(unsafe_code)]
//! Contains all metadata, such as [`PhysicalType`], [`DataType`], [`Field`] and [`Schema`].

1 change: 0 additions & 1 deletion src/ffi/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! contains FFI bindings to import and export [`Array`](crate::array::Array) via
//! Arrow's [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
mod array;
1 change: 0 additions & 1 deletion src/io/avro/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! Read and write from and to Apache Avro

pub mod read;
1 change: 0 additions & 1 deletion src/io/csv/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! Convert data between Arrow and CSV (comma-separated values).

use crate::error::ArrowError;
1 change: 1 addition & 0 deletions src/io/flight/mod.rs
@@ -1,3 +1,4 @@
+//! Serialization and deserialization to Arrow's flight protocol
use std::sync::Arc;

use arrow_format::flight::data::{FlightData, SchemaResult};
7 changes: 5 additions & 2 deletions src/io/ipc/mod.rs
@@ -91,15 +91,18 @@ const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
/// to specify the dictionary ids of the IPC fields when writing to IPC.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct IpcField {
-    // optional children
+    /// optional children
    pub fields: Vec<IpcField>,
-    // dictionary id
+    /// dictionary id
    pub dictionary_id: Option<i64>,
}

+/// Struct containing fields and whether the file is written in little or big endian.
#[derive(Debug, Clone, PartialEq)]
pub struct IpcSchema {
+    /// The fields in the schema
    pub fields: Vec<IpcField>,
+    /// Endianness of the file
    pub is_little_endian: bool,
}
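The two structs above are the writer's view of dictionary tracking. A small sketch, with hypothetical ids and grounded only in the definitions shown in this hunk, of describing a schema whose single field has one dictionary-encoded child:

```rust
// One top-level field whose only child is dictionary-encoded
// under the (hypothetical) dictionary id 42.
let field = IpcField {
    fields: vec![IpcField {
        fields: vec![],
        dictionary_id: Some(42),
    }],
    dictionary_id: None, // the parent itself is not dictionary-encoded
};

let ipc_schema = IpcSchema {
    fields: vec![field],
    is_little_endian: true,
};
assert_eq!(ipc_schema.fields[0].fields[0].dictionary_id, Some(42));
```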
2 changes: 1 addition & 1 deletion src/io/ipc/read/mod.rs
@@ -22,7 +22,7 @@ pub use reader::{read_file_metadata, FileMetadata, FileReader};
pub use schema::deserialize_schema;
pub use stream::{read_stream_metadata, StreamMetadata, StreamReader, StreamState};

-// how dictionaries are tracked in this crate
+/// how dictionaries are tracked in this crate
pub type Dictionaries = HashMap<i64, Arc<dyn Array>>;

pub(crate) type Node<'a> = arrow_format::ipc::FieldNodeRef<'a>;
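`Dictionaries` maps a dictionary id to its deserialized values. A hedged sketch of populating it while reading; the id is hypothetical, and `Int32Array` with its `from_slice` constructor (from `crate::array`) is assumed to be in scope alongside this module's imports:

```rust
use std::{collections::HashMap, sync::Arc};

// Track the values of the dictionary with (hypothetical) id 0.
let values = Int32Array::from_slice([1, 2, 3]);
let mut dictionaries: Dictionaries = HashMap::new();
dictionaries.insert(0, Arc::new(values) as Arc<dyn Array>);
assert!(dictionaries.contains_key(&0));
```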
1 change: 1 addition & 0 deletions src/io/ipc/read/reader.rs
@@ -14,6 +14,7 @@ use super::schema::fb_to_schema;
use super::Dictionaries;
use arrow_format::ipc::planus::{ReadAsRoot, Vector};

+/// Metadata of an Arrow IPC file, written in the footer of the file.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// The schema that is read from the file footer
3 changes: 3 additions & 0 deletions src/io/ipc/read/stream.rs
@@ -15,13 +15,16 @@ use super::common::*;
use super::schema::fb_to_schema;
use super::Dictionaries;

+/// Metadata of an Arrow IPC stream, written at the start of the stream
#[derive(Debug, Clone)]
pub struct StreamMetadata {
    /// The schema that is read from the stream's first message
    pub schema: Schema,

+    /// The IPC version of the stream
    pub version: arrow_format::ipc::MetadataVersion,

+    /// The IPC fields tracking dictionaries
    pub ipc_schema: IpcSchema,
}

2 changes: 2 additions & 0 deletions src/io/ipc/write/serialize.rs
@@ -458,6 +458,7 @@ pub fn _write_dictionary<K: DictionaryKey>(
    }
}

+/// Writes a dictionary array
#[allow(clippy::too_many_arguments)]
pub fn write_dictionary(
    array: &dyn Array,
@@ -488,6 +489,7 @@ pub fn write_dictionary(
    }
}

+/// Writes an [`Array`] to `arrow_data`
pub fn write(
    array: &dyn Array,
    buffers: &mut Vec<ipc::Buffer>,
1 change: 1 addition & 0 deletions src/io/ipc/write/writer.rs
@@ -74,6 +74,7 @@ impl<W: Write> FileWriter<W> {
        })
    }

+    /// Consumes itself into the inner writer
    pub fn into_inner(self) -> W {
        self.writer
    }
1 change: 0 additions & 1 deletion src/io/json/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! Convert data between the Arrow memory format and JSON line-delimited records.

pub mod read;
30 changes: 29 additions & 1 deletion src/io/json_integration/mod.rs
@@ -11,8 +11,11 @@ pub mod write;
/// A struct that represents an Arrow file with a schema and record batches
#[derive(Deserialize, Serialize, Debug)]
pub struct ArrowJson {
+    /// The schema
    pub schema: ArrowJsonSchema,
+    /// The batches
    pub batches: Vec<ArrowJsonBatch>,
+    /// The dictionaries
    #[serde(skip_serializing_if = "Option::is_none")]
    pub dictionaries: Option<Vec<ArrowJsonDictionaryBatch>>,
}
@@ -22,39 +25,55 @@ pub struct ArrowJson {
/// Fields are left as JSON `Value` as they vary by `DataType`
#[derive(Deserialize, Serialize, Debug)]
pub struct ArrowJsonSchema {
+    /// The fields
    pub fields: Vec<ArrowJsonField>,
+    /// The metadata
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
}

/// Fields are left as JSON `Value` as they vary by `DataType`
#[derive(Deserialize, Serialize, Debug)]
pub struct ArrowJsonField {
+    /// The name
    pub name: String,
+    /// The type
    #[serde(rename = "type")]
    pub field_type: Value,
+    /// whether it is nullable
    pub nullable: bool,
+    /// the children
    pub children: Vec<ArrowJsonField>,
+    /// the dictionary
    #[serde(skip_serializing_if = "Option::is_none")]
    pub dictionary: Option<ArrowJsonFieldDictionary>,
+    /// the fields' metadata
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
}

+/// Dictionary metadata
#[derive(Deserialize, Serialize, Debug)]
pub struct ArrowJsonFieldDictionary {
+    /// the dictionary id
    pub id: i64,
+    /// the index type
    #[serde(rename = "indexType")]
    pub index_type: IntegerType,
+    /// whether it is ordered
    #[serde(rename = "isOrdered")]
    pub is_ordered: bool,
}

+/// the type of the integer in the dictionary
#[derive(Deserialize, Serialize, Debug)]
pub struct IntegerType {
+    /// its name
    pub name: String,
+    /// whether it is signed
    #[serde(rename = "isSigned")]
    pub is_signed: bool,
+    /// the bit width
    #[serde(rename = "bitWidth")]
    pub bit_width: i64,
}
@@ -63,29 +82,38 @@ pub struct IntegerType {
#[derive(Deserialize, Serialize, Debug)]
pub struct ArrowJsonBatch {
    count: usize,
+    /// the columns
    pub columns: Vec<ArrowJsonColumn>,
}

/// A struct that partially reads the Arrow JSON dictionary batch
#[derive(Deserialize, Serialize, Debug)]
#[allow(non_snake_case)]
pub struct ArrowJsonDictionaryBatch {
+    /// the id
    pub id: i64,
+    /// the dictionary batch
    pub data: ArrowJsonBatch,
}

/// A struct that partially reads the Arrow JSON column/array
#[derive(Deserialize, Serialize, Clone, Debug)]
pub struct ArrowJsonColumn {
    name: String,
+    /// the number of elements
    pub count: usize,
+    /// the validity bitmap
    #[serde(rename = "VALIDITY")]
    pub validity: Option<Vec<u8>>,
+    /// the data
    #[serde(rename = "DATA")]
    pub data: Option<Vec<Value>>,
+    /// the offsets
    #[serde(rename = "OFFSET")]
    pub offset: Option<Vec<Value>>, // leaving as Value as 64-bit offsets are strings
+    /// the type id for union types
    #[serde(rename = "TYPE_ID")]
-    pub type_id: Option<Vec<Value>>, // for union types
+    pub type_id: Option<Vec<Value>>,
+    /// the children
    pub children: Option<Vec<ArrowJsonColumn>>,
}
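These serde structs mirror the JSON emitted by Arrow's integration suite; the upper-case keys come from the `rename` attributes above. A hedged sketch of deserializing one column, assuming `serde_json` is available and `ArrowJsonColumn` is in scope (the values are hypothetical):

```rust
use serde_json::json;

// An Int32 column with logical values [1, null, 3]: a 0 in VALIDITY
// marks a null, and DATA keeps a placeholder at the null slot.
let column = json!({
    "name": "c1",
    "count": 3,
    "VALIDITY": [1, 0, 1],
    "DATA": [1, 0, 3],
});
let parsed: ArrowJsonColumn = serde_json::from_value(column).unwrap();
assert_eq!(parsed.count, 3);
```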
1 change: 1 addition & 0 deletions src/io/json_integration/read/array.rs
@@ -410,6 +410,7 @@ pub fn to_array(
    }
}

+/// Deserializes an [`ArrowJsonBatch`] to a [`Chunk`]
pub fn deserialize_chunk(
    schema: &Schema,
    ipc_fields: &[IpcField],
1 change: 1 addition & 0 deletions src/io/json_integration/read/mod.rs
@@ -1,3 +1,4 @@
+//! API to read from the Arrow JSON integration format
mod array;
pub use array::*;
mod schema;
1 change: 1 addition & 0 deletions src/io/json_integration/write/mod.rs
@@ -1,3 +1,4 @@
+//! API to write to the Arrow JSON integration format
mod array;
pub use array::*;
mod schema;
1 change: 1 addition & 0 deletions src/io/parquet/read/file.rs
@@ -184,6 +184,7 @@ pub struct RowGroupReader<R: Read + Seek> {
}

impl<R: Read + Seek> RowGroupReader<R> {
+    /// Returns a new [`RowGroupReader`]
    pub fn new(
        reader: R,
        schema: Schema,
1 change: 1 addition & 0 deletions src/io/parquet/read/mod.rs
@@ -59,6 +59,7 @@ pub use schema::{get_schema, FileMetaData};
use self::nested_utils::{InitNested, NestedArrayIter, NestedState};
use deserialize::page_iter_to_arrays;

+/// Trait describing a [`FallibleStreamingIterator`] of [`DataPage`]
pub trait DataPages:
    FallibleStreamingIterator<Item = DataPage, Error = ParquetError> + Send + Sync
{
9 changes: 9 additions & 0 deletions src/io/parquet/read/statistics/binary.rs
@@ -10,9 +10,13 @@ use crate::error::{ArrowError, Result};
/// Represents a `Binary` or `LargeBinary`
#[derive(Debug, Clone, PartialEq)]
pub struct BinaryStatistics {
+    /// number of nulls
    pub null_count: Option<i64>,
+    /// number of distinct values
    pub distinct_count: Option<i64>,
+    /// Minimum
    pub min_value: Option<Vec<u8>>,
+    /// Maximum
    pub max_value: Option<Vec<u8>>,
}

@@ -41,11 +45,16 @@ impl From<&ParquetByteArrayStatistics> for BinaryStatistics {
    }
}

+/// Statistics of a string parquet column
#[derive(Debug, Clone, PartialEq)]
pub struct Utf8Statistics {
+    /// number of nulls
    pub null_count: Option<i64>,
+    /// number of distinct values
    pub distinct_count: Option<i64>,
+    /// Minimum
    pub min_value: Option<String>,
+    /// Maximum
    pub max_value: Option<String>,
}

5 changes: 5 additions & 0 deletions src/io/parquet/read/statistics/boolean.rs
@@ -4,11 +4,16 @@ use std::any::Any;

use super::Statistics;

+/// Statistics of a boolean parquet column
#[derive(Debug, Clone, PartialEq)]
pub struct BooleanStatistics {
+    /// number of nulls
    pub null_count: Option<i64>,
+    /// number of distinct values
    pub distinct_count: Option<i64>,
+    /// Minimum
    pub min_value: Option<bool>,
+    /// Maximum
    pub max_value: Option<bool>,
}

6 changes: 6 additions & 0 deletions src/io/parquet/read/statistics/fixlen.rs
@@ -15,12 +15,18 @@ use parquet2::{

use super::Statistics;

+/// Arrow-deserialized parquet Statistics of a fixed-len binary
#[derive(Debug, Clone, PartialEq)]
pub struct FixedLenStatistics {
+    /// number of nulls
    pub null_count: Option<i64>,
+    /// number of distinct values
    pub distinct_count: Option<i64>,
+    /// Minimum
    pub min_value: Option<Vec<u8>>,
+    /// Maximum
    pub max_value: Option<Vec<u8>>,
+    /// data type
    pub data_type: DataType,
}

6 changes: 6 additions & 0 deletions src/io/parquet/read/statistics/primitive.rs
@@ -10,12 +10,18 @@ use std::any::Any;
use super::Statistics;
use crate::error::Result;

+/// Arrow-deserialized parquet Statistics of a primitive type
#[derive(Debug, Clone, PartialEq)]
pub struct PrimitiveStatistics<T: NativeType> {
+    /// the data type
    pub data_type: DataType,
+    /// number of nulls
    pub null_count: Option<i64>,
+    /// number of distinct values
    pub distinct_count: Option<i64>,
+    /// Minimum
    pub min_value: Option<T>,
+    /// Maximum
    pub max_value: Option<T>,
}
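All of these statistics structs share the same shape: optional counts plus optional min/max. A sketch with hypothetical values of what `PrimitiveStatistics` would carry for an `Int32` column containing `[1, null, 3]`:

```rust
// Hypothetical statistics for an Int32 column [1, null, 3].
let stats = PrimitiveStatistics::<i32> {
    data_type: DataType::Int32,
    null_count: Some(1),
    distinct_count: None, // writers may omit distinct counts
    min_value: Some(1),
    max_value: Some(3),
};
assert_eq!((stats.min_value, stats.max_value), (Some(1), Some(3)));
```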

1 change: 1 addition & 0 deletions src/io/parquet/write/file.rs
@@ -22,6 +22,7 @@ pub fn add_arrow_schema(
        .or_else(|| Some(vec![schema_to_metadata_key(schema)]))
}

+/// An interface to write a parquet file to a [`Write`]
pub struct FileWriter<W: Write> {
    writer: parquet2::write::FileWriter<W>,
    schema: Schema,
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,4 +1,5 @@
#![doc = include_str!("doc/lib.md")]
+#![deny(missing_docs)]
// So that we have more control over what is `unsafe` inside an `unsafe` block
#![allow(unused_unsafe)]
//
1 change: 0 additions & 1 deletion src/scalar/mod.rs
@@ -1,4 +1,3 @@
-#![warn(missing_docs)]
//! contains the [`Scalar`] trait object representing individual items of [`Array`](crate::array::Array)s,
//! as well as concrete implementations such as [`BooleanScalar`].
use std::any::Any;
1 change: 0 additions & 1 deletion src/types/mod.rs
@@ -1,4 +1,3 @@
-#![deny(missing_docs)]
//! Sealed traits and implementations to handle all _physical types_ used in this crate.
//!
//! Most physical types used in this crate are native Rust types, such as `i32`.