This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Improved docs.
jorgecarleitao committed Oct 2, 2021
1 parent 2d84c2d commit 2be5d34
Showing 19 changed files with 66 additions and 20 deletions.
2 changes: 1 addition & 1 deletion src/array/fixed_size_binary/mutable.rs
@@ -150,7 +150,7 @@ impl MutableFixedSizeBinaryArray {
std::slice::from_raw_parts(self.values.as_ptr().add(i * self.size), self.size)
}

/// Shrinks the capacity of the [`MutablePrimitive`] to fit its current length.
/// Shrinks the capacity of the [`MutableFixedSizeBinaryArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(validity) = &mut self.validity {
2 changes: 1 addition & 1 deletion src/array/fixed_size_list/mutable.rs
@@ -74,7 +74,7 @@ impl<M: MutableArray> MutableFixedSizeListArray<M> {
None => self.init_validity(),
}
}
/// Shrinks the capacity of the [`MutableFixedSizeList`] to fit its current length.
/// Shrinks the capacity of the [`MutableFixedSizeListArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(validity) = &mut self.validity {
2 changes: 1 addition & 1 deletion src/array/growable/fixed_size_list.rs
@@ -22,7 +22,7 @@ pub struct GrowableFixedSizeList<'a> {
}

impl<'a> GrowableFixedSizeList<'a> {
/// Creates a new [`GrowableList`] bound to `arrays` with a pre-allocated `capacity`.
/// Creates a new [`GrowableFixedSizeList`] bound to `arrays` with a pre-allocated `capacity`.
/// # Panics
/// If `arrays` is empty.
pub fn new(
2 changes: 1 addition & 1 deletion src/array/list/mutable.rs
@@ -43,7 +43,7 @@ impl<O: Offset, M: MutableArray + Default> MutableListArray<O, M> {
}
}

/// Shrinks the capacity of the [`MutableList`] to fit its current length.
/// Shrinks the capacity of the [`MutableListArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
self.offsets.shrink_to_fit();
2 changes: 1 addition & 1 deletion src/array/primitive/mutable.rs
@@ -238,7 +238,7 @@ impl<T: NativeType> MutablePrimitiveArray<T> {
Arc::new(a)
}

/// Shrinks the capacity of the [`MutablePrimitive`] to fit its current length.
/// Shrinks the capacity of the [`MutablePrimitiveArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
if let Some(validity) = &mut self.validity {
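The `shrink_to_fit` methods whose doc links are fixed above all follow the same pattern: shrink the values buffer and, if one is present, the validity bitmap. A minimal std-only sketch of that pattern (`MiniMutableArray` is a hypothetical stand-in, not an arrow2 type; arrow2 uses a packed bitmap rather than `Vec<bool>` for validity):

```rust
/// Hypothetical mutable array: a values buffer plus an optional validity buffer.
struct MiniMutableArray {
    values: Vec<u8>,
    validity: Option<Vec<bool>>, // arrow2 packs this into a bitmap; Vec<bool> keeps the sketch simple
}

impl MiniMutableArray {
    /// Shrinks the capacity of both buffers to fit their current lengths.
    fn shrink_to_fit(&mut self) {
        self.values.shrink_to_fit();
        if let Some(validity) = &mut self.validity {
            validity.shrink_to_fit();
        }
    }
}

fn main() {
    let mut a = MiniMutableArray {
        values: Vec::with_capacity(1024),
        validity: None,
    };
    a.values.extend_from_slice(&[1, 2, 3]);
    a.shrink_to_fit();
    // After shrinking, capacity is close to the length (the exact value is up to the allocator).
    assert!(a.values.capacity() >= 3);
    println!("capacity after shrink: {}", a.values.capacity());
}
```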
2 changes: 1 addition & 1 deletion src/array/utf8/mutable.rs
@@ -165,7 +165,7 @@ impl<O: Offset> MutableUtf8Array<O> {
Arc::new(a)
}

/// Shrinks the capacity of the [`MutableUtf8`] to fit its current length.
/// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
self.offsets.shrink_to_fit();
10 changes: 6 additions & 4 deletions src/io/README.md
@@ -6,17 +6,19 @@ This document describes the overall design of this module.

* Each directory in this module corresponds to a specific format such as `csv` and `json`.
* directories that depend on external dependencies MUST be feature gated, with a feature named with a prefix `io_`.
* modules MUST re-export any API of external dependencies they require as part of their public API. E.g.
* modules MUST re-export any API of external dependencies they require as part of their public API.
E.g.
* if a module has an API `write(writer: &mut csv::Writer<W>, ...)`, it MUST contain `pub use csv::Writer;`.

The rationale is that adding this crate to `Cargo.toml` must be sufficient to use it.
* Each directory SHOULD contain two directories, `read` and `write`, corresponding to functionality about
reading from the format and writing to the format respectively.
* Each directory SHOULD contain two directories, `read` and `write`, corresponding
to functionality about reading from the format and writing to the format respectively.
* The base module SHOULD contain `pub use read;` and `pub use write;`.
* Implementations SHOULD separate reading of "data" from reading of "metadata". Examples:
* schema read or inference SHOULD be a separate function
* functions that read "data" SHOULD consume a schema typically pre-read.
* Implementations SHOULD separate IO-bounded operations from CPU-bounded operations. I.e. implementations SHOULD:
* Implementations SHOULD separate IO-bounded operations from CPU-bounded operations.
I.e. implementations SHOULD:
* contain functions that consume a `Read` implementor and output a "raw" struct, i.e. a struct that is e.g. compressed and serialized
* contain functions that consume a "raw" struct and convert it into Arrow.
* offer each of these functions as independent public APIs, so that consumers can decide how to balance CPU-bounds and IO-bounds.
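The IO-bounded/CPU-bounded split described in the README can be sketched with two hypothetical functions (`read_raw` and `deserialize` are illustrative names, not arrow2 APIs): one that only pulls bytes from a `Read` implementor, and one that turns those bytes into typed values, standing in for "convert it into Arrow".

```rust
use std::io::{Cursor, Read};

/// IO-bounded step: only copies bytes out of the reader; no parsing.
fn read_raw<R: Read>(reader: &mut R, len: usize) -> std::io::Result<Vec<u8>> {
    let mut raw = vec![0u8; len];
    reader.read_exact(&mut raw)?;
    Ok(raw)
}

/// CPU-bounded step: deserializes the raw bytes into typed values.
fn deserialize(raw: &[u8]) -> Vec<i64> {
    raw.chunks_exact(8)
        .map(|c| i64::from_le_bytes(c.try_into().unwrap()))
        .collect()
}

fn main() -> std::io::Result<()> {
    let bytes: Vec<u8> = [1i64, 2, 3].iter().flat_map(|v| v.to_le_bytes()).collect();
    let mut reader = Cursor::new(bytes);
    let raw = read_raw(&mut reader, 24)?; // could run on an IO thread
    let values = deserialize(&raw); // could run on a CPU thread pool
    assert_eq!(values, vec![1, 2, 3]);
    Ok(())
}
```

Because the two steps are independent public functions, a consumer can pipeline them however it likes, e.g. reading raw batches on one thread while deserializing on another.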
2 changes: 1 addition & 1 deletion src/io/csv/read/deserialize.rs
@@ -198,7 +198,7 @@ pub fn deserialize_column(

/// Deserializes rows [`ByteRecord`] into a [`RecordBatch`].
/// Note that this is a convenience function: column deserialization
///is trivially parallelizable (e.g. rayon).
/// is trivially parallelizable (e.g. rayon).
pub fn deserialize_batch<F>(
rows: &[ByteRecord],
fields: &[Field],
15 changes: 12 additions & 3 deletions src/io/csv/read/infer_schema.rs
@@ -50,6 +50,16 @@ fn is_datetime(string: &str) -> Option<String> {
}

/// Infers [`DataType`] from `bytes`
/// # Implementation
/// * case insensitive "true" or "false" are mapped to [`DataType::Boolean`]
/// * parsable to integer is mapped to [`DataType::Int64`]
/// * parsable to float is mapped to [`DataType::Float64`]
/// * parsable to date is mapped to [`DataType::Date32`]
/// * parsable to time is mapped to [`DataType::Time32(TimeUnit::Millisecond)`]
/// * parsable to naive datetime is mapped to [`DataType::Timestamp(TimeUnit::Millisecond, None)`]
/// * parsable to time-aware datetime is mapped to [`DataType::Timestamp`] of milliseconds and parsed offset.
/// * other utf8 is mapped to [`DataType::Utf8`]
/// * invalid utf8 is mapped to [`DataType::Binary`]
pub fn infer(bytes: &[u8]) -> DataType {
if is_boolean(bytes) {
DataType::Boolean
@@ -75,9 +85,8 @@ pub fn infer(bytes: &[u8]) -> DataType {
}
}
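The inference cascade documented above can be sketched std-only (a simplified stand-in, not arrow2's `infer`: the real function also handles dates, times, and datetimes, and uses a local `DataType` far richer than this enum):

```rust
/// Trimmed-down stand-in for arrow2's DataType, covering only the cases sketched here.
#[derive(Debug, PartialEq)]
enum DataType {
    Boolean,
    Int64,
    Float64,
    Utf8,
    Binary,
}

/// Infers a DataType from raw bytes, checking the most specific types first.
fn infer(bytes: &[u8]) -> DataType {
    match std::str::from_utf8(bytes) {
        Err(_) => DataType::Binary, // invalid utf8
        Ok(s) => {
            if s.eq_ignore_ascii_case("true") || s.eq_ignore_ascii_case("false") {
                DataType::Boolean
            } else if s.parse::<i64>().is_ok() {
                DataType::Int64
            } else if s.parse::<f64>().is_ok() {
                DataType::Float64
            } else {
                DataType::Utf8
            }
        }
    }
}

fn main() {
    assert_eq!(infer(b"TRUE"), DataType::Boolean);
    assert_eq!(infer(b"42"), DataType::Int64);
    assert_eq!(infer(b"4.2"), DataType::Float64);
    assert_eq!(infer(b"hello"), DataType::Utf8);
    assert_eq!(infer(&[0xff, 0xfe]), DataType::Binary);
}
```

Note the ordering matters: `"42"` parses as both `i64` and `f64`, so integers must be tried before floats, mirroring the documented mapping.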

/// Infer the schema of a CSV file by reading through the first n records up to `max_rows`.
///
/// Return infered schema and number of records used for inference.
/// Infers a [`Schema`] of a CSV file by reading through the first n records up to `max_rows`.
/// Seeks back to the beginning of the file _after_ the header.
pub fn infer_schema<R: Read + Seek, F: Fn(&[u8]) -> DataType>(
reader: &mut Reader<R>,
max_rows: Option<usize>,
4 changes: 2 additions & 2 deletions src/io/csv/read/reader.rs
@@ -20,9 +20,9 @@ pub fn projected_schema(schema: &Schema, projection: Option<&[usize]>) -> Schema
}
}

/// Reads `len` rows from the CSV into Bytes, skiping `skip`
/// Reads `len` rows from `reader` into `row`, skipping `skip`.
/// This operation has minimal CPU work and is thus the fastest way to read through a CSV
/// without deserializing the contents to arrow.
/// without deserializing the contents to Arrow.
pub fn read_rows<R: Read>(
reader: &mut Reader<R>,
skip: usize,
2 changes: 1 addition & 1 deletion src/record_batch.rs
@@ -51,7 +51,7 @@ impl RecordBatch {
/// Creates a [`RecordBatch`] from a schema and columns, with additional options,
/// such as whether to strictly validate field names.
///
/// See [`fn@try_new`] for the expected conditions.
/// See [`Self::try_new()`] for the expected conditions.
pub fn try_new_with_options(
schema: Arc<Schema>,
columns: Vec<Arc<dyn Array>>,
3 changes: 3 additions & 0 deletions src/scalar/binary.rs
@@ -2,6 +2,7 @@ use crate::{array::*, buffer::Buffer, datatypes::DataType};

use super::Scalar;

/// The [`Scalar`] implementation of binary (`Vec<u8>`).
#[derive(Debug, Clone)]
pub struct BinaryScalar<O: Offset> {
value: Buffer<u8>,
@@ -16,6 +17,7 @@ impl<O: Offset> PartialEq for BinaryScalar<O> {
}

impl<O: Offset> BinaryScalar<O> {
/// Returns a new [`BinaryScalar`].
#[inline]
pub fn new<P: AsRef<[u8]>>(v: Option<P>) -> Self {
let is_valid = v.is_some();
@@ -28,6 +30,7 @@ impl<O: Offset> BinaryScalar<O> {
}
}

/// Its value, irrespectively of the validity
#[inline]
pub fn value(&self) -> &[u8] {
self.value.as_slice()
3 changes: 3 additions & 0 deletions src/scalar/boolean.rs
@@ -2,6 +2,7 @@ use crate::datatypes::DataType;

use super::Scalar;

/// The [`Scalar`] implementation of a boolean.
#[derive(Debug, Clone)]
pub struct BooleanScalar {
value: bool,
@@ -15,6 +16,7 @@ impl PartialEq for BooleanScalar {
}

impl BooleanScalar {
/// Returns a new [`BooleanScalar`]
#[inline]
pub fn new(v: Option<bool>) -> Self {
let is_valid = v.is_some();
@@ -24,6 +26,7 @@ impl BooleanScalar {
}
}

/// The value irrespectively of the validity
#[inline]
pub fn value(&self) -> bool {
self.value
2 changes: 2 additions & 0 deletions src/scalar/list.rs
@@ -24,6 +24,7 @@ impl<O: Offset> PartialEq for ListScalar<O> {
}

impl<O: Offset> ListScalar<O> {
/// returns a new [`ListScalar`]
/// # Panics
/// iff
/// * the `data_type` is not `List` or `LargeList` (depending on this scalar's offset `O`)
@@ -46,6 +47,7 @@ impl<O: Offset> ListScalar<O> {
}
}

/// The values of the [`ListScalar`]
pub fn values(&self) -> &Arc<dyn Array> {
&self.values
}
7 changes: 6 additions & 1 deletion src/scalar/mod.rs
@@ -1,3 +1,4 @@
#![warn(missing_docs)]
//! contains the [`Scalar`] trait object representing individual items of [`Array`](crate::array::Array)s,
//! as well as concrete implementations such as [`BooleanScalar`].
use std::any::Any;
@@ -20,12 +21,16 @@ pub use null::*;
mod struct_;
pub use struct_::*;

/// Trait object declaring an optional value with a logical type.
/// Trait object declaring an optional value with a [`DataType`].
/// This trait is often used in APIs that accept multiple scalar types.
pub trait Scalar: std::fmt::Debug {
/// convert itself to
fn as_any(&self) -> &dyn Any;

/// whether it is valid
fn is_valid(&self) -> bool;

/// the logical type.
fn data_type(&self) -> &DataType;
}

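The `Scalar` trait-object pattern documented above hinges on `as_any`: a caller holding a `&dyn Scalar` uses it to recover the concrete type. A std-only sketch (a simplified stand-in for arrow2's trait, omitting `data_type`):

```rust
use std::any::Any;

/// Simplified stand-in for arrow2's `Scalar` trait object.
trait Scalar: std::fmt::Debug {
    /// Upcast to `Any`, enabling downcasts to the concrete type.
    fn as_any(&self) -> &dyn Any;
    /// Whether the value is valid (non-null).
    fn is_valid(&self) -> bool;
}

#[derive(Debug)]
struct BooleanScalar {
    value: bool,
    is_valid: bool,
}

impl Scalar for BooleanScalar {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn is_valid(&self) -> bool {
        self.is_valid
    }
}

fn main() {
    let scalar: Box<dyn Scalar> = Box::new(BooleanScalar { value: true, is_valid: true });
    // An API accepting multiple scalar types can branch on the concrete type:
    if let Some(b) = scalar.as_any().downcast_ref::<BooleanScalar>() {
        assert!(b.is_valid());
        assert!(b.value);
    } else {
        panic!("expected a BooleanScalar");
    }
}
```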
4 changes: 3 additions & 1 deletion src/scalar/null.rs
@@ -2,10 +2,12 @@ use crate::datatypes::DataType;

use super::Scalar;

#[derive(Debug, Clone, PartialEq)]
/// The representation of a single entry of a [`crate::array::NullArray`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NullScalar {}

impl NullScalar {
/// A new [`NullScalar`]
#[inline]
pub fn new() -> Self {
Self {}
13 changes: 13 additions & 0 deletions src/scalar/primitive.rs
@@ -1,10 +1,13 @@
use crate::{
datatypes::DataType,
types::{NativeType, NaturalDataType},
error::ArrowError,
};

use super::Scalar;

/// The implementation of [`Scalar`] for primitive, semantically equivalent to [`Option<T>`]
/// with [`DataType`].
#[derive(Debug, Clone)]
pub struct PrimitiveScalar<T: NativeType> {
// Not Option<T> because this offers a stabler pointer offset on the struct
@@ -22,8 +25,17 @@ impl<T: NativeType> PartialEq for PrimitiveScalar<T> {
}

impl<T: NativeType> PrimitiveScalar<T> {
/// Returns a new [`PrimitiveScalar`].
#[inline]
pub fn new(data_type: DataType, v: Option<T>) -> Self {
if !T::is_valid(&data_type) {
Err(ArrowError::InvalidArgumentError(format!(
"Type {} does not support logical type {}",
std::any::type_name::<T>(),
data_type
)))
.unwrap()
}
let is_valid = v.is_some();
Self {
value: v.unwrap_or_default(),
@@ -32,6 +44,7 @@ impl<T: NativeType> PrimitiveScalar<T> {
}
}

/// Returns the value irrespectively of its validity.
#[inline]
pub fn value(&self) -> T {
self.value
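The struct comment in the diff above notes that `PrimitiveScalar` stores `value: T` plus `is_valid: bool` rather than `Option<T>`, for a stabler pointer offset. A std-only sketch of that layout choice (`MiniPrimitiveScalar` is hypothetical; the real type also carries a `DataType` and validates it against `T`):

```rust
/// Hypothetical primitive scalar: value + validity flag instead of Option<T>.
#[derive(Debug, Clone)]
struct MiniPrimitiveScalar<T: Copy + Default> {
    value: T, // an unspecified default when `is_valid` is false
    is_valid: bool,
}

impl<T: Copy + Default> MiniPrimitiveScalar<T> {
    fn new(v: Option<T>) -> Self {
        let is_valid = v.is_some();
        Self {
            value: v.unwrap_or_default(),
            is_valid,
        }
    }

    /// The value irrespectively of the validity, mirroring `value()` in the diff.
    fn value(&self) -> T {
        self.value
    }

    fn is_valid(&self) -> bool {
        self.is_valid
    }
}

fn main() {
    let a = MiniPrimitiveScalar::new(Some(3i32));
    let b: MiniPrimitiveScalar<i32> = MiniPrimitiveScalar::new(None);
    assert!(a.is_valid() && a.value() == 3);
    assert!(!b.is_valid() && b.value() == 0); // a default placeholder, not a real 0
}
```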
3 changes: 3 additions & 0 deletions src/scalar/struct_.rs
@@ -4,6 +4,7 @@ use crate::datatypes::DataType;

use super::Scalar;

/// A single entry of a [`crate::array::StructArray`].
#[derive(Debug, Clone)]
pub struct StructScalar {
values: Vec<Arc<dyn Scalar>>,
@@ -20,6 +21,7 @@ impl PartialEq for StructScalar {
}

impl StructScalar {
/// Returns a new [`StructScalar`]
#[inline]
pub fn new(data_type: DataType, values: Option<Vec<Arc<dyn Scalar>>>) -> Self {
let is_valid = values.is_some();
@@ -30,6 +32,7 @@ impl StructScalar {
}
}

/// Returns the values irrespectively of the validity.
#[inline]
pub fn values(&self) -> &[Arc<dyn Scalar>] {
&self.values
6 changes: 5 additions & 1 deletion src/scalar/utf8.rs
@@ -2,9 +2,10 @@ use crate::{array::*, buffer::Buffer, datatypes::DataType};

use super::Scalar;

/// The implementation of [`Scalar`] for utf8, semantically equivalent to [`Option<&str>`].
#[derive(Debug, Clone)]
pub struct Utf8Scalar<O: Offset> {
value: Buffer<u8>,
value: Buffer<u8>, // safety: valid utf8
is_valid: bool,
phantom: std::marker::PhantomData<O>,
}
@@ -16,6 +17,7 @@ impl<O: Offset> PartialEq for Utf8Scalar<O> {
}

impl<O: Offset> Utf8Scalar<O> {
/// Returns a new [`Utf8Scalar`]
#[inline]
pub fn new<P: AsRef<str>>(v: Option<P>) -> Self {
let is_valid = v.is_some();
@@ -28,8 +30,10 @@ impl<O: Offset> Utf8Scalar<O> {
}
}

/// Returns the value irrespectively of the validity.
#[inline]
pub fn value(&self) -> &str {
// Safety: invariant of the struct
unsafe { std::str::from_utf8_unchecked(self.value.as_slice()) }
}
}
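The `// safety: valid utf8` field comment and the `// Safety: invariant of the struct` note added in the diff above document one invariant: the bytes are validated as UTF-8 at construction, so the unchecked read is sound. A std-only sketch of that invariant (`MiniUtf8Scalar` is hypothetical, not arrow2's type):

```rust
/// Hypothetical UTF-8 scalar demonstrating a constructor-enforced safety invariant.
struct MiniUtf8Scalar {
    value: Vec<u8>, // safety invariant: always valid UTF-8
}

impl MiniUtf8Scalar {
    fn new(v: &str) -> Self {
        // `&str` is valid UTF-8 by construction, so the invariant holds.
        Self {
            value: v.as_bytes().to_vec(),
        }
    }

    fn value(&self) -> &str {
        // Safety: invariant of the struct (upheld by `new`).
        unsafe { std::str::from_utf8_unchecked(&self.value) }
    }
}

fn main() {
    let s = MiniUtf8Scalar::new("café");
    assert_eq!(s.value(), "café");
}
```

The design choice is that validation cost is paid once, at construction, while every later read skips the UTF-8 check.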
