diff --git a/src/array/growable/list.rs b/src/array/growable/list.rs index bd2f805cf77..676b7cc07d4 100644 --- a/src/array/growable/list.rs +++ b/src/array/growable/list.rs @@ -65,7 +65,7 @@ pub struct GrowableList<'a, O: Offset> { } impl<'a, O: Offset> GrowableList<'a, O> { - /// Creates a new [`GrowableFixedSizeBinary`] bound to `arrays` with a pre-allocated `capacity`. + /// Creates a new [`GrowableList`] bound to `arrays` with a pre-allocated `capacity`. /// # Panics /// If `arrays` is empty. pub fn new(arrays: Vec<&'a ListArray<O>>, mut use_validity: bool, capacity: usize) -> Self { diff --git a/src/array/mod.rs b/src/array/mod.rs index 4fce51612c1..0ab4030debc 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -1,18 +1,21 @@ -//! fixed-length and immutable containers with optional values +//! Contains the [`Array`] and [`MutableArray`] trait objects declaring arrays, +//! as well as concrete arrays (such as [`Utf8Array`] and [`MutableUtf8Array`]). +//! +//! Fixed-length containers with optional values //! that are laid out in memory according to the Arrow specification. //! Each array type has its own `struct`. The following are the main array types: -//! * [`PrimitiveArray`], an array of values with a fixed length such as integers, floats, etc. -//! * [`BooleanArray`], an array of boolean values (stored as a bitmap) -//! * [`Utf8Array`], an array of utf8 values -//! * [`BinaryArray`], an array of binary values -//! * [`ListArray`], an array of arrays (e.g. `[[1, 2], None, [], [None]]`) +//! * [`PrimitiveArray`] and [`MutablePrimitiveArray`], an array of values with a fixed length such as integers, floats, etc. +//! * [`BooleanArray`] and [`MutableBooleanArray`], an array of boolean values (stored as a bitmap) +//! * [`Utf8Array`] and [`MutableUtf8Array`], an array of variable length utf8 values +//! * [`BinaryArray`] and [`MutableBinaryArray`], an array of opaque variable length values +//! * [`ListArray`] and [`MutableListArray`], an array of arrays (e.g. 
`[[1, 2], None, [], [None]]`) //! * [`StructArray`], an array of arrays identified by a string (e.g. `{"a": [1, 2], "b": [true, false]}`) -//! All arrays implement the trait [`Array`] and are often trait objects that can be downcasted -//! to a concrete struct based on [`DataType`] available from [`Array::data_type`]. -//! Arrays share memory via [`crate::buffer::Buffer`] and thus cloning and slicing them `O(1)`. +//! All immutable arrays implement the trait object [`Array`] and can be downcasted +//! to a concrete struct based on [`PhysicalType`](crate::datatypes::PhysicalType) available from [`Array::data_type`]. +//! All immutable arrays are backed by [`Buffer`](crate::buffer::Buffer) and thus cloning and slicing them is `O(1)`. //! -//! This module also contains the mutable counterparts of arrays, that are neither clonable nor slicable, but that -//! can be operated in-place, such as [`MutablePrimitiveArray`] and [`MutableUtf8Array`]. +//! Most arrays have a [`MutableArray`] counterpart that is neither clonable nor slicable, but +//! can be operated in-place. use std::any::Any; use std::fmt::Display; diff --git a/src/bitmap/mod.rs b/src/bitmap/mod.rs index c8974b0a031..78b5da1ece2 100644 --- a/src/bitmap/mod.rs +++ b/src/bitmap/mod.rs @@ -1,7 +1,5 @@ #![deny(missing_docs)] -//! Contains efficient containers of booleans: [`Bitmap`] and [`MutableBitmap`]. -//! The memory backing these containers is cache-aligned and optimized for both vertical -//! and horizontal operations over booleans. +//! contains [`Bitmap`] and [`MutableBitmap`], containers of `bool`. mod immutable; pub use immutable::*; diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 6a75438a4d1..f64aad44fd7 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -1,6 +1,6 @@ #![deny(missing_docs)] -//! Contains containers for all Arrow sized types (e.g. `i32`), -//! [`Buffer`] and [`MutableBuffer`]. +//! Contains [`Buffer`] and [`MutableBuffer`], containers for all Arrow +//! 
physical types (e.g. i32, f64). mod immutable; mod mutable; diff --git a/src/compute/comparison/mod.rs b/src/compute/comparison/mod.rs index 35e3d324f15..c688e2a1fe7 100644 --- a/src/compute/comparison/mod.rs +++ b/src/compute/comparison/mod.rs @@ -23,7 +23,7 @@ //! inputs the two items for comparison and an [`Operator`] which specifies the //! type of comparison that will be conducted, such as `<=` ([`Operator::LtEq`]). //! -//! Much like the parent module [`crate::compute`](compute), the comparison functions +//! Much like the parent module [`compute`](crate::compute), the comparison functions //! have two variants - a statically typed one ([`primitive_compare`]) //! which expects concrete types such as [`Int8Array`] and a dynamically typed //! variant ([`compare`]) that compares values of type `&dyn Array` and errors diff --git a/src/compute/mod.rs b/src/compute/mod.rs index f117cd426b2..51fcca52ef5 100644 --- a/src/compute/mod.rs +++ b/src/compute/mod.rs @@ -1,7 +1,11 @@ -//! Contains operators over arrays. This module's general design is +//! contains a wide range of compute operations (e.g. +//! [`arithmetics`], [`aggregate`], +//! [`filter`], [`comparison`], and [`sort`]) +//! +//! This module's general design is //! that each operator has two interfaces, a statically-typed version and a dynamically-typed //! version. -//! The statically-typed version expects concrete arrays (like `PrimitiveArray`); +//! The statically-typed version expects concrete arrays (such as [`PrimitiveArray`](crate::array::PrimitiveArray)); //! the dynamically-typed version expects `&dyn Array` and errors if the type is not //! supported. //! Some dynamically-typed operators have an auxiliary function, `can_*`, that returns diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 01c62e5c55f..5aadaee994d 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -1,4 +1,4 @@ -//! Metadata declarations such as [`DataType`], [`Field`] and [`Schema`]. +//! 
Contains all metadata, such as [`PhysicalType`], [`DataType`], [`Field`] and [`Schema`]. mod field; mod physical_type; mod schema; diff --git a/src/doc/lib.md b/src/doc/lib.md new file mode 100644 index 00000000000..83db4ae84b4 --- /dev/null +++ b/src/doc/lib.md @@ -0,0 +1,67 @@ +Welcome to arrow2's documentation. Thanks for checking it out! + +This is a library for efficient in-memory data operations using +[Arrow in-memory format](https://arrow.apache.org/docs/format/Columnar.html). +It is a re-write from the bottom up of the official `arrow` crate with soundness +and type safety in mind. + +Check out [the guide](https://jorgecarleitao.github.io/arrow2/) for an introduction. +Below is an example of some of the things you can do with it: + +```rust +use std::sync::Arc; + +use arrow2::array::*; +use arrow2::compute::arithmetics; +use arrow2::error::Result; +use arrow2::io::parquet::write::*; +use arrow2::record_batch::RecordBatch; + +fn main() -> Result<()> { + // declare arrays + let a = Int32Array::from(&[Some(1), None, Some(3)]); + let b = Int32Array::from(&[Some(2), None, Some(6)]); + + // compute (probably the fastest implementation of a nullable op you can find out there) + let c = arithmetics::basic::mul_scalar(&a, &2); + assert_eq!(c, b); + + // declare records + let batch = RecordBatch::try_from_iter([ + ("c1", Arc::new(a) as Arc<dyn Array>), + ("c2", Arc::new(b) as Arc<dyn Array>), + ])?; + // with metadata + println!("{:?}", batch.schema()); + + // write to parquet (probably the fastest implementation of writing to parquet out there) + let schema = batch.schema().clone(); + + let options = WriteOptions { + write_statistics: true, + compression: Compression::Snappy, + version: Version::V1, + }; + + let row_groups = RowGroupIterator::try_new( + vec![Ok(batch)].into_iter(), + &schema, + options, + vec![Encoding::Plain, Encoding::Plain], + )?; + + let mut file = std::fs::File::create("test.parquet")?; + + let parquet_schema = row_groups.parquet_schema().clone(); + let _ = 
write_file( + &mut file, + row_groups, + &schema, + parquet_schema, + options, + None, + )?; + + Ok(()) +} +``` diff --git a/src/ffi/mod.rs b/src/ffi/mod.rs index 8c3dac73fc6..788cc7635d5 100644 --- a/src/ffi/mod.rs +++ b/src/ffi/mod.rs @@ -1,5 +1,5 @@ -//! Contains interfaces to use the -//! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). +//! contains FFI bindings to import and export [`Array`](crate::array::Array) via +//! Arrow's [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) mod array; #[allow(clippy::module_inception)] mod ffi; @@ -19,16 +19,16 @@ pub use schema::Ffi_ArrowSchema; use self::schema::to_field; -/// Exports an `Array` to the C data interface. +/// Exports an [`Arc<dyn Array>`] to the C data interface. /// # Safety -/// The pointer must be allocated and valid +/// The pointer `ptr` must be allocated and valid pub unsafe fn export_array_to_c(array: Arc<dyn Array>, ptr: *mut Ffi_ArrowArray) { *ptr = Ffi_ArrowArray::new(array); } /// Exports a [`Field`] to the C data interface. /// # Safety -/// The pointer must be allocated and valid +/// The pointer `ptr` must be allocated and valid pub unsafe fn export_field_to_c(field: &Field, ptr: *mut Ffi_ArrowSchema) { *ptr = Ffi_ArrowSchema::new(field) } diff --git a/src/io/mod.rs b/src/io/mod.rs index 3bddcbc20a9..27cc75e229f 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -1,4 +1,5 @@ -//! Interact with different formats such as Arrow, CSV, parquet, etc. +//! Contains modules to interface with other formats such as [`csv`], +//! [`parquet`], [`json`], [`ipc`], [`mod@print`] and [`avro`]. #[cfg(any(feature = "io_csv_read", feature = "io_csv_write"))] #[cfg_attr(docsrs, doc(cfg(feature = "io_csv")))] pub mod csv; diff --git a/src/lib.rs b/src/lib.rs index f7bc3d339b6..c329968da9e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,4 @@ -//! 
Doc provided by README - +#![doc = include_str!("doc/lib.md")] // So that we have more control over what is `unsafe` inside an `unsafe` block #![allow(unused_unsafe)] #![cfg_attr(docsrs, feature(doc_cfg))] diff --git a/src/record_batch.rs b/src/record_batch.rs index 40164d59eb8..b7a53f4e1f3 100644 --- a/src/record_batch.rs +++ b/src/record_batch.rs @@ -1,58 +1,27 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! A two-dimensional batch of column-oriented data with a defined -//! [schema](crate::datatypes::Schema). - +//! Contains [`RecordBatch`]. use std::sync::Arc; use crate::array::*; use crate::datatypes::*; use crate::error::{ArrowError, Result}; -type ArrayRef = Arc<dyn Array>; - -/// A two-dimensional batch of column-oriented data with a defined -/// [schema](crate::datatypes::Schema). -/// -/// A `RecordBatch` is a two-dimensional dataset of a number of -/// contiguous arrays, each the same length. -/// A record batch has a schema which must match its arrays' -/// datatypes. -/// -/// Record batches are a convenient unit of work for various -/// serialization and computation functions, possibly incremental. 
+/// A two-dimensional dataset with a number of +/// columns ([`Array`]) and rows and a defined [`Schema`](crate::datatypes::Schema). +/// # Implementation +/// Cloning is `O(C)` where `C` is the number of columns. #[derive(Clone, Debug, PartialEq)] pub struct RecordBatch { schema: Arc<Schema>, - columns: Vec<ArrayRef>, + columns: Vec<Arc<dyn Array>>, } impl RecordBatch { - /// Creates a `RecordBatch` from a schema and columns. - /// - /// Expects the following: - /// * the vec of columns to not be empty - /// * the schema and column data types to have equal lengths - /// and match - /// * each array in columns to have the same length - /// - /// If the conditions are not met, an error is returned. - /// + /// Creates a [`RecordBatch`] from a schema and columns. + /// # Errors + /// This function errors iff + /// * `columns` is empty + /// * the schema and column data types do not match + /// * the arrays in `columns` have different lengths /// # Example /// /// ``` @@ -73,22 +42,22 @@ impl RecordBatch { /// # Ok(()) /// # } /// ``` - pub fn try_new(schema: Arc<Schema>, columns: Vec<ArrayRef>) -> Result<Self> { + pub fn try_new(schema: Arc<Schema>, columns: Vec<Arc<dyn Array>>) -> Result<Self> { let options = RecordBatchOptions::default(); Self::validate_new_batch(&schema, columns.as_slice(), &options)?; Ok(RecordBatch { schema, columns }) } - /// Creates a `RecordBatch` from a schema and columns, with additional options, + /// Creates a [`RecordBatch`] from a schema and columns, with additional options, /// such as whether to strictly validate field names. /// - /// See [`RecordBatch::try_new`] for the expected conditions. + /// See [`fn@try_new`] for the expected conditions. pub fn try_new_with_options( schema: Arc<Schema>, - columns: Vec<ArrayRef>, + columns: Vec<Arc<dyn Array>>, options: &RecordBatchOptions, ) -> Result<Self> { - Self::validate_new_batch(&schema, columns.as_slice(), options)?; + Self::validate_new_batch(&schema, &columns, options)?; Ok(RecordBatch { schema, columns }) } @@ -106,7 +75,7 @@ impl RecordBatch { /// if any validation check fails. 
fn validate_new_batch( schema: &Schema, - columns: &[ArrayRef], + columns: &[Arc<dyn Array>], options: &RecordBatchOptions, ) -> Result<()> { // check that there are some columns @@ -229,12 +198,12 @@ impl RecordBatch { /// # Panics /// /// Panics if `index` is outside of `0..num_columns`. - pub fn column(&self, index: usize) -> &ArrayRef { + pub fn column(&self, index: usize) -> &Arc<dyn Array> { &self.columns[index] } /// Get a reference to all columns in the record batch. - pub fn columns(&self) -> &[ArrayRef] { + pub fn columns(&self) -> &[Arc<dyn Array>] { &self.columns[..] } @@ -255,8 +224,8 @@ impl RecordBatch { /// use arrow2::datatypes::DataType; /// use arrow2::record_batch::RecordBatch; /// - /// let a: ArrayRef = Arc::new(Int32Array::from_slice(&[1, 2])); - /// let b: ArrayRef = Arc::new(Utf8Array::<i32>::from_slice(&["a", "b"])); + /// let a: Arc<dyn Array> = Arc::new(Int32Array::from_slice(&[1, 2])); + /// let b: Arc<dyn Array> = Arc::new(Utf8Array::<i32>::from_slice(&["a", "b"])); /// /// let record_batch = RecordBatch::try_from_iter(vec![ /// ("a", a), @@ -265,7 +234,7 @@ impl RecordBatch { /// ``` pub fn try_from_iter<I, F>(value: I) -> Result<Self> where - I: IntoIterator<Item = (F, ArrayRef)>, + I: IntoIterator<Item = (F, Arc<dyn Array>)>, F: AsRef<str>, { // TODO: implement `TryFrom` trait, once @@ -292,8 +261,8 @@ impl RecordBatch { /// use arrow2::datatypes::DataType; /// use arrow2::record_batch::RecordBatch; /// - /// let a: ArrayRef = Arc::new(Int32Array::from_slice(&[1, 2])); - /// let b: ArrayRef = Arc::new(Utf8Array::<i32>::from_slice(&["a", "b"])); + /// let a: Arc<dyn Array> = Arc::new(Int32Array::from_slice(&[1, 2])); + /// let b: Arc<dyn Array> = Arc::new(Utf8Array::<i32>::from_slice(&["a", "b"])); /// /// // Note neither `a` nor `b` has any actual nulls, but we mark /// // b as nullable @@ -304,7 +273,7 @@ impl RecordBatch { /// ``` pub fn try_from_iter_with_nullable<I, F>(value: I) -> Result<Self> where - I: IntoIterator<Item = (F, ArrayRef, bool)>, + I: IntoIterator<Item = (F, Arc<dyn Array>, bool)>, F: AsRef<str>, { // TODO: implement `TryFrom` trait, once diff --git a/src/scalar/mod.rs b/src/scalar/mod.rs index ad5e1acf1c3..aa826cab11e 100644 --- 
a/src/scalar/mod.rs +++ b/src/scalar/mod.rs @@ -1,5 +1,5 @@ -//! Declares the [`Scalar`] API, an optional, trait object representing -//! the zero-dimension of an [`crate::array::Array`]. +//! contains the [`Scalar`] trait object representing individual items of [`Array`](crate::array::Array)s, +//! as well as concrete implementations such as [`BooleanScalar`]. use std::any::Any; use crate::{array::*, datatypes::*}; diff --git a/src/types/mod.rs b/src/types/mod.rs index b990b56ede7..0bf096fe9f5 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,4 +1,5 @@ -//! traits to handle _all native types_ used in this crate. +//! Traits and implementations to handle _all types_ used in this crate. +//! //! Most physical types used in this crate are native Rust types, like `i32`. //! The most important trait is [`NativeType`], the generic trait of [`crate::array::PrimitiveArray`]. //!