databendlabs · mergify · Feb 23, 2022 · Feb 21, 2022 · Feb 23, 2022 · Feb 23, 2022
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/common/arrow/Cargo.toml b/common/arrow/Cargo.toml
@@ -11,7 +11,7 @@ doctest = false
 test = false
 
 [features]
-arrow-default = ["arrow/compute", "arrow/regex", "arrow/io_csv", "arrow/io_parquet", "arrow/io_json", "arrow/io_flight"]
+arrow-default = ["arrow/compute", "arrow/regex", "arrow/io_csv", "arrow/io_parquet", "arrow/io_json", "arrow/io_flight", "arrow/compute_filter"]
 default = ["arrow-default", "parquet-default"]
 parquet-default = ["parquet2/stream", "parquet2/lz4"]
 simd = ["arrow/simd"]
@@ -20,9 +20,10 @@ simd = ["arrow/simd"]
 # Workspace dependencies
 
 # Github dependencies
-arrow = { package = "arrow2", git = "https://github.com/datafuse-extras/arrow2", default-features = false, rev = "d14ae86"}
-arrow-format = { version = "0.3.0", features = ["flight-data", "flight-service"] }
-parquet2 = { version = "0.8.1", default_features = false }
+arrow = { package = "arrow2", git = "https://github.com/datafuse-extras/arrow2", default-features = false, rev = "22bf173"}
+arrow-format = { version = "0.4.0", features = ["flight-data", "flight-service", "ipc"] }
+parquet2 = { version = "0.10.2", default_features = false }
+futures = { version = "0.3"}
 # Crates.io dependencies
 
 [dev-dependencies]
diff --git a/common/arrow/src/lib.rs b/common/arrow/src/lib.rs
@@ -12,6 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+mod parquet_read;
+mod parquet_write;
+
 pub use arrow;
 pub use arrow_format;
 pub use parquet2 as parquet;
+pub use parquet_read::read_columns_many_async;
+pub use parquet_write::write_parquet_file;
diff --git a/common/arrow/src/parquet_read.rs b/common/arrow/src/parquet_read.rs
@@ -0,0 +1,81 @@
+// Copyright 2022 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use arrow::datatypes::Field;
+use arrow::error::Result;
+use arrow::io::parquet::read::to_deserializer;
+use arrow::io::parquet::read::ArrayIter;
+use futures::AsyncRead;
+use futures::AsyncReadExt;
+use futures::AsyncSeek;
+use futures::AsyncSeekExt;
+use parquet2::metadata::ColumnChunkMetaData;
+use parquet2::metadata::RowGroupMetaData;
+
+/// Returns the column chunks in `columns` that belong to the field `field_name`.
+///
+/// A chunk belongs to the field when the first component of its schema path
+/// equals `field_name`, so more than one chunk may match. Matching chunks are
+/// returned in the order they appear in `columns`.
+fn get_field_columns<'a>(
+    columns: &'a [ColumnChunkMetaData],
+    field_name: &str,
+) -> Vec<&'a ColumnChunkMetaData> {
+    columns
+        .iter()
+        .filter(|meta| {
+            // `first()` instead of `[0]`: a chunk with an empty schema path is
+            // skipped rather than panicking on the out-of-bounds index.
+            meta.descriptor().path_in_schema().first().map(String::as_str) == Some(field_name)
+        })
+        .collect()
+}
+
+/// Reads the raw bytes of a single column chunk from `reader`.
+///
+/// Seeks to the start of the byte range reported by `meta` and reads exactly
+/// that many bytes into a freshly allocated buffer.
+///
+/// # Errors
+///
+/// Returns an error if seeking or reading the full range fails, or if the
+/// reported chunk length does not fit in `usize`.
+async fn _read_single_column_async<R>(
+    reader: &mut R,
+    meta: &ColumnChunkMetaData,
+) -> Result<Vec<u8>>
+where
+    R: AsyncRead + AsyncSeek + Send + Unpin,
+{
+    let (start, len) = meta.byte_range();
+    // A plain `as usize` would silently truncate an oversized length on targets
+    // where `usize` is narrower than the reported length; fail loudly instead.
+    let len = <usize as std::convert::TryFrom<_>>::try_from(len)
+        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
+    reader.seek(std::io::SeekFrom::Start(start)).await?;
+    let mut chunk = vec![0; len];
+    reader.read_exact(&mut chunk).await?;
+    Ok(chunk)
+}
+
+/// Reads every column chunk of the field `field_name`, one after another.
+///
+/// Each entry of the result pairs a chunk's metadata with the bytes read for it,
+/// in the order the chunks appear in `columns`. The first failed read aborts the
+/// whole call.
+async fn read_columns_async<'a, R: AsyncRead + AsyncSeek + Send + Unpin>(
+    reader: &mut R,
+    columns: &'a [ColumnChunkMetaData],
+    field_name: &str,
+) -> Result<Vec<(&'a ColumnChunkMetaData, Vec<u8>)>> {
+    let selected = get_field_columns(columns, field_name);
+    let mut loaded = Vec::with_capacity(selected.len());
+    for column_meta in selected {
+        // Reads are awaited one at a time: they all go through the same `&mut` reader.
+        let bytes = _read_single_column_async(reader, column_meta).await?;
+        loaded.push((column_meta, bytes));
+    }
+    Ok(loaded)
+}
+
+/// Reads the column chunks of every requested field from one row group and turns
+/// each field's chunks into an array deserializer.
+///
+/// Used when `arrow::io::parquet::read::read_columns_many_async` cannot be used,
+/// because that function needs a factory of readers.
+///
+/// The returned iterators are in the same order as `fields`. The first failed read
+/// or deserializer construction aborts the whole call.
+pub async fn read_columns_many_async<'a, R: AsyncRead + AsyncSeek + Send + Unpin>(
+    reader: &mut R,
+    row_group: &RowGroupMetaData,
+    fields: Vec<&Field>,
+    chunk_size: Option<usize>,
+) -> Result<Vec<ArrayIter<'a>>> {
+    let mut deserializers = Vec::with_capacity(fields.len());
+    for field in fields {
+        let raw_columns = read_columns_async(reader, row_group.columns(), &field.name).await?;
+        let owned_field = field.to_owned();
+        let num_rows = row_group.num_rows() as usize;
+        let deserializer = to_deserializer(raw_columns, owned_field, num_rows, chunk_size)?;
+        deserializers.push(deserializer);
+    }
+    Ok(deserializers)
+}
diff --git a/common/arrow/src/parquet_write.rs b/common/arrow/src/parquet_write.rs
@@ -0,0 +1,46 @@
+// Copyright 2022 Datafuse Labs.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::io::Write;
+
+use arrow::array::Array;
+use arrow::chunk::Chunk;
+use arrow::datatypes::Schema;
+use arrow::error::Result;
+use arrow::io::parquet::write::FileWriter;
+use arrow::io::parquet::write::RowGroupIterator;
+use parquet2::write::WriteOptions;
+
+/// Writes all `row_groups` to `writer` as one parquet file and returns the size
+/// reported by the `FileWriter` when the file is finished.
+///
+/// A thin wrapper around the `FileWriter` start / write / end sequence, kept in
+/// one place for reuse.
+///
+/// # Errors
+///
+/// Returns the first error produced by the `FileWriter` or by the `row_groups`
+/// iterator; no further row groups are written after an error.
+pub fn write_parquet_file<W, A, I>(
+    writer: &mut W,
+    row_groups: RowGroupIterator<A, I>,
+    schema: Schema,
+    options: WriteOptions,
+) -> Result<u64>
+where
+    W: Write,
+    A: AsRef<dyn Array> + 'static + Send + Sync,
+    I: Iterator<Item = Result<Chunk<A>>>,
+{
+    let mut file_writer = FileWriter::try_new(writer, schema, options)?;
+
+    file_writer.start()?;
+    for group in row_groups {
+        let (group, len) = group?;
+        file_writer.write(group, len)?;
+    }
+    // Only the reported size is needed; the second tuple element is discarded.
+    let (size, _) = file_writer.end(None)?;
+    Ok(size)
+}
diff --git a/common/datablocks/src/data_block.rs b/common/datablocks/src/data_block.rs
@@ -16,8 +16,9 @@ use std::convert::TryFrom;
 use std::fmt;
 use std::sync::Arc;
 
-use common_arrow::arrow;
-use common_arrow::arrow::record_batch::RecordBatch;
+use common_arrow::arrow::array::Array;
+use common_arrow::arrow::array::ArrayRef;
+use common_arrow::arrow::chunk::Chunk;
 use common_datavalues::prelude::*;
 use common_exception::ErrorCode;
 use common_exception::Result;
@@ -174,38 +175,36 @@ impl DataBlock {
 
         Ok(Self { columns, schema })
     }
-}
-
-impl TryFrom<DataBlock> for RecordBatch {
-    type Error = ErrorCode;
 
-    fn try_from(v: DataBlock) -> Result<RecordBatch> {
-        let arrays = v
+    /// Builds a `DataBlock` from the arrays of an arrow `Chunk`, using `schema` as
+    /// the block's schema.
+    ///
+    /// Arrays are paired with the schema fields by position: an array becomes a
+    /// nullable column when its field is nullable and a plain column otherwise.
+    ///
+    /// NOTE(review): the pairing uses `zip`, so if the chunk and the schema differ
+    /// in length the surplus arrays/fields are dropped rather than reported —
+    /// confirm callers always pass a matching schema.
+    pub fn from_chunk<A: AsRef<dyn Array>>(
+        schema: &DataSchemaRef,
+        chunk: &Chunk<A>,
+    ) -> Result<DataBlock> {
+        let columns = chunk
             .columns()
             .iter()
-            .map(|c| c.as_arrow_array())
-            .collect::<Vec<_>>();
+            .zip(schema.fields().iter())
+            .map(|(col, f)| match f.is_nullable() {
+                true => col.into_nullable_column(),
+                false => col.into_column(),
+            })
+            .collect();
 
-        Ok(RecordBatch::try_new(Arc::new(v.schema.to_arrow()), arrays)?)
+        Ok(DataBlock::create(schema.clone(), columns))
     }
 }
 
-impl TryFrom<arrow::record_batch::RecordBatch> for DataBlock {
+/// Converts a `DataBlock` into an arrow `Chunk` holding one array per column.
+///
+/// Only the column arrays are passed to the chunk; the block's schema is not
+/// part of the result.
+impl TryFrom<DataBlock> for Chunk<ArrayRef> {
     type Error = ErrorCode;
 
-    fn try_from(v: arrow::record_batch::RecordBatch) -> Result<DataBlock> {
-        let schema: DataSchemaRef = Arc::new(v.schema().as_ref().into());
-        let columns = v
+    /// Collects each column's arrow array, in column order, and builds the chunk.
+    /// Any error from `Chunk::try_new` is propagated to the caller.
+    fn try_from(v: DataBlock) -> Result<Chunk<ArrayRef>> {
+        let arrays = v
             .columns()
             .iter()
-            .zip(schema.fields().iter())
-            .map(|(col, f)| match f.is_nullable() {
-                true => col.into_nullable_column(),
-                false => col.into_column(),
-            })
-            .collect();
+            .map(|c| c.as_arrow_array())
+            .collect::<Vec<_>>();
 
-        Ok(DataBlock::create(schema, columns))
+        Ok(Chunk::try_new(arrays)?)
     }
 }
 

diff --git a/common/datablocks/tests/it/data_block.rs b/common/datablocks/tests/it/data_block.rs
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use common_arrow::arrow::record_batch::RecordBatch;
+use common_arrow::arrow::array::ArrayRef;
+use common_arrow::arrow::chunk::Chunk;
 use common_datablocks::DataBlock;
 use common_datavalues::prelude::*;
 use common_exception::Result;
@@ -60,18 +61,15 @@ fn test_data_block_convert() -> Result<()> {
     assert_eq!(3, block.num_rows());
     assert_eq!(4, block.num_columns());
 
-    let record_batch: RecordBatch = block.try_into().unwrap();
+    let chunk: Chunk<ArrayRef> = block.try_into().unwrap();
 
     // first and last test.
-    assert_eq!(3, record_batch.num_rows());
-    assert_eq!(4, record_batch.num_columns());
+    assert_eq!(3, chunk.len());
+    assert_eq!(4, chunk.columns().len());
 
-    let new_block: DataBlock = record_batch.try_into().unwrap();
+    let new_block: DataBlock = DataBlock::from_chunk(&schema, &chunk).unwrap();
     assert_eq!(3, new_block.num_rows());
     assert_eq!(4, new_block.num_columns());
 
-    let new_schema = new_block.schema();
-
-    assert_eq!(new_schema, &schema);
     Ok(())
 }