From 18eb3d81cb0a52b82da05db64f8aaf6857f6b9f8 Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Wed, 19 Jan 2022 19:06:44 +0100 Subject: [PATCH] Added support to read binary dict encoded from parquet (#781) --- src/io/parquet/read/binary/dictionary.rs | 26 +++++++++++++++++------- src/io/parquet/read/mod.rs | 6 ++++-- tests/it/io/parquet/mod.rs | 7 ++++++- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/io/parquet/read/binary/dictionary.rs b/src/io/parquet/read/binary/dictionary.rs index b1f295fa22e..434036716a4 100644 --- a/src/io/parquet/read/binary/dictionary.rs +++ b/src/io/parquet/read/binary/dictionary.rs @@ -9,7 +9,9 @@ use parquet2::{ use super::super::utils as other_utils; use crate::{ - array::{Array, DictionaryArray, DictionaryKey, Offset, PrimitiveArray, Utf8Array}, + array::{ + Array, BinaryArray, DictionaryArray, DictionaryKey, Offset, PrimitiveArray, Utf8Array, + }, bitmap::{utils::BitmapIter, MutableBitmap}, datatypes::DataType, error::{ArrowError, Result}, @@ -156,11 +158,21 @@ where }; let keys = PrimitiveArray::from_data(K::PRIMITIVE.into(), indices.into(), validity.into()); let data_type = DictionaryArray::::get_child(&data_type).clone(); - let values = Arc::new(Utf8Array::from_data( - data_type, - offsets.into(), - values.into(), - None, - )); + use crate::datatypes::PhysicalType::*; + let values = match data_type.to_physical_type() { + Binary | LargeBinary => Arc::new(BinaryArray::from_data( + data_type, + offsets.into(), + values.into(), + None, + )) as Arc, + Utf8 | LargeUtf8 => Arc::new(Utf8Array::from_data( + data_type, + offsets.into(), + values.into(), + None, + )), + _ => unreachable!(), + }; Ok(Box::new(DictionaryArray::::from_data(keys, values))) } diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs index 76eec05e0e8..32c1b941811 100644 --- a/src/io/parquet/read/mod.rs +++ b/src/io/parquet/read/mod.rs @@ -196,8 +196,10 @@ fn dict_read< Float64 => { primitive::iter_to_dict_array::(iter, metadata, data_type, |x: f64| x) } - Utf8 => binary::iter_to_dict_array::(iter, metadata, data_type), - LargeUtf8 => binary::iter_to_dict_array::(iter, metadata, data_type), + Utf8 | Binary => binary::iter_to_dict_array::(iter, metadata, data_type), + LargeUtf8 | LargeBinary => { + binary::iter_to_dict_array::(iter, metadata, data_type) + } other => Err(ArrowError::NotYetImplemented(format!( "Reading dictionaries of type {:?}", other diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 65a79319a43..5f5493e957d 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -719,17 +719,22 @@ fn arrow_type() -> Result<()> { let indices = PrimitiveArray::from_values((0..3u64).map(|x| x % 2)); let values = PrimitiveArray::from_slice([1.0f32, 3.0]); - let array3 = DictionaryArray::from_data(indices, std::sync::Arc::new(values)); + let array3 = DictionaryArray::from_data(indices.clone(), std::sync::Arc::new(values)); + + let values = BinaryArray::::from_slice([b"ab", b"ac"]); + let array4 = DictionaryArray::from_data(indices, std::sync::Arc::new(values)); let schema = Schema::from(vec![ Field::new("a1", dt1, true), Field::new("a2", array2.data_type().clone(), true), Field::new("a3", array3.data_type().clone(), true), + Field::new("a4", array4.data_type().clone(), true), ]); let batch = Chunk::try_new(vec![ Arc::new(array) as Arc, Arc::new(array2), Arc::new(array3), + Arc::new(array4), ])?; let r = integration_write(&schema, &[batch.clone()])?;