Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support for reading binary dictionary-encoded data from Parquet (#781)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Jan 19, 2022
1 parent 98f6e45 commit 18eb3d8
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 10 deletions.
26 changes: 19 additions & 7 deletions src/io/parquet/read/binary/dictionary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ use parquet2::{

use super::super::utils as other_utils;
use crate::{
array::{Array, DictionaryArray, DictionaryKey, Offset, PrimitiveArray, Utf8Array},
array::{
Array, BinaryArray, DictionaryArray, DictionaryKey, Offset, PrimitiveArray, Utf8Array,
},
bitmap::{utils::BitmapIter, MutableBitmap},
datatypes::DataType,
error::{ArrowError, Result},
Expand Down Expand Up @@ -156,11 +158,21 @@ where
};
let keys = PrimitiveArray::from_data(K::PRIMITIVE.into(), indices.into(), validity.into());
let data_type = DictionaryArray::<K>::get_child(&data_type).clone();
let values = Arc::new(Utf8Array::from_data(
data_type,
offsets.into(),
values.into(),
None,
));
use crate::datatypes::PhysicalType::*;
let values = match data_type.to_physical_type() {
Binary | LargeBinary => Arc::new(BinaryArray::from_data(
data_type,
offsets.into(),
values.into(),
None,
)) as Arc<dyn Array>,
Utf8 | LargeUtf8 => Arc::new(Utf8Array::from_data(
data_type,
offsets.into(),
values.into(),
None,
)),
_ => unreachable!(),
};
Ok(Box::new(DictionaryArray::<K>::from_data(keys, values)))
}
6 changes: 4 additions & 2 deletions src/io/parquet/read/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,10 @@ fn dict_read<
Float64 => {
primitive::iter_to_dict_array::<K, _, _, _, _, _>(iter, metadata, data_type, |x: f64| x)
}
Utf8 => binary::iter_to_dict_array::<K, i32, _, _>(iter, metadata, data_type),
LargeUtf8 => binary::iter_to_dict_array::<K, i64, _, _>(iter, metadata, data_type),
Utf8 | Binary => binary::iter_to_dict_array::<K, i32, _, _>(iter, metadata, data_type),
LargeUtf8 | LargeBinary => {
binary::iter_to_dict_array::<K, i64, _, _>(iter, metadata, data_type)
}
other => Err(ArrowError::NotYetImplemented(format!(
"Reading dictionaries of type {:?}",
other
Expand Down
7 changes: 6 additions & 1 deletion tests/it/io/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -719,17 +719,22 @@ fn arrow_type() -> Result<()> {

let indices = PrimitiveArray::from_values((0..3u64).map(|x| x % 2));
let values = PrimitiveArray::from_slice([1.0f32, 3.0]);
let array3 = DictionaryArray::from_data(indices, std::sync::Arc::new(values));
let array3 = DictionaryArray::from_data(indices.clone(), std::sync::Arc::new(values));

let values = BinaryArray::<i32>::from_slice([b"ab", b"ac"]);
let array4 = DictionaryArray::from_data(indices, std::sync::Arc::new(values));

let schema = Schema::from(vec![
Field::new("a1", dt1, true),
Field::new("a2", array2.data_type().clone(), true),
Field::new("a3", array3.data_type().clone(), true),
Field::new("a4", array4.data_type().clone(), true),
]);
let batch = Chunk::try_new(vec![
Arc::new(array) as Arc<dyn Array>,
Arc::new(array2),
Arc::new(array3),
Arc::new(array4),
])?;

let r = integration_write(&schema, &[batch.clone()])?;
Expand Down

0 comments on commit 18eb3d8

Please sign in to comment.