From fa2c51c632dafecf1032a1778f00940e7858eace Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 3 Mar 2023 09:21:55 +0100 Subject: [PATCH] Added support for JSON serialization of dictionary (#1424) --- src/io/json/write/serialize.rs | 32 +++++++++++++++++++++++++++++++- tests/it/io/json/write.rs | 17 +++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/src/io/json/write/serialize.rs b/src/io/json/write/serialize.rs index 2f5da0e63e2..ea868435b9f 100644 --- a/src/io/json/write/serialize.rs +++ b/src/io/json/write/serialize.rs @@ -4,7 +4,7 @@ use std::io::Write; use streaming_iterator::StreamingIterator; use crate::bitmap::utils::ZipValidity; -use crate::datatypes::TimeUnit; +use crate::datatypes::{IntegerType, TimeUnit}; use crate::io::iterator::BufStreamingIterator; use crate::offset::Offset; use crate::temporal_conversions::{ @@ -69,6 +69,24 @@ where )) } +fn dictionary_utf8_serializer<'a, K: DictionaryKey, O: Offset>( + array: &'a DictionaryArray, +) -> Box + 'a + Send + Sync> { + let iter = array.iter_typed::>().unwrap(); + + Box::new(BufStreamingIterator::new( + iter, + |x, buf| { + if let Some(x) = x { + utf8::write_str(buf, x).unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }, + vec![], + )) +} + fn utf8_serializer<'a, O: Offset>( array: &'a Utf8Array, ) -> Box + 'a + Send + Sync> { @@ -257,6 +275,18 @@ pub(crate) fn new_serializer<'a>( } DataType::List(_) => list_serializer::(array.as_any().downcast_ref().unwrap()), DataType::LargeList(_) => list_serializer::(array.as_any().downcast_ref().unwrap()), + other @ DataType::Dictionary(k, v, _) => match (k, &**v) { + (IntegerType::UInt32, DataType::LargeUtf8) => { + let array = array + .as_any() + .downcast_ref::>() + .unwrap(); + dictionary_utf8_serializer::(array) + } + _ => { + todo!("Writing {:?} to JSON", other) + } + }, DataType::Date32 => date_serializer(array.as_any().downcast_ref().unwrap(), date32_to_date), DataType::Date64 => date_serializer(array.as_any().downcast_ref().unwrap(), date64_to_date), DataType::Timestamp(tu, tz) => { diff --git a/tests/it/io/json/write.rs b/tests/it/io/json/write.rs index cf09178ca8e..895f0f59731 100644 --- a/tests/it/io/json/write.rs +++ b/tests/it/io/json/write.rs @@ -1,3 +1,4 @@ +use arrow2::datatypes::IntegerType; use arrow2::{ array::*, bitmap::Bitmap, @@ -52,6 +53,22 @@ fn utf8() -> Result<()> { test!(array, expected) } +#[test] +fn dictionary_utf8() -> Result<()> { + let values = Utf8Array::::from([Some("a"), Some("b"), Some("c"), Some("d")]); + let keys = PrimitiveArray::from_slice([0u32, 1, 2, 3, 1]); + let array = DictionaryArray::try_new( + DataType::Dictionary(IntegerType::UInt32, Box::new(DataType::LargeUtf8), false), + keys, + Box::new(values), + ) + .unwrap(); + + let expected = r#"["a","b","c","d","b"]"#; + + test!(array, expected) +} + #[test] fn struct_() -> Result<()> { let c1 = Int32Array::from([Some(1), Some(2), Some(3), None, Some(5)]);