From 8ed2bdae337501134f4e276dd6b7f1965bbcac39 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 9 Oct 2021 21:08:07 +0200 Subject: [PATCH 1/2] add dictionary serialization for csv-writer --- src/io/csv/write/serialize.rs | 51 +++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/io/csv/write/serialize.rs b/src/io/csv/write/serialize.rs index b2880d0aa43..5840fa49520 100644 --- a/src/io/csv/write/serialize.rs +++ b/src/io/csv/write/serialize.rs @@ -1,7 +1,7 @@ use lexical_core::ToLexical; use crate::temporal_conversions; -use crate::types::NativeType; +use crate::types::{Index, NativeType}; use crate::util::lexical_to_bytes_mut; use crate::{ array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, @@ -10,6 +10,10 @@ use crate::{ }; use super::iterator::{BufStreamingIterator, StreamingIterator}; +use crate::array::{DictionaryArray, DictionaryKey, Offset}; +use crate::bitmap::utils::ZipValidity; +use std::any::Any; +use std::slice::Iter; /// Options to serialize logical types to CSV #[derive(Debug, PartialEq, Eq, Hash, Clone)] @@ -265,6 +269,49 @@ pub fn new_serializer<'a>( vec![], )) } - _ => todo!(), + DataType::Dictionary(keys_dt, values_dt) => match &**values_dt { + DataType::LargeUtf8 => match &**keys_dt { + DataType::UInt32 => serialize_utf8_dict::(array.as_any()), + DataType::UInt64 => serialize_utf8_dict::(array.as_any()), + _ => todo!(), + }, + DataType::Utf8 => match &**keys_dt { + DataType::UInt32 => serialize_utf8_dict::(array.as_any()), + DataType::UInt64 => serialize_utf8_dict::(array.as_any()), + _ => todo!(), + }, + _ => { + panic!("only dictionary with string values are supported by csv writer") + } + }, + dt => panic!("data type: {} not supported by csv writer", dt), }) } + +/// Helper for serializing a dictonary array. The generic parameters are: +/// - `K` for the type of the keys of the dictionary +/// - `O` for the type of the offsets in the Utf8Array: {i32, i64} +fn serialize_utf8_dict<'a, K: DictionaryKey + Index, O: Offset>( + array: &'a dyn Any, +) -> Box + 'a> { + let array = array.downcast_ref::>().unwrap(); + let keys = array.keys(); + let values = array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + Box::new(BufStreamingIterator::new( + keys.iter(), + move |x, buf| { + if let Some(x) = x { + let i = Index::to_usize(x); + if !values.is_null(i) { + let val = values.value(i); + buf.extend_from_slice(val.as_bytes()); + } + } + }, + vec![], + )) +} From a63fbe4f2ebc4c252ff3de9b9ca0562d28602059 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 10 Oct 2021 09:23:14 +0200 Subject: [PATCH 2/2] add dictionary to test --- tests/it/io/csv/write.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/it/io/csv/write.rs b/tests/it/io/csv/write.rs index 9ac6d03e183..6098b60bbb2 100644 --- a/tests/it/io/csv/write.rs +++ b/tests/it/io/csv/write.rs @@ -15,6 +15,11 @@ fn data() -> RecordBatch { Field::new("c4", DataType::Boolean, true), Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true), Field::new("c6", DataType::Time32(TimeUnit::Second), false), + Field::new( + "c7", + DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), + false, + ), ]); let c1 = Utf8Array::::from_slice([ @@ -29,6 +34,8 @@ fn data() -> RecordBatch { .to(DataType::Timestamp(TimeUnit::Millisecond, None)); let c6 = PrimitiveArray::::from_slice(&[1234, 24680, 85563]) .to(DataType::Time32(TimeUnit::Second)); + let keys = UInt32Array::from_slice(&[2, 0, 1]); + let c7 = DictionaryArray::from_data(keys, Arc::new(c1.clone())); RecordBatch::try_new( Arc::new(schema), @@ -39,6 +46,7 @@ fn data() -> RecordBatch { Arc::new(c4), Arc::new(c5), Arc::new(c6), + Arc::new(c7), ], ) .unwrap() @@ -61,13 +69,13 @@ fn write_csv() -> Result<()> { // check let buffer = writer.into_inner().unwrap().into_inner(); assert_eq!( - r#"c1,c2,c3,c4,c5,c6 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34 -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20 -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03 -Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34 -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20 -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03 + r#"c1,c2,c3,c4,c5,c6,c7 +Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,sed do eiusmod tempor +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,Lorem ipsum dolor sit amet +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,consectetur adipiscing elit +Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,sed do eiusmod tempor +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,Lorem ipsum dolor sit amet +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,consectetur adipiscing elit "# .to_string(), String::from_utf8(buffer).unwrap(), @@ -92,9 +100,9 @@ fn write_csv_custom_options() -> Result<()> { // check let buffer = writer.into_inner().unwrap().into_inner(); assert_eq!( - r#"Lorem ipsum dolor sit amet|123.564532|3|true||12:20:34 AM -consectetur adipiscing elit||2|false|2019-04-18T10:54:47.378000000|06:51:20 AM -sed do eiusmod tempor|-556132.25|1||2019-04-18T02:45:55.555000000|11:46:03 PM + r#"Lorem ipsum dolor sit amet|123.564532|3|true||12:20:34 AM|sed do eiusmod tempor +consectetur adipiscing elit||2|false|2019-04-18T10:54:47.378000000|06:51:20 AM|Lorem ipsum dolor sit amet +sed do eiusmod tempor|-556132.25|1||2019-04-18T02:45:55.555000000|11:46:03 PM|consectetur adipiscing elit "# .to_string(), String::from_utf8(buffer).unwrap(),