From 2987a29abd54ad72ac6a88365c2a422e6e399c40 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sat, 9 Oct 2021 21:08:07 +0200 Subject: [PATCH] add dictionary serialization for csv-writer --- src/io/csv/write/serialize.rs | 55 +++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/src/io/csv/write/serialize.rs b/src/io/csv/write/serialize.rs index b2880d0aa43..488b3344b9e 100644 --- a/src/io/csv/write/serialize.rs +++ b/src/io/csv/write/serialize.rs @@ -1,7 +1,7 @@ use lexical_core::ToLexical; use crate::temporal_conversions; -use crate::types::NativeType; +use crate::types::{Index, NativeType}; use crate::util::lexical_to_bytes_mut; use crate::{ array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array}, @@ -10,6 +10,10 @@ use crate::{ }; use super::iterator::{BufStreamingIterator, StreamingIterator}; +use crate::array::{DictionaryArray, DictionaryKey, Offset}; +use crate::bitmap::utils::ZipValidity; +use std::any::Any; +use std::slice::Iter; /// Options to serialize logical types to CSV #[derive(Debug, PartialEq, Eq, Hash, Clone)] @@ -265,6 +269,53 @@ pub fn new_serializer<'a>( vec![], )) } - _ => todo!(), + DataType::Dictionary(keys_dt, values_dt) => match &**values_dt { + DataType::LargeUtf8 => match &**keys_dt { + DataType::UInt32 => serialize_utf8_dict::(array.as_any()), + DataType::UInt64 => serialize_utf8_dict::(array.as_any()), + _ => todo!(), + }, + DataType::Utf8 => match &**keys_dt { + DataType::UInt32 => serialize_utf8_dict::(array.as_any()), + DataType::UInt64 => serialize_utf8_dict::(array.as_any()), + _ => todo!(), + }, + _ => { + panic!("only dictionary with string values are supported by csv writer") + } + }, + dt => panic!("data type: {} not supported by csv writer", dt), }) } + +/// Helper for serializing a dictonary array. The generic parameters are: +/// - `K` for the type of the keys of the dictionary +/// - `O` for the type of the offsets in the Utf8Array: {i32, i64} +fn serialize_utf8_dict<'a, K: DictionaryKey + Index, O: Offset>( + array: &'a dyn Any, +) -> Box + 'a> { + let array = array.downcast_ref::>().unwrap(); + let keys = array.keys(); + let values = array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + Box::new(BufStreamingIterator::new( + keys.iter(), + move |x, buf| { + if let Some(x) = x { + let i = Index::to_usize(x); + if !values.is_null(i) { + // Safety: + // all keys of a dictionary array should be in bounds + // invariant of the struct + // and we just checked bounds via `is_null` + let val = unsafe { values.value_unchecked(i) }; + buf.extend_from_slice(val.as_bytes()); + } + } + }, + vec![], + )) +}