Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support to write utf8-dictionaries to csv (#515)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Oct 10, 2021
1 parent bfa1e5c commit 25ddb0d
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 12 deletions.
51 changes: 49 additions & 2 deletions src/io/csv/write/serialize.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use lexical_core::ToLexical;

use crate::temporal_conversions;
use crate::types::NativeType;
use crate::types::{Index, NativeType};
use crate::util::lexical_to_bytes_mut;
use crate::{
array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array},
Expand All @@ -10,6 +10,10 @@ use crate::{
};

use super::iterator::{BufStreamingIterator, StreamingIterator};
use crate::array::{DictionaryArray, DictionaryKey, Offset};
use crate::bitmap::utils::ZipValidity;
use std::any::Any;
use std::slice::Iter;

/// Options to serialize logical types to CSV
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
Expand Down Expand Up @@ -265,6 +269,49 @@ pub fn new_serializer<'a>(
vec![],
))
}
_ => todo!(),
DataType::Dictionary(keys_dt, values_dt) => match &**values_dt {
DataType::LargeUtf8 => match &**keys_dt {
DataType::UInt32 => serialize_utf8_dict::<u32, i64>(array.as_any()),
DataType::UInt64 => serialize_utf8_dict::<u64, i64>(array.as_any()),
_ => todo!(),
},
DataType::Utf8 => match &**keys_dt {
DataType::UInt32 => serialize_utf8_dict::<u32, i32>(array.as_any()),
DataType::UInt64 => serialize_utf8_dict::<u64, i32>(array.as_any()),
_ => todo!(),
},
_ => {
panic!("only dictionary with string values are supported by csv writer")
}
},
dt => panic!("data type: {} not supported by csv writer", dt),
})
}

/// Helper for serializing a dictonary array. The generic parameters are:
/// - `K` for the type of the keys of the dictionary
/// - `O` for the type of the offsets in the Utf8Array: {i32, i64}
fn serialize_utf8_dict<'a, K: DictionaryKey + Index, O: Offset>(
array: &'a dyn Any,
) -> Box<dyn StreamingIterator<Item = [u8]> + 'a> {
let array = array.downcast_ref::<DictionaryArray<K>>().unwrap();
let keys = array.keys();
let values = array
.values()
.as_any()
.downcast_ref::<Utf8Array<O>>()
.unwrap();
Box::new(BufStreamingIterator::new(
keys.iter(),
move |x, buf| {
if let Some(x) = x {
let i = Index::to_usize(x);
if !values.is_null(i) {
let val = values.value(i);
buf.extend_from_slice(val.as_bytes());
}
}
},
vec![],
))
}
28 changes: 18 additions & 10 deletions tests/it/io/csv/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ fn data() -> RecordBatch {
Field::new("c4", DataType::Boolean, true),
Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("c6", DataType::Time32(TimeUnit::Second), false),
Field::new(
"c7",
DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
false,
),
]);

let c1 = Utf8Array::<i32>::from_slice([
Expand All @@ -29,6 +34,8 @@ fn data() -> RecordBatch {
.to(DataType::Timestamp(TimeUnit::Millisecond, None));
let c6 = PrimitiveArray::<i32>::from_slice(&[1234, 24680, 85563])
.to(DataType::Time32(TimeUnit::Second));
let keys = UInt32Array::from_slice(&[2, 0, 1]);
let c7 = DictionaryArray::from_data(keys, Arc::new(c1.clone()));

RecordBatch::try_new(
Arc::new(schema),
Expand All @@ -39,6 +46,7 @@ fn data() -> RecordBatch {
Arc::new(c4),
Arc::new(c5),
Arc::new(c6),
Arc::new(c7),
],
)
.unwrap()
Expand All @@ -61,13 +69,13 @@ fn write_csv() -> Result<()> {
// check
let buffer = writer.into_inner().unwrap().into_inner();
assert_eq!(
r#"c1,c2,c3,c4,c5,c6
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
r#"c1,c2,c3,c4,c5,c6,c7
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,sed do eiusmod tempor
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,Lorem ipsum dolor sit amet
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,consectetur adipiscing elit
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,sed do eiusmod tempor
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,Lorem ipsum dolor sit amet
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,consectetur adipiscing elit
"#
.to_string(),
String::from_utf8(buffer).unwrap(),
Expand All @@ -92,9 +100,9 @@ fn write_csv_custom_options() -> Result<()> {
// check
let buffer = writer.into_inner().unwrap().into_inner();
assert_eq!(
r#"Lorem ipsum dolor sit amet|123.564532|3|true||12:20:34 AM
consectetur adipiscing elit||2|false|2019-04-18T10:54:47.378000000|06:51:20 AM
sed do eiusmod tempor|-556132.25|1||2019-04-18T02:45:55.555000000|11:46:03 PM
r#"Lorem ipsum dolor sit amet|123.564532|3|true||12:20:34 AM|sed do eiusmod tempor
consectetur adipiscing elit||2|false|2019-04-18T10:54:47.378000000|06:51:20 AM|Lorem ipsum dolor sit amet
sed do eiusmod tempor|-556132.25|1||2019-04-18T02:45:55.555000000|11:46:03 PM|consectetur adipiscing elit
"#
.to_string(),
String::from_utf8(buffer).unwrap(),
Expand Down

0 comments on commit 25ddb0d

Please sign in to comment.