From 4fe54d6e0f90d3fd2e99faa0edb929f9651b1bdf Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Mon, 15 Jan 2024 10:53:35 +0100 Subject: [PATCH] feat: implement binview for polars-json (#13737) --- crates/polars-json/src/json/deserialize.rs | 220 +++--------------- .../polars-json/src/json/write/serialize.rs | 21 +- 2 files changed, 53 insertions(+), 188 deletions(-) diff --git a/crates/polars-json/src/json/deserialize.rs b/crates/polars-json/src/json/deserialize.rs index 834d1e22c478..9a4c9e27d0cb 100644 --- a/crates/polars-json/src/json/deserialize.rs +++ b/crates/polars-json/src/json/deserialize.rs @@ -3,11 +3,10 @@ use std::fmt::Write; use arrow::array::*; use arrow::bitmap::MutableBitmap; -use arrow::chunk::Chunk; -use arrow::datatypes::{ArrowDataType, ArrowSchema, Field, IntervalUnit}; +use arrow::datatypes::{ArrowDataType, IntervalUnit}; use arrow::offset::{Offset, Offsets}; use arrow::temporal_conversions; -use arrow::types::{f16, NativeType}; +use arrow::types::NativeType; use num_traits::NumCast; use simd_json::{BorrowedValue, StaticNode}; @@ -69,6 +68,27 @@ fn deserialize_utf8_into<'a, O: Offset, A: Borrow>>( } } +fn deserialize_utf8view_into<'a, A: Borrow>>( + target: &mut MutableBinaryViewArray, + rows: &[A], +) { + let mut scratch = String::new(); + for row in rows { + match row.borrow() { + BorrowedValue::String(v) => target.push_value(v.as_ref()), + BorrowedValue::Static(StaticNode::Bool(v)) => { + target.push_value(if *v { "true" } else { "false" }) + }, + BorrowedValue::Static(node) if !matches!(node, StaticNode::Null) => { + write!(scratch, "{node}").unwrap(); + target.push_value(scratch.as_str()); + scratch.clear(); + }, + _ => target.push_null(), + } + } +} + fn deserialize_list<'a, A: Borrow>>( rows: &[A], data_type: ArrowDataType, @@ -106,104 +126,6 @@ fn deserialize_list<'a, A: Borrow>>( ListArray::::new(data_type, offsets.into(), values, validity.into()) } -// TODO: due to nesting, deduplicating this from the above is trickier than -// other `deserialize_xxx_into` functions. Punting on that for now. -fn deserialize_list_into<'a, A: Borrow>>( - target: &mut MutableListArray>, - rows: &[A], -) { - let empty = vec![]; - let inner: Vec<_> = rows - .iter() - .flat_map(|row| match row.borrow() { - BorrowedValue::Array(value) => value.iter(), - _ => empty.iter(), - }) - .collect(); - - deserialize_into(target.mut_values(), &inner); - - let lengths = rows.iter().map(|row| match row.borrow() { - BorrowedValue::Array(value) => Some(value.len()), - _ => None, - }); - - target - .try_extend_from_lengths(lengths) - .expect("Offsets overflow"); -} - -fn primitive_dispatch<'a, A: Borrow>, T: NativeType>( - target: &mut Box, - rows: &[A], - deserialize_into: fn(&mut MutablePrimitiveArray, &[A]) -> (), -) { - generic_deserialize_into(target, rows, deserialize_into) -} - -fn generic_deserialize_into<'a, A: Borrow>, M: 'static>( - target: &mut Box, - rows: &[A], - deserialize_into: fn(&mut M, &[A]) -> (), -) { - deserialize_into(target.as_mut_any().downcast_mut::().unwrap(), rows); -} - -/// Deserialize `rows` by extending them into the given `target` -fn deserialize_into<'a, A: Borrow>>( - target: &mut Box, - rows: &[A], -) { - match target.data_type() { - ArrowDataType::Boolean => generic_deserialize_into(target, rows, deserialize_boolean_into), - ArrowDataType::Float32 => { - primitive_dispatch::<_, f32>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::Float64 => { - primitive_dispatch::<_, f64>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::Int8 => { - primitive_dispatch::<_, i8>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::Int16 => { - primitive_dispatch::<_, i16>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::Int32 => { - primitive_dispatch::<_, i32>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::Int64 => { - primitive_dispatch::<_, i64>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::UInt8 => { - primitive_dispatch::<_, u8>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::UInt16 => { - primitive_dispatch::<_, u16>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::UInt32 => { - primitive_dispatch::<_, u32>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::UInt64 => { - primitive_dispatch::<_, u64>(target, rows, deserialize_primitive_into) - }, - ArrowDataType::LargeUtf8 => generic_deserialize_into::<_, MutableUtf8Array>( - target, - rows, - deserialize_utf8_into, - ), - ArrowDataType::LargeList(_) => deserialize_list_into( - target - .as_mut_any() - .downcast_mut::>>() - .unwrap(), - rows, - ), - _ => { - todo!() - }, - } -} - fn deserialize_struct<'a, A: Borrow>>( rows: &[A], data_type: ArrowDataType, @@ -287,6 +209,15 @@ impl Container for MutableFixedSizeBinaryArray { } } +impl Container for MutableBinaryViewArray { + fn with_capacity(capacity: usize) -> Self + where + Self: Sized, + { + MutableBinaryViewArray::with_capacity(capacity) + } +} + impl Container for MutableListArray { fn with_capacity(capacity: usize) -> Self { MutableListArray::with_capacity(capacity) @@ -399,6 +330,9 @@ pub(crate) fn _deserialize<'a, A: Borrow>>( ArrowDataType::LargeUtf8 => { fill_generic_array_from::<_, _, Utf8Array>(deserialize_utf8_into, rows) }, + ArrowDataType::Utf8View => { + fill_generic_array_from::<_, _, Utf8ViewArray>(deserialize_utf8view_into, rows) + }, ArrowDataType::LargeList(_) => Box::new(deserialize_list(rows, data_type)), ArrowDataType::LargeBinary => Box::new(deserialize_binary(rows)), ArrowDataType::Struct(_) => Box::new(deserialize_struct(rows, data_type)), @@ -415,87 +349,3 @@ pub fn deserialize(json: &BorrowedValue, data_type: ArrowDataType) -> PolarsResu _ => Ok(_deserialize(&[json], data_type)), } } - -fn allocate_array(f: &Field) -> Box { - match f.data_type() { - ArrowDataType::Int8 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::Int16 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::Int32 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::Int64 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::UInt8 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::UInt16 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::UInt32 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::UInt64 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::Float16 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::Float32 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::Float64 => Box::new(MutablePrimitiveArray::::new()), - ArrowDataType::LargeList(inner) => match inner.data_type() { - ArrowDataType::LargeList(_) => Box::new(MutableListArray::::new_from( - allocate_array(inner), - inner.data_type().clone(), - 0, - )), - _ => allocate_array(inner), - }, - _ => todo!(), - } -} - -/// Deserializes a `json` [`simd_json::value::Value`] serialized in Pandas record format into -/// a [`Chunk`]. -/// -/// Uses the `Schema` provided, which can be inferred from arbitrary JSON with -/// [`infer_records_schema`]. -/// -/// This is CPU-bounded. -/// -/// # Errors -/// -/// This function errors iff either: -/// -/// * `json` is not an [`Array`] -/// * `data_type` contains any incompatible types: -/// * [`ArrowDataType::Struct`] -/// * [`ArrowDataType::Dictionary`] -/// * [`ArrowDataType::LargeList`] -pub fn deserialize_records( - json: &BorrowedValue, - schema: &ArrowSchema, -) -> PolarsResult> { - let mut results = schema - .fields - .iter() - .map(|f| (f.name.as_str(), allocate_array(f))) - .collect::>(); - - match json { - BorrowedValue::Array(rows) => { - for row in rows.iter() { - match row { - BorrowedValue::Object(record) => { - for (key, value) in record.iter() { - let arr = results.get_mut(key.as_ref()).ok_or_else(|| { - PolarsError::ComputeError(format!("unexpected key: '{key}'").into()) - })?; - deserialize_into(arr, &[value]); - } - }, - _ => { - return Err(PolarsError::ComputeError( - "each row must be an Object".into(), - )) - }, - } - } - }, - _ => { - return Err(PolarsError::ComputeError( - "outer type must be an Array".into(), - )) - }, - } - - Ok(Chunk::new( - results.into_values().map(|mut ma| ma.as_box()).collect(), - )) -} diff --git a/crates/polars-json/src/json/write/serialize.rs b/crates/polars-json/src/json/write/serialize.rs index bb21a5bdd443..6347e014c722 100644 --- a/crates/polars-json/src/json/write/serialize.rs +++ b/crates/polars-json/src/json/write/serialize.rs @@ -143,6 +143,21 @@ fn utf8_serializer<'a, O: Offset>( materialize_serializer(f, array.iter(), offset, take) } +fn utf8view_serializer<'a>( + array: &'a Utf8ViewArray, + offset: usize, + take: usize, +) -> Box + 'a + Send + Sync> { + let f = |x: Option<&str>, buf: &mut Vec| { + if let Some(x) = x { + utf8::write_str(buf, x).unwrap(); + } else { + buf.extend_from_slice(b"null") + } + }; + materialize_serializer(f, array.iter(), offset, take) +} + fn struct_serializer<'a>( array: &'a StructArray, offset: usize, @@ -406,12 +421,12 @@ pub(crate) fn new_serializer<'a>( ArrowDataType::Float64 => { float_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) }, - ArrowDataType::Utf8 => { - utf8_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) - }, ArrowDataType::LargeUtf8 => { utf8_serializer::(array.as_any().downcast_ref().unwrap(), offset, take) }, + ArrowDataType::Utf8View => { + utf8view_serializer(array.as_any().downcast_ref().unwrap(), offset, take) + }, ArrowDataType::Struct(_) => { struct_serializer(array.as_any().downcast_ref().unwrap(), offset, take) },