From 41fcaf3f450171d686d371daa4b2848e168e3fd3 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Fri, 12 Nov 2021 18:42:28 +0000 Subject: [PATCH] Added support to read decimal from csv. --- src/io/csv/read_utils.rs | 61 ++++++++++++++++++++++++++++++++++++++++ tests/it/io/csv/read.rs | 27 ++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/src/io/csv/read_utils.rs b/src/io/csv/read_utils.rs index df6b4e07bbd..b9dfd36232d 100644 --- a/src/io/csv/read_utils.rs +++ b/src/io/csv/read_utils.rs @@ -42,6 +42,64 @@ where Arc::new(PrimitiveArray::::from_trusted_len_iter(iter).to(datatype)) } +#[inline] +fn significant_bytes(bytes: &[u8]) -> usize { + let mut count = 0; + for byte in bytes { + if *byte != b'0' { + count += 1 + } + } + count +} + +/* +NOTE(review): in `deserialize_decimal` the integer part is multiplied by +`10^rhs_s` (`rhs_s` = count of non-`0` fractional bytes), not by `10^scale`, +so `1.01` and `1.1` both yield `11` under `Decimal(3, 2)`, assuming `01` +parses as `1`. The `(None, Some(_))` arm looks unreachable because `split` +always yields a first item. TODO: confirm and fix in a follow-up patch. +*/ + +/// Deserializes bytes to a single i128 representing a decimal. +/// Returns `None` when parsing fails or the `precision`/`scale` checks reject the input. 
+#[inline] +fn deserialize_decimal(bytes: &[u8], precision: usize, scale: usize) -> Option { + let mut a = bytes.split(|x| *x == b'.'); + let lhs = a.next(); + let rhs = a.next(); + match (lhs, rhs) { + (Some(lhs), Some(rhs)) => lexical_core::parse::(lhs).ok().and_then(|x| { + lexical_core::parse::(rhs) + .ok() + .map(|y| (x, lhs, y, rhs)) + .and_then(|(lhs, lhs_b, rhs, rhs_b)| { + let lhs_s = significant_bytes(lhs_b); + let rhs_s = significant_bytes(rhs_b); + if lhs_s + rhs_s > precision || rhs_s > scale { + None + } else { + Some((lhs, rhs, rhs_s)) + } + }) + .map(|(lhs, rhs, rhs_s)| lhs * 10i128.pow(rhs_s as u32) + rhs) + }), + (None, Some(rhs)) => { + if rhs.len() != precision || rhs.len() != scale { + return None; + } + lexical_core::parse::(rhs).ok() + } + (Some(lhs), None) => { + if lhs.len() != precision || scale != 0 { + return None; + } + lexical_core::parse::(lhs).ok() + } + (None, None) => None, + } +} + fn deserialize_boolean(rows: &[B], column: usize, op: F) -> Arc where B: ByteRecordGeneric, @@ -193,6 +251,9 @@ pub(crate) fn deserialize_column( }) }) } + Decimal(precision, scale) => deserialize_primitive(rows, column, datatype, |x| { + deserialize_decimal(x, precision, scale) + }), Utf8 => deserialize_utf8::(rows, column), LargeUtf8 => deserialize_utf8::(rows, column), Binary => deserialize_binary::(rows, column), diff --git a/tests/it/io/csv/read.rs b/tests/it/io/csv/read.rs index db4c534a34d..40e5344ea3c 100644 --- a/tests/it/io/csv/read.rs +++ b/tests/it/io/csv/read.rs @@ -144,6 +144,33 @@ fn date64() -> Result<()> { Ok(()) } +#[test] +fn decimal() -> Result<()> { + let result = test_deserialize("1.1,\n1.2,\n1.22,\n1.3,\n", DataType::Decimal(2, 1))?; + let expected = + Int128Array::from(&[Some(11), Some(12), None, Some(13)]).to(DataType::Decimal(2, 1)); + assert_eq!(expected, result.as_ref()); + Ok(()) +} + +#[test] +fn decimal_only_scale() -> Result<()> { + let result = test_deserialize("0.01,\n0.12,\n0.222,\n0.13,\n", DataType::Decimal(2, 2))?; 
+ let expected = + Int128Array::from(&[Some(1), Some(12), None, Some(13)]).to(DataType::Decimal(2, 2)); + assert_eq!(expected, result.as_ref()); + Ok(()) +} + +#[test] +fn decimal_only_integer() -> Result<()> { + let result = test_deserialize("1,\n1.0,\n1.1,\n10.0,\n", DataType::Decimal(1, 0))?; + let expected = + Int128Array::from(&[Some(1), Some(1), None, Some(10)]).to(DataType::Decimal(1, 0)); + assert_eq!(expected, result.as_ref()); + Ok(()) +} + #[test] fn boolean() -> Result<()> { let input = vec!["true", "True", "False", "F", "t"];