From 2d4509c037acbb90a71c2a673668b93c8c6794e8 Mon Sep 17 00:00:00 2001 From: Qingping Hou Date: Tue, 9 Feb 2021 17:29:14 -0500 Subject: [PATCH] ARROW-11542: [Rust] fix validity bitmap buffer length count in json reader Closes #9436 from houqp/qp_json_read Authored-by: Qingping Hou Signed-off-by: Andrew Lamb --- rust/arrow/src/json/reader.rs | 83 ++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs index 55e4012e83815..a26cf84379d13 100644 --- a/rust/arrow/src/json/reader.rs +++ b/rust/arrow/src/json/reader.rs @@ -959,8 +959,14 @@ impl Decoder { } DataType::Struct(fields) => { // extract list values, with non-lists converted to Value::Null - let len = rows.len(); - let num_bytes = bit_util::ceil(len, 8); + let array_item_count = rows + .iter() + .map(|row| match row { + Value::Array(values) => values.len(), + _ => 1, + }) + .sum(); + let num_bytes = bit_util::ceil(array_item_count, 8); let mut null_buffer = MutableBuffer::from_len_zeroed(num_bytes); let mut struct_index = 0; let rows: Vec = rows @@ -2673,4 +2679,77 @@ mod tests { assert_eq!(1, aa.value(3)); assert_eq!(5, aa.value(7)); } + + #[test] + fn test_json_read_nested_list() { + let schema = Schema::new(vec![Field::new( + "c1", + DataType::List(Box::new(Field::new( + "item", + DataType::List(Box::new(Field::new("item", DataType::Utf8, true))), + true, + ))), + true, + )]); + + let decoder = Decoder::new(Arc::new(schema), 1024, None); + let batch = decoder + .next_batch( + &mut vec![ + Ok(serde_json::json!({ + "c1": [], + })), + Ok(serde_json::json!({ + "c1": [["a", "b"], ["c"], ["e", "f"], ["g"], ["h"], ["i"], ["j"], ["k"]], + })), + Ok(serde_json::json!({ + "c1": [["foo"], ["bar"]], + })), + ] + .into_iter(), + ) + .unwrap() + .unwrap(); + + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.num_rows(), 3); + } + + #[test] + fn test_json_read_list_of_structs() { + let schema = 
Schema::new(vec![Field::new( + "c1", + DataType::List(Box::new(Field::new( + "item", + DataType::Struct(vec![Field::new("a", DataType::Int64, true)]), + true, + ))), + true, + )]); + + let decoder = Decoder::new(Arc::new(schema), 1024, None); + let batch = decoder + .next_batch( + // NOTE: total struct element count needs to be greater than + // 8 * bit_util::ceil(array_count, 8), i.e. more validity bits than a buffer sized + // by row count can hold, to test validity bit buffer length calculation logic + &mut vec![ + Ok(serde_json::json!({ + "c1": [{"a": 1}], + })), + Ok(serde_json::json!({ + "c1": [{"a": 2}, {"a": 3}, {"a": 4}, {"a": 5}, {"a": 6}, {"a": 7}], + })), + Ok(serde_json::json!({ + "c1": [{"a": 10}, {"a": 11}], + })), + ] + .into_iter(), + ) + .unwrap() + .unwrap(); + + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.num_rows(), 3); + } }