Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fixed empty reader panic for NDJSON type infer (#974)
Browse files Browse the repository at this point in the history
  • Loading branch information
Roberto-XY authored May 1, 2022
1 parent fa34796 commit 3c64e7a
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 2 deletions.
16 changes: 16 additions & 0 deletions src/io/json/read/infer_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,18 @@ fn infer_number(n: &serde_json::Number) -> DataType {
}

/// Coerce an heterogeneous set of [`DataType`] into a single one. Rules:
/// * The empty set is coerced to `Null`
/// * `Int64` and `Float64` are `Float64`
/// * Lists and scalars are coerced to a list of a compatible scalar
/// * Structs contain the union of all fields
/// * All other types are coerced to `Utf8`
pub(crate) fn coerce_data_type<A: Borrow<DataType>>(datatypes: &[A]) -> DataType {
use DataType::*;

if datatypes.is_empty() {
return DataType::Null;
}

let are_all_equal = datatypes.windows(2).all(|w| w[0].borrow() == w[1].borrow());

if are_all_equal {
Expand Down Expand Up @@ -186,4 +191,15 @@ mod test {
List(Box::new(Field::new(ITEM_NAME, Utf8, true))),
);
}

#[test]
fn test_coersion_of_nulls() {
assert_eq!(coerce_data_type(&[DataType::Null]), DataType::Null);
assert_eq!(
coerce_data_type(&[DataType::Null, DataType::Boolean]),
DataType::Utf8
);
let vec: Vec<DataType> = vec![];
assert_eq!(coerce_data_type(vec.as_slice()), DataType::Null);
}
}
7 changes: 7 additions & 0 deletions src/io/ndjson/read/deserialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ use super::super::super::json::read::_deserialize;
/// # Errors
/// This function errors iff any of the rows is not a valid JSON (i.e. the format is not valid NDJSON).
pub fn deserialize(rows: &[String], data_type: DataType) -> Result<Arc<dyn Array>, ArrowError> {
if rows.is_empty() {
return Err(ArrowError::ExternalFormat(
"Cannot deserialize 0 NDJSON rows because empty string is not a valid JSON value"
.to_string(),
));
}

// deserialize strings to `Value`s
let rows = rows
.iter()
Expand Down
8 changes: 7 additions & 1 deletion src/io/ndjson/read/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,19 @@ impl<R: BufRead> FallibleStreamingIterator for FileReader<R> {

/// Infers the [`DataType`] from an NDJSON file, optionally only using `number_of_rows` rows.
///
/// # Implementantion
/// # Implementation
/// This implementation reads the file line by line and infers the type of each line.
/// It performs both `O(N)` IO and CPU-bounded operations where `N` is the number of rows.
pub fn infer<R: std::io::BufRead>(
reader: &mut R,
number_of_rows: Option<usize>,
) -> Result<DataType> {
if reader.fill_buf().map(|b| b.is_empty())? {
return Err(ArrowError::ExternalFormat(
"Cannot infer NDJSON types on empty reader because empty string is not a valid JSON value".to_string(),
));
}

let rows = vec!["".to_string(); 1]; // 1 <=> read row by row
let mut reader = FileReader::new(reader, rows, number_of_rows);

Expand Down
31 changes: 30 additions & 1 deletion tests/it/io/ndjson/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::sync::Arc;

use arrow2::array::*;
use arrow2::datatypes::{DataType, Field};
use arrow2::error::Result;
use arrow2::error::{ArrowError, Result};
use arrow2::io::ndjson::read as ndjson_read;
use arrow2::io::ndjson::read::FallibleStreamingIterator;

Expand Down Expand Up @@ -57,6 +57,35 @@ fn infer_nullable() -> Result<()> {
Ok(())
}

#[test]
fn read_null() -> Result<()> {
let ndjson = "null";
let expected_data_type = DataType::Null;

let data_type = infer(ndjson)?;
assert_eq!(expected_data_type, data_type);

let arrays = read_and_deserialize(ndjson, &data_type, 1000)?;
let expected = NullArray::new(data_type, 1);
assert_eq!(expected, arrays[0].as_ref());
Ok(())
}

#[test]
fn read_empty_reader() -> Result<()> {
let ndjson = "";

let infer_error = infer(ndjson);
assert!(matches!(infer_error, Err(ArrowError::ExternalFormat(_))));

let deserialize_error = ndjson_read::deserialize(&[], DataType::Null);
assert!(matches!(
deserialize_error,
Err(ArrowError::ExternalFormat(_))
));
Ok(())
}

fn case_nested_struct() -> (String, Arc<dyn Array>) {
let ndjson = r#"{"a": {"a": 2.0, "b": 2}}
{"a": {"b": 2}}
Expand Down

0 comments on commit 3c64e7a

Please sign in to comment.