diff --git a/src/io/json/read/infer_schema.rs b/src/io/json/read/infer_schema.rs
index 0bfbaeb31c1..336264c4313 100644
--- a/src/io/json/read/infer_schema.rs
+++ b/src/io/json/read/infer_schema.rs
@@ -78,6 +78,7 @@ fn infer_number(n: &serde_json::Number) -> DataType {
 }
 
 /// Coerce an heterogeneous set of [`DataType`] into a single one. Rules:
+/// * The empty set is coerced to `Null`
 /// * `Int64` and `Float64` are `Float64`
 /// * Lists and scalars are coerced to a list of a compatible scalar
 /// * Structs contain the union of all fields
@@ -85,6 +86,11 @@ fn infer_number(n: &serde_json::Number) -> DataType {
 pub(crate) fn coerce_data_type<A: Borrow<DataType>>(datatypes: &[A]) -> DataType {
     use DataType::*;
 
+    // An empty slice has no windows, so the equality check below would be vacuously true.
+    if datatypes.is_empty() {
+        return DataType::Null;
+    }
+
     let are_all_equal = datatypes.windows(2).all(|w| w[0].borrow() == w[1].borrow());
 
     if are_all_equal {
@@ -186,4 +192,15 @@ mod test {
             List(Box::new(Field::new(ITEM_NAME, Utf8, true))),
         );
     }
+
+    #[test]
+    fn test_coercion_of_nulls() {
+        assert_eq!(coerce_data_type(&[DataType::Null]), DataType::Null);
+        assert_eq!(
+            coerce_data_type(&[DataType::Null, DataType::Boolean]),
+            DataType::Utf8
+        );
+        let vec: Vec<DataType> = vec![];
+        assert_eq!(coerce_data_type(vec.as_slice()), DataType::Null);
+    }
 }
diff --git a/src/io/ndjson/read/deserialize.rs b/src/io/ndjson/read/deserialize.rs
index 8364fec15db..e02927d0248 100644
--- a/src/io/ndjson/read/deserialize.rs
+++ b/src/io/ndjson/read/deserialize.rs
@@ -15,6 +15,14 @@ use super::super::super::json::read::_deserialize;
 /// # Errors
-/// This function errors iff any of the rows is not a valid JSON (i.e. the format is not valid NDJSON).
+/// This function errors iff `rows` is empty or any of the rows is not a valid JSON
+/// (i.e. the format is not valid NDJSON).
 pub fn deserialize(rows: &[String], data_type: DataType) -> Result<Arc<dyn Array>, ArrowError> {
+    if rows.is_empty() {
+        return Err(ArrowError::ExternalFormat(
+            "Cannot deserialize 0 NDJSON rows because empty string is not a valid JSON value"
+                .to_string(),
+        ));
+    }
+
     // deserialize strings to `Value`s
     let rows = rows
         .iter()
diff --git a/src/io/ndjson/read/file.rs b/src/io/ndjson/read/file.rs
index 9ec9a73a8c6..c45dc139e1d 100644
--- a/src/io/ndjson/read/file.rs
+++ b/src/io/ndjson/read/file.rs
@@ -97,13 +97,20 @@ impl<R: BufRead> FallibleStreamingIterator for FileReader<R> {
 
 /// Infers the [`DataType`] from an NDJSON file, optionally only using `number_of_rows` rows.
 ///
-/// # Implementantion
+/// # Implementation
 /// This implementation reads the file line by line and infers the type of each line.
 /// It performs both `O(N)` IO and CPU-bounded operations where `N` is the number of rows.
 pub fn infer<R: std::io::BufRead>(
     reader: &mut R,
     number_of_rows: Option<usize>,
 ) -> Result<DataType, ArrowError> {
+    // `fill_buf` does not consume input; an empty buffer means the reader is at EOF.
+    if reader.fill_buf().map(|b| b.is_empty())? {
+        return Err(ArrowError::ExternalFormat(
+            "Cannot infer NDJSON types on empty reader because empty string is not a valid JSON value".to_string(),
+        ));
+    }
+
     let rows = vec!["".to_string(); 1]; // 1 <=> read row by row
     let mut reader = FileReader::new(reader, rows, number_of_rows);
 
diff --git a/tests/it/io/ndjson/read.rs b/tests/it/io/ndjson/read.rs
index 489fd548503..4b74248c569 100644
--- a/tests/it/io/ndjson/read.rs
+++ b/tests/it/io/ndjson/read.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;
 
 use arrow2::array::*;
 use arrow2::datatypes::{DataType, Field};
-use arrow2::error::Result;
+use arrow2::error::{ArrowError, Result};
 use arrow2::io::ndjson::read as ndjson_read;
 use arrow2::io::ndjson::read::FallibleStreamingIterator;
 
@@ -57,6 +57,35 @@ fn infer_nullable() -> Result<()> {
     Ok(())
 }
 
+#[test]
+fn read_null() -> Result<()> {
+    let ndjson = "null";
+    let expected_data_type = DataType::Null;
+
+    let data_type = infer(ndjson)?;
+    assert_eq!(expected_data_type, data_type);
+
+    let arrays = read_and_deserialize(ndjson, &data_type, 1000)?;
+    let expected = NullArray::new(data_type, 1);
+    assert_eq!(expected, arrays[0].as_ref());
+    Ok(())
+}
+
+#[test]
+fn read_empty_reader() -> Result<()> {
+    let ndjson = "";
+
+    let infer_error = infer(ndjson);
+    assert!(matches!(infer_error, Err(ArrowError::ExternalFormat(_))));
+
+    let deserialize_error = ndjson_read::deserialize(&[], DataType::Null);
+    assert!(matches!(
+        deserialize_error,
+        Err(ArrowError::ExternalFormat(_))
+    ));
+    Ok(())
+}
+
 fn case_nested_struct() -> (String, Arc<dyn Array>) {
     let ndjson = r#"{"a": {"a": 2.0, "b": 2}}
 {"a": {"b": 2}}