Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support to infer nested json
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Jan 10, 2022
1 parent 4c29966 commit a388691
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 27 deletions.
91 changes: 69 additions & 22 deletions src/io/json/read/infer_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ use super::iterator::ValueIter;

type Tracker = HashMap<String, HashSet<DataType>>;

const ITEM_NAME: &str = "item";

/// Infers the fields of a JSON file by reading the first `number_of_rows` rows.
/// # Examples
/// ```
Expand Down Expand Up @@ -92,10 +94,12 @@ fn infer_value(value: &Value) -> Result<DataType> {
Value::Null => DataType::Null,
Value::Number(number) => infer_number(number),
Value::String(_) => DataType::Utf8,
Value::Object(_) => {
return Err(ArrowError::NotYetImplemented(
"Inferring schema from nested JSON structs currently not supported".to_string(),
))
Value::Object(inner) => {
let fields = inner
.iter()
.map(|(key, value)| infer_value(value).map(|dt| Field::new(key, dt, true)))
.collect::<Result<Vec<_>>>()?;
DataType::Struct(fields)
}
})
}
Expand Down Expand Up @@ -127,7 +131,7 @@ fn infer_array(array: &[Value]) -> Result<DataType> {
Ok(if !types.is_empty() {
let types = types.into_iter().collect::<Vec<_>>();
let dt = coerce_data_type(&types);
DataType::List(Box::new(Field::new("item", dt, true)))
DataType::List(Box::new(Field::new(ITEM_NAME, dt, true)))
} else {
DataType::Null
})
Expand Down Expand Up @@ -168,29 +172,66 @@ fn resolve_fields(spec: HashMap<String, HashSet<DataType>>) -> Vec<Field> {
/// Coerce an heterogeneous set of [`DataType`] into a single one. Rules:
/// * `Int64` and `Float64` are `Float64`
/// * Lists and scalars are coerced to a list of a compatible scalar
/// * Structs contain the union of all fields
/// * All other types are coerced to `Utf8`
fn coerce_data_type<A: Borrow<DataType>>(dt: &[A]) -> DataType {
fn coerce_data_type<A: Borrow<DataType>>(datatypes: &[A]) -> DataType {
use DataType::*;
if dt.len() == 1 {
return dt[0].borrow().clone();
} else if dt.len() > 2 {
return List(Box::new(Field::new("item", Utf8, true)));

let are_all_equal = datatypes.windows(2).all(|w| w[0].borrow() == w[1].borrow());

if are_all_equal {
return datatypes[0].borrow().clone();
}

let are_all_structs = datatypes.iter().all(|x| matches!(x.borrow(), Struct(_)));

if are_all_structs {
// all are structs => union of all fields (that may have equal names)
let fields = datatypes.iter().fold(vec![], |mut acc, dt| {
if let Struct(new_fields) = dt.borrow() {
acc.extend(new_fields);
};
acc
});
// group fields by unique
let fields = fields.iter().fold(
HashMap::<&String, Vec<&DataType>>::new(),
|mut acc, field| {
match acc.entry(&field.name) {
indexmap::map::Entry::Occupied(mut v) => {
v.get_mut().push(&field.data_type);
}
indexmap::map::Entry::Vacant(v) => {
v.insert(vec![&field.data_type]);
}
}
acc
},
);
// and finally, coerce each of the fields within the same name
let fields = fields
.into_iter()
.map(|(name, dts)| Field::new(name, coerce_data_type(&dts), true))
.collect();
return Struct(fields);
} else if datatypes.len() > 2 {
return Utf8;
}
let (lhs, rhs) = (dt[0].borrow(), dt[1].borrow());
let (lhs, rhs) = (datatypes[0].borrow(), datatypes[1].borrow());

return match (lhs, rhs) {
(lhs, rhs) if lhs == rhs => lhs.clone(),
(List(lhs), List(rhs)) => {
let inner = coerce_data_type(&[lhs.data_type(), rhs.data_type()]);
List(Box::new(Field::new("item", inner, true)))
List(Box::new(Field::new(ITEM_NAME, inner, true)))
}
(scalar, List(list)) => {
let inner = coerce_data_type(&[scalar, list.data_type()]);
List(Box::new(Field::new("item", inner, true)))
List(Box::new(Field::new(ITEM_NAME, inner, true)))
}
(List(list), scalar) => {
let inner = coerce_data_type(&[scalar, list.data_type()]);
List(Box::new(Field::new("item", inner, true)))
List(Box::new(Field::new(ITEM_NAME, inner, true)))
}
(Float64, Int64) => Float64,
(Int64, Float64) => Float64,
Expand All @@ -209,21 +250,27 @@ mod test {
use crate::datatypes::DataType::*;

assert_eq!(
coerce_data_type(&[Float64, List(Box::new(Field::new("item", Float64, true)))]),
List(Box::new(Field::new("item", Float64, true))),
coerce_data_type(&[
Float64,
List(Box::new(Field::new(ITEM_NAME, Float64, true)))
]),
List(Box::new(Field::new(ITEM_NAME, Float64, true))),
);
assert_eq!(
coerce_data_type(&[Float64, List(Box::new(Field::new("item", Int64, true)))]),
List(Box::new(Field::new("item", Float64, true))),
coerce_data_type(&[Float64, List(Box::new(Field::new(ITEM_NAME, Int64, true)))]),
List(Box::new(Field::new(ITEM_NAME, Float64, true))),
);
assert_eq!(
coerce_data_type(&[Int64, List(Box::new(Field::new("item", Int64, true)))]),
List(Box::new(Field::new("item", Int64, true))),
coerce_data_type(&[Int64, List(Box::new(Field::new(ITEM_NAME, Int64, true)))]),
List(Box::new(Field::new(ITEM_NAME, Int64, true))),
);
// boolean and number are incompatible, return utf8
assert_eq!(
coerce_data_type(&[Boolean, List(Box::new(Field::new("item", Float64, true)))]),
List(Box::new(Field::new("item", Utf8, true))),
coerce_data_type(&[
Boolean,
List(Box::new(Field::new(ITEM_NAME, Float64, true)))
]),
List(Box::new(Field::new(ITEM_NAME, Utf8, true))),
);
}
}
30 changes: 25 additions & 5 deletions tests/it/io/json/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,15 +139,35 @@ fn infer_schema_mixed_list() -> Result<()> {
DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
true,
),
Field::new(
"d",
DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
true,
),
Field::new("d", DataType::Utf8, true),
];

let result = read::infer(&mut Cursor::new(data), None)?;

assert_eq!(result, fields);
Ok(())
}

#[test]
fn infer_nested_struct() -> Result<()> {
let data = r#"{"a": {"a": 2.0, "b": 2}}
{"a": {"b": 2}}
{"a": {"a": 2.0, "b": 2, "c": true}}
{"a": {"a": 2.0, "b": 2}}
"#;

let fields = vec![Field::new(
"a",
DataType::Struct(vec![
Field::new("a", DataType::Float64, true),
Field::new("b", DataType::Int64, true),
Field::new("c", DataType::Boolean, true),
]),
true,
)];

let result = read::infer(&mut Cursor::new(data), None)?;

assert_eq!(result, fields);
Ok(())
}

0 comments on commit a388691

Please sign in to comment.