fix: excel data type handling #3051

Merged · 6 commits · Jun 28, 2024
2 changes: 2 additions & 0 deletions crates/datasources/src/excel/errors.rs
@@ -11,6 +11,8 @@ use crate::object_store::errors::ObjectStoreSourceError;
pub enum ExcelError {
#[error("Failed to load XLSX: {0}")]
Load(String),
#[error("Cannot parse cell value")]
Parse,
#[error("Failed to create record batch: {0}")]
CreateRecordBatch(#[from] ArrowError),
#[error(transparent)]
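The new Parse variant is payload-free; its message comes entirely from the #[error(...)] attribute. A minimal standalone sketch (assuming the thiserror derive that the surrounding attributes indicate) of how the variant renders:

use thiserror::Error;

// Trimmed-down sketch of the enum above: thiserror derives Display from the
// #[error(...)] attributes, so the unit variant Parse still carries a readable
// message without holding a value.
#[derive(Debug, Error)]
pub enum ExcelError {
    #[error("Failed to load XLSX: {0}")]
    Load(String),
    #[error("Cannot parse cell value")]
    Parse,
}

fn main() {
    assert_eq!(ExcelError::Parse.to_string(), "Cannot parse cell value");
    assert_eq!(
        ExcelError::Load("bad zip".to_string()).to_string(),
        "Failed to load XLSX: bad zip"
    );
}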
82 changes: 57 additions & 25 deletions crates/datasources/src/excel/mod.rs
@@ -3,7 +3,14 @@ use std::io::Cursor;
use std::sync::Arc;

use calamine::{DataType as CalamineDataType, Range, Reader, Sheets};
use datafusion::arrow::array::{ArrayRef, BooleanArray, Date64Array, PrimitiveArray, StringArray};
use datafusion::arrow::array::{
ArrayRef,
BooleanArray,
Date64Array,
NullArray,
PrimitiveArray,
StringArray,
};
use datafusion::arrow::datatypes::{DataType, Field, Float64Type, Int64Type, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use object_store::{ObjectMeta, ObjectStore};
@@ -121,11 +128,11 @@ fn xlsx_sheet_value_to_record_batch(
.collect::<BooleanArray>(),
) as ArrayRef,
DataType::Int64 => Arc::new(
rows.map(|r| r.get(i).and_then(|v| v.get_int()))
rows.map(|r| r.get(i).and_then(|v| v.as_i64()))
.collect::<PrimitiveArray<Int64Type>>(),
) as ArrayRef,
DataType::Float64 => Arc::new(
rows.map(|r| r.get(i).and_then(|v| v.get_float()))
rows.map(|r| r.get(i).and_then(|v| v.as_f64()))
.collect::<PrimitiveArray<Float64Type>>(),
) as ArrayRef,
DataType::Date64 => {
@@ -140,8 +147,9 @@
}
Arc::new(arr.finish())
}
DataType::Null => Arc::new(NullArray::new(rows.len())),
_ => Arc::new(
rows.map(|r| r.get(i).map(|v| v.get_string().unwrap_or("null")))
rows.map(|r| r.get(i).map(|v| v.as_string().unwrap_or_default()))
.collect::<StringArray>(),
) as ArrayRef,
}
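The accessor change above is the core of the fix: calamine's get_int/get_float only return a value when the cell is stored as exactly that variant, while as_i64/as_f64 attempt a conversion, so numeric cells that Excel stores as floats no longer come back as nulls in Int64 columns. A minimal sketch of the difference, assuming calamine's Data enum and DataType trait behave as described (the exact coercion rules belong to calamine and are not guaranteed here):

use calamine::{Data, DataType};

fn main() {
    // Excel frequently stores whole numbers as floats, so a cell displaying "5"
    // can arrive as Data::Float(5.0).
    let cell = Data::Float(5.0);

    // Strict accessor: only matches the Int variant, so an Int64 column built
    // with get_int() reads this cell as null.
    assert_eq!(cell.get_int(), None);

    // Coercing accessor: converts the float to an i64, which is what the
    // Int64 arm now relies on.
    assert_eq!(cell.as_i64(), Some(5));

    // Empty cells still yield None and become nulls in the Arrow arrays.
    assert_eq!(Data::Empty.as_i64(), None);
}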
@@ -158,27 +166,30 @@ pub fn infer_schema(
) -> Result<Schema, ExcelError> {
let mut col_types: HashMap<&str, HashSet<DataType>> = HashMap::new();
let mut rows = r.rows();
let col_names: Vec<String> = rows
.next()
.unwrap()
.iter()
.enumerate()
.map(
|(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) {
(true, Some(s)) => Ok(s),
(true, None) => Err(ExcelError::Load("failed to parse header".to_string())),
(false, _) => Ok(format!("col{}", i)),
},
)
.collect::<Result<_, _>>()?;
let col_names: Vec<String> = if has_header {
rows.next()
.unwrap()
.iter()
.enumerate()
.map(
|(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) {
(true, Some(s)) => Ok(s),
(true, None) => Err(ExcelError::Load("failed to parse header".to_string())),
(false, _) => Ok(format!("col{}", i)),
},
)
.collect::<Result<_, _>>()?
} else {
(0..r.rows().next().unwrap_or_default().len())
.map(|n| format!("{}", n))
.collect()
};


for row in rows.take(infer_schema_length) {
for (i, col_val) in row.iter().enumerate() {
let col_name = col_names.get(i).unwrap();
if let Ok(col_type) = infer_value_type(col_val) {
if col_type == DataType::Null {
continue;
}
let entry = col_types.entry(col_name).or_default();
entry.insert(col_type);
} else {
@@ -196,8 +207,27 @@
set
});

let dt = set.iter().next().cloned().unwrap_or(DataType::Utf8);
Field::new(col_name.replace(' ', "_"), dt, true)
let field_name = col_name.replace(' ', "_");

if set.len() == 1 {
Field::new(
field_name,
set.iter().next().cloned().unwrap_or(DataType::Utf8),
true,
)
} else if set.contains(&DataType::Utf8) {
Field::new(field_name, DataType::Utf8, true)
} else if set.contains(&DataType::Float64) {
Field::new(field_name, DataType::Float64, true)
} else if set.contains(&DataType::Int64) {
Field::new(field_name, DataType::Int64, true)
} else if set.contains(&DataType::Boolean) {
Field::new(field_name, DataType::Boolean, true)
} else if set.contains(&DataType::Null) {
Field::new(field_name, DataType::Null, true)
} else {
Field::new(field_name, DataType::Utf8, true)
}
})
.collect();
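When a column shows more than one type across the sampled rows, the chain above widens to the most permissive type observed: anything mixed with text becomes Utf8, ints mixed with floats become Float64, and Boolean and Null rank below those. A small self-contained sketch of that priority rule (the same logic, extracted here purely for illustration):

use std::collections::HashSet;

use datafusion::arrow::datatypes::DataType;

// Sketch of the widening rule: pick the widest Arrow type observed for a
// column, defaulting to Utf8 when the set is empty or unrecognized.
fn widen(set: &HashSet<DataType>) -> DataType {
    if set.len() == 1 {
        set.iter().next().cloned().unwrap_or(DataType::Utf8)
    } else if set.contains(&DataType::Utf8) {
        DataType::Utf8
    } else if set.contains(&DataType::Float64) {
        DataType::Float64
    } else if set.contains(&DataType::Int64) {
        DataType::Int64
    } else if set.contains(&DataType::Boolean) {
        DataType::Boolean
    } else if set.contains(&DataType::Null) {
        DataType::Null
    } else {
        DataType::Utf8
    }
}

fn main() {
    // A column sampled as [1, 2.5, 3] mixes Int64 and Float64 and widens to Float64.
    let ints_and_floats: HashSet<_> = [DataType::Int64, DataType::Float64].into_iter().collect();
    assert_eq!(widen(&ints_and_floats), DataType::Float64);

    // Any column that also contains text falls back to Utf8.
    let with_text: HashSet<_> = [DataType::Float64, DataType::Utf8].into_iter().collect();
    assert_eq!(widen(&with_text), DataType::Utf8);
}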

@@ -210,11 +240,13 @@ fn infer_value_type(v: &calamine::Data) -> Result<DataType, ExcelError> {
calamine::Data::Float(_) => Ok(DataType::Float64),
calamine::Data::Bool(_) => Ok(DataType::Boolean),
calamine::Data::String(_) => Ok(DataType::Utf8),
// TODO: parsing value errors that we get from the calamine
// library could either become nulls or errors. Right now they
// are errors, and this should probably be configurable.
calamine::Data::Error(e) => Err(ExcelError::Load(e.to_string())),
calamine::Data::DateTime(_) => Ok(DataType::Date64),
calamine::Data::Empty => Ok(DataType::Null),
_ => Err(ExcelError::Load(
"Failed to parse the cell value".to_owned(),
)),
_ => Err(ExcelError::Parse),
}
}
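With the NullArray arm added to the batch builder, a column whose field was inferred as DataType::Null is materialized as a NullArray sized to the row count instead of falling into the catch-all string arm. A minimal sketch of such a batch using datafusion's re-exported arrow API (the column names are made up for illustration):

use std::sync::Arc;

use datafusion::arrow::array::{ArrayRef, Int64Array, NullArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical layout: one typed column plus one column inferred as Null.
    let schema = Arc::new(Schema::new(vec![
        Field::new("amount", DataType::Int64, true),
        Field::new("notes", DataType::Null, true),
    ]));
    let amount: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), Some(2), None]));
    let notes: ArrayRef = Arc::new(NullArray::new(3));
    let batch = RecordBatch::try_new(schema, vec![amount, notes])?;
    assert_eq!(batch.num_rows(), 3);
    Ok(())
}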
24 changes: 17 additions & 7 deletions testdata/sqllogictests/xlsx.slt
@@ -118,10 +118,10 @@ create external table bad_report from excel options(location='./invalid_path/ran

# should default to has_header=true and the first sheet of the file, if not specified
statement ok
create external table html_table from excel options(location='https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx');
create external table http_table from excel options(location='https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx');

query
select "Resources", "Cost", "Revenue" from html_table;
select "Resources", "Cost", "Revenue" from http_table;
----
1 10 100
2 20 200
@@ -131,18 +131,28 @@ select "Resources", "Cost", "Revenue" from html_table;

# should default to has_header=true and the first sheet of the file, if not specified
query
select "Resources", "Cost", "Revenue" from read_excel('https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx');
select "Resources", "Cost", "Revenue" from read_excel('./testdata/xlsx/multiple_sheets.xlsx');
----
1 10 100
2 20 200
3 30 300
4 40 400
5 50 500

query
select * from read_excel('https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'other', has_header => false);
query T
select * from read_excel('./testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'other', has_header => false);
----
NULL
HEADING
1
2
3
3

query
select * from read_excel('./testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'multiple_data_types', has_header => true);
----
1 1 foo foo
2 2 bar bar
3 3 baz baz
foo 4.0 4 4.0
5 5.0 5 5.0
bar (empty) (empty) (empty)
Binary file modified testdata/xlsx/multiple_sheets.xlsx