-
Notifications
You must be signed in to change notification settings - Fork 224
Added interoperability with arrow-schema #1442
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,6 +160,126 @@ pub enum DataType { | |
Extension(String, Box<DataType>, Option<String>), | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<DataType> for arrow_schema::DataType { | ||
fn from(value: DataType) -> Self { | ||
match value { | ||
DataType::Null => Self::Null, | ||
DataType::Boolean => Self::Boolean, | ||
DataType::Int8 => Self::Int8, | ||
DataType::Int16 => Self::Int16, | ||
DataType::Int32 => Self::Int32, | ||
DataType::Int64 => Self::Int64, | ||
DataType::UInt8 => Self::UInt8, | ||
DataType::UInt16 => Self::UInt16, | ||
DataType::UInt32 => Self::UInt32, | ||
DataType::UInt64 => Self::UInt64, | ||
DataType::Float16 => Self::Float16, | ||
DataType::Float32 => Self::Float32, | ||
DataType::Float64 => Self::Float64, | ||
DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz), | ||
DataType::Date32 => Self::Date32, | ||
DataType::Date64 => Self::Date64, | ||
DataType::Time32(unit) => Self::Time32(unit.into()), | ||
DataType::Time64(unit) => Self::Time64(unit.into()), | ||
DataType::Duration(unit) => Self::Duration(unit.into()), | ||
DataType::Interval(unit) => Self::Interval(unit.into()), | ||
DataType::Binary => Self::Binary, | ||
DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(size as _), | ||
DataType::LargeBinary => Self::LargeBinary, | ||
DataType::Utf8 => Self::Utf8, | ||
DataType::LargeUtf8 => Self::LargeUtf8, | ||
DataType::List(f) => Self::List(Box::new((*f).into())), | ||
DataType::FixedSizeList(f, size) => { | ||
Self::FixedSizeList(Box::new((*f).into()), size as _) | ||
} | ||
DataType::LargeList(f) => Self::LargeList(Box::new((*f).into())), | ||
DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()), | ||
DataType::Union(fields, Some(ids), mode) => { | ||
let ids = ids.into_iter().map(|x| x as _).collect(); | ||
let fields = fields.into_iter().map(Into::into).collect(); | ||
Self::Union(fields, ids, mode.into()) | ||
} | ||
DataType::Union(fields, None, mode) => { | ||
let ids = (0..fields.len() as i8).collect(); | ||
let fields = fields.into_iter().map(Into::into).collect(); | ||
Self::Union(fields, ids, mode.into()) | ||
} | ||
DataType::Map(f, ordered) => Self::Map(Box::new((*f).into()), ordered), | ||
DataType::Dictionary(key, value, _) => Self::Dictionary( | ||
Box::new(DataType::from(key).into()), | ||
Box::new((*value).into()), | ||
), | ||
DataType::Decimal(precision, scale) => Self::Decimal128(precision as _, scale as _), | ||
DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _), | ||
DataType::Extension(_, d, _) => (*d).into(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm curious why arrow2 chose to represent extension types as an explicit data type, as opposed to just field metadata? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is so that the For example, polars uses it to store arbitrary Python objects on the type. In theory this could be kept in the Field's metadata. Pyarrow does the same: https://arrow.apache.org/docs/python/generated/pyarrow.ExtensionType.html |
||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<arrow_schema::DataType> for DataType { | ||
fn from(value: arrow_schema::DataType) -> Self { | ||
use arrow_schema::DataType; | ||
match value { | ||
DataType::Null => Self::Null, | ||
DataType::Boolean => Self::Boolean, | ||
DataType::Int8 => Self::Int8, | ||
DataType::Int16 => Self::Int16, | ||
DataType::Int32 => Self::Int32, | ||
DataType::Int64 => Self::Int64, | ||
DataType::UInt8 => Self::UInt8, | ||
DataType::UInt16 => Self::UInt16, | ||
DataType::UInt32 => Self::UInt32, | ||
DataType::UInt64 => Self::UInt64, | ||
DataType::Float16 => Self::Float16, | ||
DataType::Float32 => Self::Float32, | ||
DataType::Float64 => Self::Float64, | ||
DataType::Timestamp(unit, tz) => Self::Timestamp(unit.into(), tz), | ||
DataType::Date32 => Self::Date32, | ||
DataType::Date64 => Self::Date64, | ||
DataType::Time32(unit) => Self::Time32(unit.into()), | ||
DataType::Time64(unit) => Self::Time64(unit.into()), | ||
DataType::Duration(unit) => Self::Duration(unit.into()), | ||
DataType::Interval(unit) => Self::Interval(unit.into()), | ||
DataType::Binary => Self::Binary, | ||
DataType::FixedSizeBinary(size) => Self::FixedSizeBinary(size as _), | ||
DataType::LargeBinary => Self::LargeBinary, | ||
DataType::Utf8 => Self::Utf8, | ||
DataType::LargeUtf8 => Self::LargeUtf8, | ||
DataType::List(f) => Self::List(Box::new((*f).into())), | ||
DataType::FixedSizeList(f, size) => { | ||
Self::FixedSizeList(Box::new((*f).into()), size as _) | ||
} | ||
DataType::LargeList(f) => Self::LargeList(Box::new((*f).into())), | ||
DataType::Struct(f) => Self::Struct(f.into_iter().map(Into::into).collect()), | ||
DataType::Union(fields, ids, mode) => { | ||
let ids = ids.into_iter().map(|x| x as _).collect(); | ||
let fields = fields.into_iter().map(Into::into).collect(); | ||
Self::Union(fields, Some(ids), mode.into()) | ||
} | ||
DataType::Map(f, ordered) => Self::Map(Box::new((*f).into()), ordered), | ||
DataType::Dictionary(key, value) => { | ||
let key = match *key { | ||
DataType::Int8 => IntegerType::Int8, | ||
DataType::Int16 => IntegerType::Int16, | ||
DataType::Int32 => IntegerType::Int32, | ||
DataType::Int64 => IntegerType::Int64, | ||
DataType::UInt8 => IntegerType::UInt8, | ||
DataType::UInt16 => IntegerType::UInt16, | ||
DataType::UInt32 => IntegerType::UInt32, | ||
DataType::UInt64 => IntegerType::UInt64, | ||
d => panic!("illegal dictionary key type: {d}"), | ||
}; | ||
Self::Dictionary(key, Box::new((*value).into()), false) | ||
} | ||
DataType::Decimal128(precision, scale) => Self::Decimal(precision as _, scale as _), | ||
DataType::Decimal256(precision, scale) => Self::Decimal256(precision as _, scale as _), | ||
DataType::RunEndEncoded(_, _) => panic!("Run-end encoding not supported by arrow2"), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could instead implement TryFrom, but it seemed a touch excessive for a single error case |
||
} | ||
} | ||
} | ||
|
||
/// Mode of [`DataType::Union`] | ||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | ||
#[cfg_attr(feature = "serde_types", derive(Serialize, Deserialize))] | ||
|
@@ -170,6 +290,26 @@ pub enum UnionMode { | |
Sparse, | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<UnionMode> for arrow_schema::UnionMode { | ||
fn from(value: UnionMode) -> Self { | ||
match value { | ||
UnionMode::Dense => Self::Dense, | ||
UnionMode::Sparse => Self::Sparse, | ||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<arrow_schema::UnionMode> for UnionMode { | ||
fn from(value: arrow_schema::UnionMode) -> Self { | ||
match value { | ||
arrow_schema::UnionMode::Dense => Self::Dense, | ||
arrow_schema::UnionMode::Sparse => Self::Sparse, | ||
} | ||
} | ||
} | ||
|
||
impl UnionMode { | ||
/// Constructs a [`UnionMode::Sparse`] if the input bool is true, | ||
/// or otherwise constructs a [`UnionMode::Dense`] | ||
|
@@ -206,6 +346,30 @@ pub enum TimeUnit { | |
Nanosecond, | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<TimeUnit> for arrow_schema::TimeUnit { | ||
fn from(value: TimeUnit) -> Self { | ||
match value { | ||
TimeUnit::Nanosecond => Self::Nanosecond, | ||
TimeUnit::Millisecond => Self::Millisecond, | ||
TimeUnit::Microsecond => Self::Microsecond, | ||
TimeUnit::Second => Self::Second, | ||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<arrow_schema::TimeUnit> for TimeUnit { | ||
fn from(value: arrow_schema::TimeUnit) -> Self { | ||
match value { | ||
arrow_schema::TimeUnit::Nanosecond => Self::Nanosecond, | ||
arrow_schema::TimeUnit::Millisecond => Self::Millisecond, | ||
arrow_schema::TimeUnit::Microsecond => Self::Microsecond, | ||
arrow_schema::TimeUnit::Second => Self::Second, | ||
} | ||
} | ||
} | ||
|
||
/// Interval units defined in Arrow | ||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] | ||
#[cfg_attr(feature = "serde_types", derive(Serialize, Deserialize))] | ||
|
@@ -219,6 +383,28 @@ pub enum IntervalUnit { | |
MonthDayNano, | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<IntervalUnit> for arrow_schema::IntervalUnit { | ||
fn from(value: IntervalUnit) -> Self { | ||
match value { | ||
IntervalUnit::YearMonth => Self::YearMonth, | ||
IntervalUnit::DayTime => Self::DayTime, | ||
IntervalUnit::MonthDayNano => Self::MonthDayNano, | ||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "arrow")] | ||
impl From<arrow_schema::IntervalUnit> for IntervalUnit { | ||
fn from(value: arrow_schema::IntervalUnit) -> Self { | ||
match value { | ||
arrow_schema::IntervalUnit::YearMonth => Self::YearMonth, | ||
arrow_schema::IntervalUnit::DayTime => Self::DayTime, | ||
arrow_schema::IntervalUnit::MonthDayNano => Self::MonthDayNano, | ||
} | ||
} | ||
} | ||
|
||
impl DataType { | ||
/// the [`PhysicalType`] of this [`DataType`]. | ||
pub fn to_physical_type(&self) -> PhysicalType { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This conversion uses truncating
as _
casts, as provided the type is valid these cannot overflow / underflow. I'm not sure having a sensible behaviour for things like negative size FixedSizeList is necessary, garbage in garbage out was my thoughts