From 7f973fe499ae575f7e1191a9ec302044c9f23c86 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 27 Dec 2021 07:56:03 +0000 Subject: [PATCH] Removed Option from Field's metadata --- examples/metadata.rs | 4 +- src/datatypes/field.rs | 102 +++++------------------- src/datatypes/mod.rs | 44 +++++----- src/ffi/schema.rs | 14 ++-- src/io/avro/read/schema.rs | 24 +++--- src/io/ipc/read/schema.rs | 18 ++--- src/io/ipc/write/schema.rs | 12 +-- src/io/json_integration/read/schema.rs | 16 ++-- src/io/json_integration/write/schema.rs | 29 +++---- 9 files changed, 93 insertions(+), 170 deletions(-) diff --git a/examples/metadata.rs b/examples/metadata.rs index 6b020d7b288..bbfc830df5a 100644 --- a/examples/metadata.rs +++ b/examples/metadata.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashMap}; -use arrow2::datatypes::{DataType, Field, Schema}; +use arrow2::datatypes::{DataType, Field, Metadata, Schema}; fn main() { // two data types (logical types) @@ -12,7 +12,7 @@ fn main() { let field2 = Field::new("c2", type2_, true); // which can contain extra metadata: - let mut metadata = BTreeMap::new(); + let mut metadata = Metadata::new(); metadata.insert( "Office Space".to_string(), "Deals with real issues in the workplace.".to_string(), diff --git a/src/datatypes/field.rs b/src/datatypes/field.rs index fc40864b1fd..fad96c7d40a 100644 --- a/src/datatypes/field.rs +++ b/src/datatypes/field.rs @@ -1,56 +1,18 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::collections::BTreeMap; - use crate::error::{ArrowError, Result}; -use super::DataType; +use super::{DataType, Metadata}; -/// A logical [`DataType`] and its associated metadata per -/// [Arrow specification](https://arrow.apache.org/docs/cpp/api/datatype.html) -#[derive(Debug, Clone, Eq)] +/// Represents the metadata of a "column". +#[derive(Debug, Clone, Eq, PartialEq, Hash)] pub struct Field { /// Its name pub name: String, /// Its logical [`DataType`] pub data_type: DataType, - /// Whether its values can be null or not + /// Its nullability pub nullable: bool, - /// A map of key-value pairs containing additional custom meta data. - pub metadata: Option>, -} - -impl std::hash::Hash for Field { - fn hash(&self, state: &mut H) { - self.name.hash(state); - self.data_type.hash(state); - self.nullable.hash(state); - self.metadata.hash(state); - } -} - -impl PartialEq for Field { - fn eq(&self, other: &Self) -> bool { - self.name == other.name - && self.data_type == other.data_type - && self.nullable == other.nullable - && self.metadata == other.metadata - } + /// Additional custom (opaque) metadata. + pub metadata: Metadata, } impl Field { @@ -60,36 +22,24 @@ impl Field { name: name.into(), data_type, nullable, - metadata: None, + metadata: Default::default(), } } /// Creates a new [`Field`] with metadata. #[inline] - pub fn with_metadata(self, metadata: BTreeMap) -> Self { + pub fn with_metadata(self, metadata: Metadata) -> Self { Self { name: self.name, data_type: self.data_type, nullable: self.nullable, - metadata: Some(metadata), + metadata, } } - /// Sets the [`Field`]'s optional metadata. - /// The metadata is set as `None` for empty map. + /// Returns the [`Field`]'s metadata. #[inline] - pub fn set_metadata(&mut self, metadata: Option>) { - self.metadata = None; - if let Some(v) = metadata { - if !v.is_empty() { - self.metadata = Some(v); - } - } - } - - /// Returns the [`Field`]'s optional custom metadata. - #[inline] - pub const fn metadata(&self) -> &Option> { + pub const fn metadata(&self) -> &Metadata { &self.metadata } @@ -105,7 +55,7 @@ impl Field { &self.data_type } - /// Returns the [`Field`] nullability. + /// Returns whether the [`Field`] should contain null values. #[inline] pub const fn is_nullable(&self) -> bool { self.nullable @@ -125,27 +75,19 @@ impl Field { /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<()> { // merge metadata - match (self.metadata(), from.metadata()) { - (Some(self_metadata), Some(from_metadata)) => { - let mut merged = self_metadata.clone(); - for (key, from_value) in from_metadata { - if let Some(self_value) = self_metadata.get(key) { - if self_value != from_value { - return Err(ArrowError::InvalidArgumentError(format!( - "Fail to merge field due to conflicting metadata data value for key {}", key), - )); - } - } else { - merged.insert(key.clone(), from_value.clone()); - } + for (key, from_value) in from.metadata() { + if let Some(self_value) = self.metadata.get(key) { + if self_value != from_value { + return Err(ArrowError::InvalidArgumentError(format!( + "Fail to merge field due to conflicting metadata data value for key {}", + key + ))); } - self.set_metadata(Some(merged)); + } else { + self.metadata.insert(key.clone(), from_value.clone()); } - (None, Some(from_metadata)) => { - self.set_metadata(Some(from_metadata.clone())); - } - _ => {} } + match &mut self.data_type { DataType::Struct(nested_fields) => match &from.data_type { DataType::Struct(from_nested_fields) => { diff --git a/src/datatypes/mod.rs b/src/datatypes/mod.rs index 858595c2c08..9d145b3f9eb 100644 --- a/src/datatypes/mod.rs +++ b/src/datatypes/mod.rs @@ -1,5 +1,7 @@ #![deny(missing_docs)] +#![forbid(unsafe_code)] //! Contains all metadata, such as [`PhysicalType`], [`DataType`], [`Field`] and [`Schema`]. + mod field; mod physical_type; mod schema; @@ -8,6 +10,14 @@ pub use field::Field; pub use physical_type::*; pub use schema::Schema; +use std::collections::BTreeMap; +use std::sync::Arc; + +/// typedef for [BTreeMap] denoting a [`Field`]'s metadata. +pub type Metadata = BTreeMap; +/// typedef fpr [Option<(String, Option)>] descr +pub(crate) type Extension = Option<(String, Option)>; + /// The set of supported logical types. /// Each variant uniquely identifies a logical type, which define specific semantics to the data (e.g. how it should be represented). /// Each variant has a corresponding [`PhysicalType`], obtained via [`DataType::to_physical_type`], @@ -185,17 +195,12 @@ pub enum TimeUnit { /// Interval units defined in Arrow #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum IntervalUnit { - /// Indicates the number of elapsed whole months, stored as 4-byte integers. + /// The number of elapsed whole months. YearMonth, - /// Indicates the number of elapsed days and milliseconds, - /// stored as 2 contiguous 32-bit integers (8-bytes in total). + /// The number of elapsed days and milliseconds, + /// stored as 2 contiguous `i32` DayTime, - /// The values are stored contiguously in 16 byte blocks. Months and - /// days are encoded as 32 bit integers and nanoseconds is encoded as a - /// 64 bit integer. All integers are signed. Each field is independent - /// (e.g. there is no constraint that nanoseconds have the same sign - /// as days or that the quantitiy of nanoseconds represents less - /// then a day's worth of time). + /// The number of elapsed months (i32), days (i32) and nanoseconds (i64). MonthDayNano, } @@ -313,27 +318,14 @@ impl From for DataType { } } -// backward compatibility -use std::collections::BTreeMap; -use std::sync::Arc; - /// typedef for [`Arc`]. pub type SchemaRef = Arc; -/// typedef for [Option>]. -pub type Metadata = Option>; -/// typedef fpr [Option<(String, Option)>]. -pub type Extension = Option<(String, Option)>; - /// support get extension for metadata -pub fn get_extension(metadata: &Option>) -> Extension { - if let Some(metadata) = metadata { - if let Some(name) = metadata.get("ARROW:extension:name") { - let metadata = metadata.get("ARROW:extension:metadata").cloned(); - Some((name.clone(), metadata)) - } else { - None - } +pub fn get_extension(metadata: &Metadata) -> Extension { + if let Some(name) = metadata.get("ARROW:extension:name") { + let metadata = metadata.get("ARROW:extension:metadata").cloned(); + Some((name.clone(), metadata)) } else { None } diff --git a/src/ffi/schema.rs b/src/ffi/schema.rs index a02b61b33f2..8aa2313fe80 100644 --- a/src/ffi/schema.rs +++ b/src/ffi/schema.rs @@ -105,7 +105,7 @@ impl Ffi_ArrowSchema { let metadata = if let DataType::Extension(name, _, extension_metadata) = field.data_type() { // append extension information. - let mut metadata = metadata.clone().unwrap_or_default(); + let mut metadata = metadata.clone(); // metadata if let Some(extension_metadata) = extension_metadata { @@ -118,8 +118,10 @@ impl Ffi_ArrowSchema { metadata.insert("ARROW:extension:name".to_string(), name.clone()); Some(metadata_to_bytes(&metadata)) + } else if !metadata.is_empty() { + Some(metadata_to_bytes(metadata)) } else { - metadata.as_ref().map(metadata_to_bytes) + None }; let name = CString::new(name).unwrap(); @@ -227,9 +229,7 @@ pub(crate) unsafe fn to_field(schema: &Ffi_ArrowSchema) -> Result { data_type }; - let mut field = Field::new(schema.name(), data_type, schema.nullable()); - field.set_metadata(metadata); - Ok(field) + Ok(Field::new(schema.name(), data_type, schema.nullable()).with_metadata(metadata)) } fn to_integer_type(format: &str) -> Result { @@ -494,7 +494,7 @@ unsafe fn read_bytes(ptr: *const u8, len: usize) -> &'static str { unsafe fn metadata_from_bytes(data: *const ::std::os::raw::c_char) -> (Metadata, Extension) { let mut data = data as *const u8; // u8 = i8 if data.is_null() { - return (None, None); + return (Metadata::default(), None); }; let len = read_ne_i32(data); data = data.add(4); @@ -524,5 +524,5 @@ unsafe fn metadata_from_bytes(data: *const ::std::os::raw::c_char) -> (Metadata, }; } let extension = extension_name.map(|name| (name, extension_metadata)); - (Some(result), extension) + (result, extension) } diff --git a/src/io/avro/read/schema.rs b/src/io/avro/read/schema.rs index 38d2b0e6215..6aac5c3df9c 100644 --- a/src/io/avro/read/schema.rs +++ b/src/io/avro/read/schema.rs @@ -1,5 +1,3 @@ -use std::collections::BTreeMap; - use avro_schema::{Enum, Fixed, Record, Schema as AvroSchema}; use crate::datatypes::*; @@ -19,8 +17,8 @@ fn aliased(name: &str, namespace: Option<&str>, default_namespace: Option<&str>) } } -fn external_props(schema: &AvroSchema) -> BTreeMap { - let mut props = BTreeMap::new(); +fn external_props(schema: &AvroSchema) -> Metadata { + let mut props = Metadata::new(); match &schema { AvroSchema::Record(Record { doc: Some(ref doc), .. @@ -66,7 +64,7 @@ pub fn convert_schema(schema: &AvroSchema) -> Result { &field.schema, Some(&field.name), false, - Some(external_props(&field.schema)), + external_props(&field.schema), )?) } } @@ -84,7 +82,7 @@ fn schema_to_field( schema: &AvroSchema, name: Option<&str>, mut nullable: bool, - props: Option>, + props: Metadata, ) -> Result { let data_type = match schema { AvroSchema::Null => DataType::Null, @@ -129,7 +127,7 @@ fn schema_to_field( item_schema, Some("item"), // default name for list items false, - None, + Metadata::default(), )?)), AvroSchema::Map(_) => todo!("Avro maps are mapped to MapArrays"), AvroSchema::Union(schemas) => { @@ -141,7 +139,7 @@ fn schema_to_field( .iter() .find(|&schema| !matches!(schema, AvroSchema::Null)) { - schema_to_field(schema, None, has_nullable, None)? + schema_to_field(schema, None, has_nullable, Metadata::default())? .data_type() .clone() } else { @@ -153,7 +151,7 @@ fn schema_to_field( } else { let fields = schemas .iter() - .map(|s| schema_to_field(s, None, has_nullable, None)) + .map(|s| schema_to_field(s, None, has_nullable, Metadata::default())) .collect::>>()?; DataType::Union(fields, None, UnionMode::Dense) } @@ -162,7 +160,7 @@ fn schema_to_field( let fields: Result> = fields .iter() .map(|field| { - let mut props = BTreeMap::new(); + let mut props = Metadata::new(); if let Some(doc) = &field.doc { props.insert("avro::doc".to_string(), doc.clone()); } @@ -173,7 +171,7 @@ fn schema_to_field( &field.schema, Some(&format!("{}.{}", name, field.name)), false, - Some(props), + props, ) }) .collect(); @@ -201,7 +199,5 @@ fn schema_to_field( let name = name.unwrap_or_default(); - let mut field = Field::new(name, data_type, nullable); - field.set_metadata(props); - Ok(field) + Ok(Field::new(name, data_type, nullable).with_metadata(props)) } diff --git a/src/io/ipc/read/schema.rs b/src/io/ipc/read/schema.rs index 23e7b25767d..145a01a85a6 100644 --- a/src/io/ipc/read/schema.rs +++ b/src/io/ipc/read/schema.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; mod ipc { pub use arrow_format::ipc::File::*; @@ -32,19 +32,15 @@ fn deserialize_field(ipc_field: ipc::Field) -> (Field, IpcField) { fn read_metadata(field: &ipc::Field) -> Metadata { if let Some(list) = field.custom_metadata() { - if !list.is_empty() { - let mut metadata_map = BTreeMap::default(); - for kv in list { - if let (Some(k), Some(v)) = (kv.key(), kv.value()) { - metadata_map.insert(k.to_string(), v.to_string()); - } + let mut metadata_map = Metadata::new(); + for kv in list { + if let (Some(k), Some(v)) = (kv.key(), kv.value()) { + metadata_map.insert(k.to_string(), v.to_string()); } - Some(metadata_map) - } else { - None } + metadata_map } else { - None + Metadata::default() } } diff --git a/src/io/ipc/write/schema.rs b/src/io/ipc/write/schema.rs index 2d7950c9534..5cb04a6486a 100644 --- a/src/io/ipc/write/schema.rs +++ b/src/io/ipc/write/schema.rs @@ -1,14 +1,13 @@ use arrow_format::ipc::flatbuffers::{ FlatBufferBuilder, ForwardsUOffset, UnionWIPOffset, Vector, WIPOffset, }; -use std::collections::BTreeMap; mod ipc { pub use arrow_format::ipc::File::*; pub use arrow_format::ipc::Message::*; pub use arrow_format::ipc::Schema::*; } -use crate::datatypes::{DataType, Field, IntegerType, IntervalUnit, Schema, TimeUnit}; +use crate::datatypes::{DataType, Field, IntegerType, IntervalUnit, Metadata, Schema, TimeUnit}; use crate::io::ipc::endianess::is_native_little_endian; use super::super::IpcField; @@ -78,7 +77,7 @@ pub(crate) struct FbFieldType<'b> { fn write_metadata<'a>( fbb: &mut FlatBufferBuilder<'a>, - metadata: &BTreeMap, + metadata: &Metadata, kv_vec: &mut Vec>>, ) { for (k, v) in metadata { @@ -147,11 +146,8 @@ pub(crate) fn build_field<'a>( None }; - if let Some(metadata) = field.metadata() { - if !metadata.is_empty() { - write_metadata(fbb, metadata, &mut kv_vec); - } - }; + write_metadata(fbb, field.metadata(), &mut kv_vec); + let fb_metadata = if !kv_vec.is_empty() { Some(fbb.create_vector(&kv_vec)) } else { diff --git a/src/io/json_integration/read/schema.rs b/src/io/json_integration/read/schema.rs index 06cb0f1fd47..997bdf6c57d 100644 --- a/src/io/json_integration/read/schema.rs +++ b/src/io/json_integration/read/schema.rs @@ -1,16 +1,16 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use serde_derive::Deserialize; use serde_json::Value; use crate::{ - datatypes::UnionMode, error::{ArrowError, Result}, io::ipc::IpcField, }; use crate::datatypes::{ - get_extension, DataType, Field, IntegerType, IntervalUnit, Schema, TimeUnit, + get_extension, DataType, Field, IntegerType, IntervalUnit, Metadata, Schema, TimeUnit, + UnionMode, }; fn to_time_unit(item: Option<&Value>) -> Result { @@ -88,10 +88,10 @@ fn deserialize_fields(children: Option<&Value>) -> Result> { .unwrap_or_else(|| Ok(vec![])) } -fn read_metadata(metadata: &Value) -> Result> { +fn read_metadata(metadata: &Value) -> Result { match metadata { Value::Array(ref values) => { - let mut res: BTreeMap = BTreeMap::new(); + let mut res = Metadata::new(); for value in values { match value.as_object() { Some(map) => { @@ -126,7 +126,7 @@ fn read_metadata(metadata: &Value) -> Result> { Ok(res) } Value::Object(ref values) => { - let mut res: BTreeMap = BTreeMap::new(); + let mut res = Metadata::new(); for (k, v) in values { if let Some(str_value) = v.as_str() { res.insert(k.clone(), str_value.to_string().clone()); @@ -363,9 +363,9 @@ fn deserialize_field(value: &Value) -> Result { }; let metadata = if let Some(metadata) = map.get("metadata") { - Some(read_metadata(metadata)?) + read_metadata(metadata)? } else { - None + Metadata::default() }; let extension = get_extension(&metadata); diff --git a/src/io/json_integration/write/schema.rs b/src/io/json_integration/write/schema.rs index a9a73d86100..2965dfd6203 100644 --- a/src/io/json_integration/write/schema.rs +++ b/src/io/json_integration/write/schema.rs @@ -1,6 +1,6 @@ use serde_json::{json, Map, Value}; -use crate::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; +use crate::datatypes::{DataType, Field, IntervalUnit, Metadata, Schema, TimeUnit}; use crate::io::ipc::IpcField; use crate::io::json_integration::ArrowJsonSchema; @@ -108,7 +108,7 @@ fn serialize_field(field: &Field, ipc_field: &IpcField) -> ArrowJsonField { } _ => vec![], }; - let metadata = serialize_metadata(field); + let metadata = serialize_metadata(field.metadata()); let dictionary = if let DataType::Dictionary(key_type, _, is_ordered) = field.data_type() { use crate::datatypes::IntegerType::*; @@ -156,18 +156,19 @@ pub fn serialize_schema(schema: &Schema, ipc_fields: &[IpcField]) -> ArrowJsonSc } } -fn serialize_metadata(field: &Field) -> Option { - field.metadata().as_ref().and_then(|kv_list| { - let mut array = Vec::new(); - for (k, v) in kv_list { +fn serialize_metadata(metadata: &Metadata) -> Option { + let array = metadata + .iter() + .map(|(k, v)| { let mut kv_map = Map::new(); kv_map.insert(k.clone(), Value::String(v.clone())); - array.push(Value::Object(kv_map)); - } - if !array.is_empty() { - Some(Value::Array(array)) - } else { - None - } - }) + Value::Object(kv_map) + }) + .collect::>(); + + if !array.is_empty() { + Some(Value::Array(array)) + } else { + None + } }