diff --git a/Cargo.toml b/Cargo.toml
index 4bc21d59487..b9d6749aca8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,7 +64,7 @@ base64 = { version = "0.13.0", optional = true }
 futures = { version = "0.3", optional = true }
 
 # for faster hashing
-ahash = { version = "0.7", optional = true }
+ahash = { version = "0.7" }
 
 # parquet support
 parquet2 = { version = "0.14.0", optional = true, default_features = false }
@@ -175,7 +175,7 @@ compute_comparison = ["compute_take", "compute_boolean"]
 compute_concatenate = []
 compute_contains = []
 compute_filter = []
-compute_hash = ["multiversion", "ahash"]
+compute_hash = ["multiversion"]
 compute_if_then_else = []
 compute_length = []
 compute_like = ["regex"]
diff --git a/arrow-parquet-integration-testing/src/main.rs b/arrow-parquet-integration-testing/src/main.rs
index f62fba5e099..d79da943187 100644
--- a/arrow-parquet-integration-testing/src/main.rs
+++ b/arrow-parquet-integration-testing/src/main.rs
@@ -1,9 +1,10 @@
 use std::fs::File;
-use std::{collections::HashMap, io::Read};
+use std::{io::Read};
 
 use arrow2::array::Array;
 use arrow2::io::ipc::IpcField;
 use arrow2::{
+    AHashMap,
     chunk::Chunk,
     datatypes::{DataType, Schema},
     error::Result,
@@ -40,7 +41,7 @@ pub fn read_gzip_json(
     let (schema, ipc_fields) = read::deserialize_schema(&schema)?;
 
     // read dictionaries
-    let mut dictionaries = HashMap::new();
+    let mut dictionaries = AHashMap::new();
     if let Some(dicts) = arrow_json.dictionaries {
         for json_dict in dicts {
             // TODO: convert to a concrete Arrow type
diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs
index 0a3b778e38c..e36ad7eef03 100644
--- a/integration-testing/src/lib.rs
+++ b/integration-testing/src/lib.rs
@@ -22,11 +22,11 @@ use arrow2::io::ipc::IpcField;
 use serde_json::Value;
 
 use arrow2::chunk::Chunk;
+use arrow2::AHashMap;
 use arrow2::datatypes::*;
 use arrow2::error::Result;
 use arrow2::io::json_integration::{read, ArrowJsonBatch, ArrowJsonDictionaryBatch};
 
-use std::collections::HashMap;
 use std::fs::File;
 use std::io::BufReader;
 
@@ -43,7 +43,7 @@ pub struct ArrowFile {
     pub fields: Vec<IpcField>,
     // we can evolve this into a concrete Arrow type
     // this is temporarily not being read from
-    pub _dictionaries: HashMap<i64, ArrowJsonDictionaryBatch>,
+    pub _dictionaries: AHashMap<i64, ArrowJsonDictionaryBatch>,
     pub chunks: Vec<Chunk<Box<dyn Array>>>,
 }
@@ -54,7 +54,7 @@ pub fn read_json_file(json_name: &str) -> Result<ArrowFile> {
     let (schema, fields) = read::deserialize_schema(&arrow_json["schema"])?;
 
     // read dictionaries
-    let mut dictionaries = HashMap::new();
+    let mut dictionaries = AHashMap::new();
     if let Some(dicts) = arrow_json.get("dictionaries") {
         for d in dicts
             .as_array()
diff --git a/src/array/union/mod.rs b/src/array/union/mod.rs
index 6d605da064a..3bda5915688 100644
--- a/src/array/union/mod.rs
+++ b/src/array/union/mod.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use ahash::AHashMap;
 
 use crate::{
     bitmap::Bitmap,
@@ -31,7 +31,7 @@ type UnionComponents<'a> = (&'a [Field], Option<&'a [i32]>, UnionMode);
 pub struct UnionArray {
     types: Buffer<i8>,
     // None represents when there is no typeid
-    fields_hash: Option<HashMap<i8, (i32, usize)>>,
+    fields_hash: Option<AHashMap<i8, (i32, usize)>>,
     fields: Vec<Box<dyn Array>>,
     offsets: Option<Buffer<i32>>,
     data_type: DataType,
diff --git a/src/compute/like.rs b/src/compute/like.rs
index 4509617398f..cc7517e823f 100644
--- a/src/compute/like.rs
+++ b/src/compute/like.rs
@@ -1,6 +1,6 @@
 //! Contains "like" operators such as [`like_utf8`] and [`like_utf8_scalar`].
-use std::collections::HashMap;
+use ahash::AHashMap;
 
 use regex::bytes::Regex as BytesRegex;
 use regex::Regex;
@@ -35,7 +35,7 @@ fn a_like_utf8<O: Offset, F: Fn(bool) -> bool>(
 
     let validity = combine_validities(lhs.validity(), rhs.validity());
 
-    let mut map = HashMap::new();
+    let mut map = AHashMap::new();
 
     let values = Bitmap::try_from_trusted_len_iter(lhs.iter().zip(rhs.iter()).map(|(lhs, rhs)| {
@@ -179,7 +179,7 @@ fn a_like_binary<O: Offset, F: Fn(bool) -> bool>(
 
     let validity = combine_validities(lhs.validity(), rhs.validity());
 
-    let mut map = HashMap::new();
+    let mut map = AHashMap::new();
 
     let values = Bitmap::try_from_trusted_len_iter(lhs.iter().zip(rhs.iter()).map(|(lhs, rhs)| {
diff --git a/src/compute/merge_sort/mod.rs b/src/compute/merge_sort/mod.rs
index 9c48a29dd00..2776d58266e 100644
--- a/src/compute/merge_sort/mod.rs
+++ b/src/compute/merge_sort/mod.rs
@@ -56,8 +56,9 @@
 //! To serialize slices, e.g. for checkpointing or transfer via Arrow's IPC, you can store
 //! them as 3 non-null primitive arrays (e.g. `PrimitiveArray<i64>`).
 
+use ahash::AHashMap;
+use std::cmp::Ordering;
 use std::iter::once;
-use std::{cmp::Ordering, collections::HashMap};
 
 use itertools::Itertools;
@@ -498,7 +499,7 @@ pub fn build_comparator_impl<'a>(
             .collect::<Result<Vec<_>>>()?;
             Ok(((lhs_index, rhs_index), multi_column_comparator))
         })
-        .collect::<Result<HashMap<(usize, usize), Vec<_>>>>()?;
+        .collect::<Result<AHashMap<(usize, usize), Vec<_>>>>()?;
 
     // prepare a comparison function taking into account _nulls_ and sort options
     let cmp = move |left_index, left_row, right_index, right_row| {
diff --git a/src/compute/regex_match.rs b/src/compute/regex_match.rs
index 001bd0f5d28..41cacb293bf 100644
--- a/src/compute/regex_match.rs
+++ b/src/compute/regex_match.rs
@@ -1,7 +1,6 @@
 //! Contains regex matching operators [`regex_match`] and [`regex_match_scalar`].
-use std::collections::HashMap;
-
+use ahash::AHashMap;
 use regex::Regex;
 
 use super::utils::combine_validities;
@@ -18,7 +17,7 @@ pub fn regex_match<O: Offset>(values: &Utf8Array<O>, regex: &Utf8Array<O>) -> Re
         ));
     }
 
-    let mut map = HashMap::new();
+    let mut map = AHashMap::new();
     let validity = combine_validities(values.validity(), regex.validity());
 
     let iterator = values.iter().zip(regex.iter()).map(|(haystack, regex)| {
diff --git a/src/io/avro/mod.rs b/src/io/avro/mod.rs
index 852915d9006..5addae95e4b 100644
--- a/src/io/avro/mod.rs
+++ b/src/io/avro/mod.rs
@@ -48,7 +48,7 @@ macro_rules! avro_decode {
 macro_rules! read_header {
     ($reader:ident $($_await:tt)*) => {{
-        let mut items = HashMap::new();
+        let mut items = ahash::AHashMap::new();
 
         loop {
             let len = zigzag_i64($reader)$($_await)*? as usize;
diff --git a/src/io/avro/read/header.rs b/src/io/avro/read/header.rs
index 66bec96cc05..c985ce25084 100644
--- a/src/io/avro/read/header.rs
+++ b/src/io/avro/read/header.rs
@@ -1,5 +1,4 @@
-use std::collections::HashMap;
-
+use ahash::AHashMap;
 use avro_schema::Schema;
 use serde_json;
 
@@ -9,7 +8,7 @@ use super::Compression;
 
 /// Deserializes the Avro header into an Avro [`Schema`] and optional [`Compression`].
 pub(crate) fn deserialize_header(
-    header: HashMap<String, Vec<u8>>,
+    header: AHashMap<String, Vec<u8>>,
 ) -> Result<(Schema, Option<Compression>)> {
     let schema = header
         .get("avro.schema")
diff --git a/src/io/avro/read/util.rs b/src/io/avro/read/util.rs
index 92cbe3db2b6..b14870a3905 100644
--- a/src/io/avro/read/util.rs
+++ b/src/io/avro/read/util.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use ahash::AHashMap;
 use std::io::Read;
 
 use avro_schema::Schema;
@@ -29,7 +29,7 @@ fn _read_binary<R: Read>(reader: &mut R) -> Result<Vec<u8>> {
     Ok(buf)
 }
 
-fn read_header<R: Read>(reader: &mut R) -> Result<HashMap<String, Vec<u8>>> {
+fn read_header<R: Read>(reader: &mut R) -> Result<AHashMap<String, Vec<u8>>> {
     read_header!(reader)
 }
diff --git a/src/io/avro/read_async/metadata.rs b/src/io/avro/read_async/metadata.rs
index 4609509376f..213428e0f0b 100644
--- a/src/io/avro/read_async/metadata.rs
+++ b/src/io/avro/read_async/metadata.rs
@@ -1,6 +1,5 @@
 //! Async Avro
-use std::collections::HashMap;
-
+use ahash::AHashMap;
 use avro_schema::{Record, Schema as AvroSchema};
 use futures::AsyncRead;
 use futures::AsyncReadExt;
@@ -56,6 +55,6 @@ async fn _read_binary<R: AsyncRead + Unpin + Send>(reader: &mut R) -> Result<Ve
 
 async fn read_header<R: AsyncRead + Unpin + Send>(
     reader: &mut R,
-) -> Result<HashMap<String, Vec<u8>>> {
+) -> Result<AHashMap<String, Vec<u8>>> {
     read_header!(reader.await)
 }
diff --git a/src/io/avro/write/header.rs b/src/io/avro/write/header.rs
index 1d25f2f596a..370a0532b5a 100644
--- a/src/io/avro/write/header.rs
+++ b/src/io/avro/write/header.rs
@@ -1,5 +1,4 @@
-use std::collections::HashMap;
-
+use ahash::AHashMap;
 use avro_schema::Schema;
 use serde_json;
 
@@ -11,10 +10,10 @@ use super::Compression;
 pub(crate) fn serialize_header(
     schema: &Schema,
     compression: Option<Compression>,
-) -> Result<HashMap<String, Vec<u8>>> {
+) -> Result<AHashMap<String, Vec<u8>>> {
     let schema = serde_json::to_string(schema).map_err(|e| Error::ExternalFormat(e.to_string()))?;
 
-    let mut header = HashMap::<String, Vec<u8>>::default();
+    let mut header = AHashMap::<String, Vec<u8>>::default();
 
     header.insert("avro.schema".to_string(), schema.into_bytes());
     if let Some(compression) = compression {
diff --git a/src/io/csv/read/infer_schema.rs b/src/io/csv/read/infer_schema.rs
index 83462bc1e8d..4aa3eaf53ea 100644
--- a/src/io/csv/read/infer_schema.rs
+++ b/src/io/csv/read/infer_schema.rs
@@ -1,7 +1,5 @@
-use std::{
-    collections::HashSet,
-    io::{Read, Seek},
-};
+use ahash::AHashSet;
+use std::io::{Read, Seek};
 
 use crate::datatypes::{DataType, Field};
 use crate::error::Result;
@@ -34,7 +32,7 @@ pub fn infer_schema<R: Read + Seek, F: Fn(&[u8]) -> DataType>(
     let header_length = headers.len();
 
     // keep track of inferred field types
-    let mut column_types: Vec<HashSet<DataType>> = vec![HashSet::new(); header_length];
+    let mut column_types: Vec<AHashSet<DataType>> = vec![AHashSet::new(); header_length];
 
     let mut records_count = 0;
diff --git a/src/io/csv/read_async/infer_schema.rs b/src/io/csv/read_async/infer_schema.rs
index b22f2a613d3..2a2a8061595 100644
--- a/src/io/csv/read_async/infer_schema.rs
+++ b/src/io/csv/read_async/infer_schema.rs
@@ -1,6 +1,5 @@
-use std::collections::HashSet;
-
 use super::{AsyncReader, ByteRecord};
+use ahash::AHashSet;
 
 use crate::datatypes::{DataType, Field};
 use crate::error::Result;
@@ -41,7 +40,7 @@ where
     let header_length = headers.len();
 
     // keep track of inferred field types
-    let mut column_types: Vec<HashSet<DataType>> = vec![HashSet::new(); header_length];
+    let mut column_types: Vec<AHashSet<DataType>> = vec![AHashSet::new(); header_length];
 
     let mut records_count = 0;
diff --git a/src/io/csv/utils.rs b/src/io/csv/utils.rs
index 266801d940a..436c7d192f3 100644
--- a/src/io/csv/utils.rs
+++ b/src/io/csv/utils.rs
@@ -1,6 +1,5 @@
-use std::collections::HashSet;
-
 use crate::datatypes::{DataType, Field, TimeUnit};
+use ahash::AHashSet;
 
 pub(super) const RFC3339: &str = "%Y-%m-%dT%H:%M:%S%.f%:z";
@@ -78,7 +77,7 @@ pub fn infer(bytes: &[u8]) -> DataType {
     }
 }
 
-fn merge_fields(field_name: &str, possibilities: &mut HashSet<DataType>) -> Field {
+fn merge_fields(field_name: &str, possibilities: &mut AHashSet<DataType>) -> Field {
     // determine data type based on possible types
     // if there are incompatible types, use DataType::Utf8
     let data_type = match possibilities.len() {
@@ -101,7 +100,7 @@ fn merge_fields(field_name: &str, possibilities: &mut HashSet<DataType>) -> Fiel
 
 pub(crate) fn merge_schema(
     headers: &[String],
-    column_types: &mut [HashSet<DataType>],
+    column_types: &mut [AHashSet<DataType>],
 ) -> Vec<Field> {
     headers
         .iter()
diff --git a/src/io/ipc/read/common.rs b/src/io/ipc/read/common.rs
index 7a9434528f5..5638a5289e6 100644
--- a/src/io/ipc/read/common.rs
+++ b/src/io/ipc/read/common.rs
@@ -1,4 +1,5 @@
-use std::collections::{HashMap, VecDeque};
+use ahash::AHashMap;
+use std::collections::VecDeque;
 use std::io::{Read, Seek};
 
 use arrow_format;
@@ -322,14 +323,14 @@ mod tests {
 pub fn prepare_projection(
     fields: &[Field],
     mut projection: Vec<usize>,
-) -> (Vec<Field>, HashMap<usize, usize>, Vec<usize>) {
+) -> (Vec<Field>, AHashMap<usize, usize>, Vec<usize>) {
     let fields = projection.iter().map(|x| fields[*x].clone()).collect();
 
     // todo: find way to do this more efficiently
     let mut indices = (0..projection.len()).collect::<Vec<_>>();
     indices.sort_unstable_by_key(|&i| &projection[i]);
 
     let map = indices.iter().copied().enumerate().fold(
-        HashMap::default(),
+        AHashMap::default(),
         |mut acc, (index, new_index)| {
             acc.insert(index, new_index);
             acc
@@ -355,7 +356,7 @@ pub fn prepare_projection(
 
 pub fn apply_projection(
     chunk: Chunk<Box<dyn Array>>,
-    map: &HashMap<usize, usize>,
+    map: &AHashMap<usize, usize>,
 ) -> Chunk<Box<dyn Array>> {
     // re-order according to projection
     let arrays = chunk.into_arrays();
diff --git a/src/io/ipc/read/file_async.rs b/src/io/ipc/read/file_async.rs
index 0d80b80ac67..4b36a44026e 100644
--- a/src/io/ipc/read/file_async.rs
+++ b/src/io/ipc/read/file_async.rs
@@ -1,5 +1,5 @@
 //! Async reader for Arrow IPC files
-use std::collections::HashMap;
+use ahash::AHashMap;
 use std::io::SeekFrom;
 
 use arrow_format::ipc::{planus::ReadAsRoot, Block, MessageHeaderRef};
@@ -73,7 +73,7 @@ impl<'a> FileStream<'a> {
         mut reader: R,
         mut dictionaries: Option<Dictionaries>,
         metadata: FileMetadata,
-        projection: Option<(Vec<usize>, HashMap<usize, usize>)>,
+        projection: Option<(Vec<usize>, AHashMap<usize, usize>)>,
         limit: Option<usize>,
     ) -> BoxStream<'a, Result<Chunk<Box<dyn Array>>>>
     where
diff --git a/src/io/ipc/read/mod.rs b/src/io/ipc/read/mod.rs
index 5ffe6426e20..f623e0aef56 100644
--- a/src/io/ipc/read/mod.rs
+++ b/src/io/ipc/read/mod.rs
@@ -4,7 +4,7 @@
 //! which provides arbitrary access to any of its messages, and the
 //! [`StreamReader`](stream::StreamReader), which only supports reading
 //! data in the order it was written in.
-use std::collections::HashMap;
+use ahash::AHashMap;
 
 use crate::array::Array;
 
@@ -35,7 +35,7 @@ pub use schema::deserialize_schema;
 pub use stream::{read_stream_metadata, StreamMetadata, StreamReader, StreamState};
 
 /// how dictionaries are tracked in this crate
-pub type Dictionaries = HashMap<i64, Box<dyn Array>>;
+pub type Dictionaries = AHashMap<i64, Box<dyn Array>>;
 
 pub(crate) type Node<'a> = arrow_format::ipc::FieldNodeRef<'a>;
 pub(crate) type IpcBuffer<'a> = arrow_format::ipc::BufferRef<'a>;
diff --git a/src/io/ipc/read/reader.rs b/src/io/ipc/read/reader.rs
index b18ca4ceade..ab72adec96e 100644
--- a/src/io/ipc/read/reader.rs
+++ b/src/io/ipc/read/reader.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use ahash::AHashMap;
 use std::convert::TryInto;
 use std::io::{Read, Seek, SeekFrom};
 
@@ -119,7 +119,7 @@ pub fn read_file_dictionaries<R: Read + Seek>(
     let blocks = if let Some(blocks) = metadata.dictionaries.as_deref() {
         blocks
     } else {
-        return Ok(HashMap::new());
+        return Ok(AHashMap::new());
     };
     // use a temporary smaller scratch for the messages
     let mut message_scratch = Default::default();
@@ -326,7 +326,7 @@ pub struct FileReader<R: Read + Seek> {
     // the dictionaries are going to be read
     dictionaries: Option<Dictionaries>,
     current_block: usize,
-    projection: Option<(Vec<usize>, HashMap<usize, usize>, Schema)>,
+    projection: Option<(Vec<usize>, AHashMap<usize, usize>, Schema)>,
     remaining: usize,
     data_scratch: Vec<u8>,
     message_scratch: Vec<u8>,
diff --git a/src/io/ipc/read/stream.rs b/src/io/ipc/read/stream.rs
index b3f3a986d68..cea6e08825f 100644
--- a/src/io/ipc/read/stream.rs
+++ b/src/io/ipc/read/stream.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use ahash::AHashMap;
 use std::io::Read;
 
 use arrow_format;
@@ -96,7 +96,7 @@ fn read_next<R: Read>(
     dictionaries: &mut Dictionaries,
     message_buffer: &mut Vec<u8>,
     data_buffer: &mut Vec<u8>,
-    projection: &Option<(Vec<usize>, HashMap<usize, usize>, Schema)>,
+    projection: &Option<(Vec<usize>, AHashMap<usize, usize>, Schema)>,
     scratch: &mut Vec<u8>,
 ) -> Result<Option<StreamState>> {
     // determine metadata length
@@ -241,7 +241,7 @@ pub struct StreamReader<R: Read> {
     finished: bool,
     data_buffer: Vec<u8>,
     message_buffer: Vec<u8>,
-    projection: Option<(Vec<usize>, HashMap<usize, usize>, Schema)>,
+    projection: Option<(Vec<usize>, AHashMap<usize, usize>, Schema)>,
     scratch: Vec<u8>,
 }
diff --git a/src/io/json_integration/read/array.rs b/src/io/json_integration/read/array.rs
index 2d7d0ec39a6..0c44cc2432b 100644
--- a/src/io/json_integration/read/array.rs
+++ b/src/io/json_integration/read/array.rs
@@ -1,5 +1,4 @@
-use std::collections::HashMap;
-
+use ahash::AHashMap;
 use num_traits::NumCast;
 use serde_json::Value;
 
@@ -192,7 +191,7 @@ fn to_list<O: Offset>(
     json_col: &ArrowJsonColumn,
     data_type: DataType,
     field: &IpcField,
-    dictionaries: &HashMap<i64, ArrowJsonDictionaryBatch>,
+    dictionaries: &AHashMap<i64, ArrowJsonDictionaryBatch>,
 ) -> Result<Box<dyn Array>> {
     let validity = to_validity(&json_col.validity);
 
@@ -214,7 +213,7 @@ fn to_map(
     json_col: &ArrowJsonColumn,
     data_type: DataType,
     field: &IpcField,
-    dictionaries: &HashMap<i64, ArrowJsonDictionaryBatch>,
+    dictionaries: &AHashMap<i64, ArrowJsonDictionaryBatch>,
 ) -> Result<Box<dyn Array>> {
     let validity = to_validity(&json_col.validity);
 
@@ -234,7 +233,7 @@ fn to_dictionary<K: DictionaryKey>(
     data_type: DataType,
     field: &IpcField,
     json_col: &ArrowJsonColumn,
-    dictionaries: &HashMap<i64, ArrowJsonDictionaryBatch>,
+    dictionaries: &AHashMap<i64, ArrowJsonDictionaryBatch>,
 ) -> Result<Box<dyn Array>> {
     // find dictionary
     let dict_id = field.dictionary_id.unwrap();
@@ -260,7 +259,7 @@ pub fn to_array(
     data_type: DataType,
     field: &IpcField,
     json_col: &ArrowJsonColumn,
-    dictionaries: &HashMap<i64, ArrowJsonDictionaryBatch>,
+    dictionaries: &AHashMap<i64, ArrowJsonDictionaryBatch>,
 ) -> Result<Box<dyn Array>> {
     use PhysicalType::*;
     match data_type.to_physical_type() {
@@ -409,7 +408,7 @@ pub fn deserialize_chunk(
     schema: &Schema,
     ipc_fields: &[IpcField],
     json_batch: &ArrowJsonBatch,
-    json_dictionaries: &HashMap<i64, ArrowJsonDictionaryBatch>,
+    json_dictionaries: &AHashMap<i64, ArrowJsonDictionaryBatch>,
 ) -> Result<Chunk<Box<dyn Array>>> {
     let arrays = schema
         .fields
diff --git a/src/io/parquet/write/sink.rs b/src/io/parquet/write/sink.rs
index 160ac81d7fe..9ba2d4aea90 100644
--- a/src/io/parquet/write/sink.rs
+++ b/src/io/parquet/write/sink.rs
@@ -1,4 +1,5 @@
-use std::{collections::HashMap, pin::Pin, task::Poll};
+use ahash::AHashMap;
+use std::{pin::Pin, task::Poll};
 
 use futures::{future::BoxFuture, AsyncWrite, FutureExt, Sink, TryFutureExt};
 use parquet2::metadata::KeyValue;
@@ -62,7 +63,7 @@ pub struct FileSink<'a, W: AsyncWrite + Send + Unpin> {
     schema: Schema,
     parquet_schema: SchemaDescriptor,
     /// Key-value metadata that will be written to the file on close.
-    pub metadata: HashMap<String, Option<String>>,
+    pub metadata: AHashMap<String, Option<String>>,
 }
 
 impl<'a, W> FileSink<'a, W>
@@ -105,7 +106,7 @@ where
             schema,
             encodings,
             parquet_schema,
-            metadata: HashMap::default(),
+            metadata: AHashMap::default(),
         })
     }
diff --git a/src/lib.rs b/src/lib.rs
index 03963841b54..01fe444d5cf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -37,3 +37,6 @@ mod docs;
 
 // re-exported because we return `Either` in our public API
 pub use either::Either;
+
+// re-exported to construct dictionaries
+pub use ahash::AHashMap;
diff --git a/tests/it/io/ipc/common.rs b/tests/it/io/ipc/common.rs
index deaf44ec6ab..6254c8f5bbd 100644
--- a/tests/it/io/ipc/common.rs
+++ b/tests/it/io/ipc/common.rs
@@ -1,4 +1,5 @@
-use std::{collections::HashMap, fs::File, io::Read};
+use ahash::AHashMap;
+use std::{fs::File, io::Read};
 
 use arrow2::{
     array::Array, chunk::Chunk, datatypes::Schema, error::Result,
@@ -29,7 +30,7 @@ pub fn read_gzip_json(version: &str, file_name: &str) -> Result {
     let (schema, ipc_fields) = read::deserialize_schema(&schema)?;
 
     // read dictionaries
-    let mut dictionaries = HashMap::new();
+    let mut dictionaries = AHashMap::new();
     if let Some(dicts) = arrow_json.dictionaries {
         for json_dict in dicts {
             // TODO: convert to a concrete Arrow type
diff --git a/tests/it/io/parquet/write_async.rs b/tests/it/io/parquet/write_async.rs
index 89d76efea92..5644caf67f5 100644
--- a/tests/it/io/parquet/write_async.rs
+++ b/tests/it/io/parquet/write_async.rs
@@ -1,5 +1,4 @@
-use std::collections::HashMap;
-
+use ahash::AHashMap;
 use arrow2::{
     array::{Float32Array, Int32Array},
     chunk::Chunk,
@@ -46,7 +45,7 @@ async fn test_parquet_async_roundtrip() {
     buffer.set_position(0);
     let metadata = read_metadata_async(&mut buffer).await.unwrap();
 
-    let kv = HashMap::<String, Option<String>>::from_iter(
+    let kv = AHashMap::<String, Option<String>>::from_iter(
         metadata
             .key_value_metadata()
             .to_owned()
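
Not part of the diff above: a minimal usage sketch of the new `pub use ahash::AHashMap;` re-export in `src/lib.rs`. Because `AHashMap` only swaps the default hasher, it is a drop-in replacement for `std::collections::HashMap` when building the IPC `Dictionaries` map, and downstream crates do not need their own `ahash` dependency. The dictionary id `0` and the `Int32Array` values are made-up example data.

```rust
// Sketch only: construct a Dictionaries-style map through the arrow2 re-export.
use arrow2::array::{Array, Int32Array};
use arrow2::AHashMap;

fn main() {
    // AHashMap exposes the same API as std::collections::HashMap; only the
    // default hasher (aHash) differs, so insert/get/iteration are unchanged.
    let mut dictionaries: AHashMap<i64, Box<dyn Array>> = AHashMap::new();

    // hypothetical dictionary id and values, for illustration only
    let values: Box<dyn Array> = Box::new(Int32Array::from_slice(&[1, 2, 3]));
    dictionaries.insert(0, values);

    assert_eq!(dictionaries.len(), 1);
}
```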