From d9eaf33872a1641df85fe52a3dd503c40e714943 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 18 Oct 2023 10:54:16 +0100 Subject: [PATCH] Update arrow 48.0.0 --- Cargo.toml | 20 +++- datafusion-cli/Cargo.lock | 112 +++++++----------- datafusion-cli/Cargo.toml | 9 +- datafusion/common/Cargo.toml | 2 +- .../common/src/file_options/csv_writer.rs | 24 +--- .../core/src/datasource/file_format/csv.rs | 6 +- .../src/datasource/listing_table_factory.rs | 3 +- .../core/src/datasource/physical_plan/csv.rs | 2 +- datafusion/core/tests/sql/mod.rs | 2 +- datafusion/wasmtest/Cargo.toml | 2 +- 10 files changed, 73 insertions(+), 109 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d28ebd15a09e7..2ad0c57f58004 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,12 +48,12 @@ rust-version = "1.70" version = "32.0.0" [workspace.dependencies] -arrow = { version = "47.0.0", features = ["prettyprint"] } -arrow-array = { version = "47.0.0", default-features = false, features = ["chrono-tz"] } -arrow-buffer = { version = "47.0.0", default-features = false } -arrow-flight = { version = "47.0.0", features = ["flight-sql-experimental"] } -arrow-schema = { version = "47.0.0", default-features = false } -parquet = { version = "47.0.0", features = ["arrow", "async", "object_store"] } +arrow = { version = "48.0.0", features = ["prettyprint"] } +arrow-array = { version = "48.0.0", default-features = false, features = ["chrono-tz"] } +arrow-buffer = { version = "48.0.0", default-features = false } +arrow-flight = { version = "48.0.0", features = ["flight-sql-experimental"] } +arrow-schema = { version = "48.0.0", default-features = false } +parquet = { version = "48.0.0", features = ["arrow", "async", "object_store"] } sqlparser = { version = "0.38.0", features = ["visitor"] } chrono = { version = "0.4.31", default-features = false } @@ -74,3 +74,11 @@ opt-level = 3 overflow-checks = false panic = 'unwind' rpath = false + +[patch.crates-io] +arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-array = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-flight = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 1c872c28485ca..4e7aefe2ab585 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -129,9 +129,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fab9e93ba8ce88a37d5a30dce4b9913b75413dc1ac56cb5d72e5a840543f829" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-arith", @@ -151,9 +150,8 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc1d4e368e87ad9ee64f28b9577a3834ce10fe2703a26b28417d485bbbdff956" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -166,9 +164,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-buffer", @@ -183,9 +180,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda119225204141138cb0541c692fbfef0e875ba01bfdeaed09e9d354f9d6195" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "bytes", "half", @@ -194,9 +190,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d825d51b9968868d50bc5af92388754056796dbc62a4e25307d588a1fc84dee" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -212,9 +207,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43ef855dc6b126dc197f43e061d4de46b9d4c033aa51c2587657f7508242cef1" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -231,9 +225,8 @@ dependencies = [ [[package]] name = "arrow-data" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-buffer", "arrow-schema", @@ -243,9 +236,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1248005c8ac549f869b7a840859d942bf62471479c1a2d82659d453eebcd166a" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -257,9 +249,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f03d7e3b04dd688ccec354fe449aed56b831679f03e44ee2c1cfc4045067b69c" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -277,9 +268,8 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03b87aa408ea6a6300e49eb2eba0c032c88ed9dc19e0a9948489c55efdca71f4" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -292,9 +282,8 @@ dependencies = [ [[package]] name = "arrow-row" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "114a348ab581e7c9b6908fcab23cb39ff9f060eb19e72b13f8fb8eaa37f65d22" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-array", @@ -307,15 +296,13 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" [[package]] name = "arrow-select" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5c71e003202e67e9db139e5278c79f5520bb79922261dfe140e4637ee8b6108" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-array", @@ -327,9 +314,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cebbb282d6b9244895f4a9a912e55e57bce112554c7fa91fcec5459cb421ab" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -338,7 +324,7 @@ dependencies = [ "arrow-select", "num", "regex", - "regex-syntax 0.7.5", + "regex-syntax", ] [[package]] @@ -1232,7 +1218,7 @@ dependencies = [ "hashbrown 0.14.1", "itertools", "log", - "regex-syntax 0.8.1", + "regex-syntax", ] [[package]] @@ -2072,23 +2058,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] -name = "lz4" -version = "1.24.0" +name = "lz4_flex" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" +checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8" dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" -dependencies = [ - "cc", - "libc", + "twox-hash", ] [[package]] @@ -2363,9 +2338,8 @@ dependencies = [ [[package]] name = "parquet" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-array", @@ -2382,7 +2356,7 @@ dependencies = [ "flate2", "futures", "hashbrown 0.14.1", - "lz4", + "lz4_flex", "num", "num-bigint", "object_store", @@ -2392,7 +2366,7 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd 0.12.4", + "zstd 0.13.0", ] [[package]] @@ -2681,7 +2655,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 0.8.1", + "regex-syntax", ] [[package]] @@ -2692,7 +2666,7 @@ checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.1", + "regex-syntax", ] [[package]] @@ -2701,12 +2675,6 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a6ebcd15653947e6140f59a9811a06ed061d18a5c35dfca2e2e4c5525696878" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.1" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index b2a22cec987fd..2a50ef04ba51b 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -29,7 +29,7 @@ rust-version = "1.70" readme = "README.md" [dependencies] -arrow = "47.0.0" +arrow = "48.0.0" async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" @@ -50,3 +50,10 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.17" + +[patch.crates-io] +arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-array = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 733aba1e1da15..047c502d5cc2f 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -50,7 +50,7 @@ half = { version = "2.1", default-features = false } num_cpus = "1.13.0" object_store = { version = "0.7.0", default-features = false, optional = true } parquet = { workspace = true, optional = true } -pyo3 = { version = "0.19.0", optional = true } +pyo3 = { version = "0.20.0", optional = true } sqlparser = { workspace = true } [dev-dependencies] diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index b69e778431cc5..fef4a1d21b4bc 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -37,13 +37,6 @@ pub struct CsvWriterOptions { /// Compression to apply after ArrowWriter serializes RecordBatches. /// This compression is applied by DataFusion not the ArrowWriter itself. pub compression: CompressionTypeVariant, - /// Indicates whether WriterBuilder.has_header() is set to true. - /// This is duplicative as WriterBuilder also stores this information. - /// However, WriterBuilder does not allow public read access to the - /// has_header parameter. - pub has_header: bool, - // TODO: expose a way to read has_header in arrow create - // https://github.com/apache/arrow-rs/issues/4735 } impl CsvWriterOptions { @@ -54,7 +47,6 @@ impl CsvWriterOptions { Self { writer_options, compression, - has_header: true, } } } @@ -65,29 +57,20 @@ impl TryFrom<(&ConfigOptions, &StatementOptions)> for CsvWriterOptions { fn try_from(value: (&ConfigOptions, &StatementOptions)) -> Result { let _configs = value.0; let statement_options = value.1; - let mut has_header = true; let mut builder = WriterBuilder::default(); let mut compression = CompressionTypeVariant::UNCOMPRESSED; for (option, value) in &statement_options.options { builder = match option.to_lowercase().as_str(){ "header" => { - has_header = value.parse() + let has_header = value.parse() .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - builder.has_headers(has_header) + builder.with_header(has_header) }, "date_format" => builder.with_date_format(value.to_owned()), "datetime_format" => builder.with_datetime_format(value.to_owned()), "timestamp_format" => builder.with_timestamp_format(value.to_owned()), "time_format" => builder.with_time_format(value.to_owned()), - "rfc3339" => { - let value_bool = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - if value_bool{ - builder.with_rfc3339() - } else{ - builder - } - }, + "rfc3339" => builder, // No-op "null_value" => builder.with_null(value.to_owned()), "compression" => { compression = CompressionTypeVariant::from_str(value.replace('\'', "").as_str())?; @@ -112,7 +95,6 @@ impl TryFrom<(&ConfigOptions, &StatementOptions)> for CsvWriterOptions { } } Ok(CsvWriterOptions { - has_header, writer_options: builder, compression, }) diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index bc01b29ba04b2..939d3b0b55814 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -431,7 +431,7 @@ impl CsvSerializer { impl BatchSerializer for CsvSerializer { async fn serialize(&mut self, batch: RecordBatch) -> Result { let builder = self.builder.clone(); - let mut writer = builder.has_headers(self.header).build(&mut self.buffer); + let mut writer = builder.with_header(self.header).build(&mut self.buffer); writer.write(&batch)?; drop(writer); self.header = false; @@ -508,7 +508,7 @@ impl CsvSink { } else { CsvSerializer::new() .with_builder(inner_clone) - .with_header(options_clone.has_header) + .with_header(options_clone.writer_options.header()) }); serializer }; @@ -540,7 +540,7 @@ impl CsvSink { let serializer: Box = Box::new( CsvSerializer::new() .with_builder(inner_clone) - .with_header(options_clone.has_header), + .with_header(options_clone.writer_options.header()), ); serializer }; diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index ebfb589f179e1..e74bf6fa6499c 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -179,10 +179,9 @@ impl TableProviderFactory for ListingTableFactory { FileType::CSV => { let mut csv_writer_options = file_type_writer_options.try_into_csv()?.clone(); - csv_writer_options.has_header = cmd.has_header; csv_writer_options.writer_options = csv_writer_options .writer_options - .has_headers(cmd.has_header) + .with_header(cmd.has_header) .with_delimiter(cmd.delimiter.try_into().map_err(|_| { DataFusionError::Internal( "Unable to convert CSV delimiter into u8".into(), diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index f3b2fa9de7a9c..8cb4811bb0b01 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -538,7 +538,7 @@ pub async fn plan_to_csv( let mut write_headers = true; while let Some(batch) = stream.next().await.transpose()? { let mut writer = csv::WriterBuilder::new() - .has_headers(write_headers) + .with_header(write_headers) .build(buffer); writer.write(&batch)?; buffer = writer.into_inner(); diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index d44513e69a9f9..c50df557e759c 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -335,7 +335,7 @@ async fn register_tpch_csv_data( let schema = Arc::new(get_tpch_table_schema(table_name)); let mut reader = ::csv::ReaderBuilder::new() - .has_headers(false) + .with_header(false) .from_reader(data.as_bytes()); let records: Vec<_> = reader.records().map(|it| it.unwrap()).collect(); diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 691031866af2f..e1a9a5d41a5ac 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -46,5 +46,5 @@ datafusion-sql = { path = "../sql" } # getrandom must be compiled with js feature getrandom = { version = "0.2.8", features = ["js"] } -parquet = { version = "47.0.0", default-features = false } +parquet = { version = "48.0.0", default-features = false } wasm-bindgen = "0.2.87"