From 30d3269b4149f49730a5090aeac9043858c333e1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 18 Oct 2023 10:54:16 +0100 Subject: [PATCH 1/5] Update arrow 48.0.0 --- Cargo.toml | 20 +++- datafusion-cli/Cargo.lock | 112 +++++++----------- datafusion-cli/Cargo.toml | 9 +- datafusion/common/Cargo.toml | 2 +- .../common/src/file_options/csv_writer.rs | 24 +--- datafusion/common/src/file_options/mod.rs | 2 +- .../core/src/datasource/file_format/csv.rs | 6 +- .../src/datasource/listing_table_factory.rs | 3 +- .../core/src/datasource/physical_plan/csv.rs | 2 +- datafusion/wasmtest/Cargo.toml | 2 +- 10 files changed, 73 insertions(+), 109 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d28ebd15a09e..2ad0c57f5800 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,12 +48,12 @@ rust-version = "1.70" version = "32.0.0" [workspace.dependencies] -arrow = { version = "47.0.0", features = ["prettyprint"] } -arrow-array = { version = "47.0.0", default-features = false, features = ["chrono-tz"] } -arrow-buffer = { version = "47.0.0", default-features = false } -arrow-flight = { version = "47.0.0", features = ["flight-sql-experimental"] } -arrow-schema = { version = "47.0.0", default-features = false } -parquet = { version = "47.0.0", features = ["arrow", "async", "object_store"] } +arrow = { version = "48.0.0", features = ["prettyprint"] } +arrow-array = { version = "48.0.0", default-features = false, features = ["chrono-tz"] } +arrow-buffer = { version = "48.0.0", default-features = false } +arrow-flight = { version = "48.0.0", features = ["flight-sql-experimental"] } +arrow-schema = { version = "48.0.0", default-features = false } +parquet = { version = "48.0.0", features = ["arrow", "async", "object_store"] } sqlparser = { version = "0.38.0", features = ["visitor"] } chrono = { version = "0.4.31", default-features = false } @@ -74,3 +74,11 @@ opt-level = 3 overflow-checks = false panic = 'unwind' rpath = false + +[patch.crates-io] +arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-array = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-flight = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 1c872c28485c..4e7aefe2ab58 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -129,9 +129,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fab9e93ba8ce88a37d5a30dce4b9913b75413dc1ac56cb5d72e5a840543f829" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-arith", @@ -151,9 +150,8 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc1d4e368e87ad9ee64f28b9577a3834ce10fe2703a26b28417d485bbbdff956" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -166,9 +164,8 @@ dependencies = [ [[package]] name = "arrow-array" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d02efa7253ede102d45a4e802a129e83bcc3f49884cab795b1ac223918e4318d" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-buffer", @@ -183,9 +180,8 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda119225204141138cb0541c692fbfef0e875ba01bfdeaed09e9d354f9d6195" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "bytes", "half", @@ -194,9 +190,8 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d825d51b9968868d50bc5af92388754056796dbc62a4e25307d588a1fc84dee" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -212,9 +207,8 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43ef855dc6b126dc197f43e061d4de46b9d4c033aa51c2587657f7508242cef1" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -231,9 +225,8 @@ dependencies = [ [[package]] name = "arrow-data" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "475a4c3699c8b4095ca61cecf15da6f67841847a5f5aac983ccb9a377d02f73a" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-buffer", "arrow-schema", @@ -243,9 +236,8 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1248005c8ac549f869b7a840859d942bf62471479c1a2d82659d453eebcd166a" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -257,9 +249,8 @@ dependencies = [ [[package]] name = "arrow-json" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f03d7e3b04dd688ccec354fe449aed56b831679f03e44ee2c1cfc4045067b69c" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -277,9 +268,8 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03b87aa408ea6a6300e49eb2eba0c032c88ed9dc19e0a9948489c55efdca71f4" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -292,9 +282,8 @@ dependencies = [ [[package]] name = "arrow-row" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "114a348ab581e7c9b6908fcab23cb39ff9f060eb19e72b13f8fb8eaa37f65d22" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-array", @@ -307,15 +296,13 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d1d179c117b158853e0101bfbed5615e86fe97ee356b4af901f1c5001e1ce4b" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" [[package]] name = "arrow-select" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5c71e003202e67e9db139e5278c79f5520bb79922261dfe140e4637ee8b6108" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-array", @@ -327,9 +314,8 @@ dependencies = [ [[package]] name = "arrow-string" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4cebbb282d6b9244895f4a9a912e55e57bce112554c7fa91fcec5459cb421ab" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "arrow-array", "arrow-buffer", @@ -338,7 +324,7 @@ dependencies = [ "arrow-select", "num", "regex", - "regex-syntax 0.7.5", + "regex-syntax", ] [[package]] @@ -1232,7 +1218,7 @@ dependencies = [ "hashbrown 0.14.1", "itertools", "log", - "regex-syntax 0.8.1", + "regex-syntax", ] [[package]] @@ -2072,23 +2058,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] -name = "lz4" -version = "1.24.0" +name = "lz4_flex" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" +checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8" dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" -dependencies = [ - "cc", - "libc", + "twox-hash", ] [[package]] @@ -2363,9 +2338,8 @@ dependencies = [ [[package]] name = "parquet" -version = "47.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0463cc3b256d5f50408c49a4be3a16674f4c8ceef60941709620a062b1f6bf4d" +version = "48.0.0" +source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" dependencies = [ "ahash", "arrow-array", @@ -2382,7 +2356,7 @@ dependencies = [ "flate2", "futures", "hashbrown 0.14.1", - "lz4", + "lz4_flex", "num", "num-bigint", "object_store", @@ -2392,7 +2366,7 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd 0.12.4", + "zstd 0.13.0", ] [[package]] @@ -2681,7 +2655,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax 0.8.1", + "regex-syntax", ] [[package]] @@ -2692,7 +2666,7 @@ checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.1", + "regex-syntax", ] [[package]] @@ -2701,12 +2675,6 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a6ebcd15653947e6140f59a9811a06ed061d18a5c35dfca2e2e4c5525696878" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.1" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index b2a22cec987f..2a50ef04ba51 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -29,7 +29,7 @@ rust-version = "1.70" readme = "README.md" [dependencies] -arrow = "47.0.0" +arrow = "48.0.0" async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" @@ -50,3 +50,10 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.17" + +[patch.crates-io] +arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-array = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } +parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 733aba1e1da1..047c502d5cc2 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -50,7 +50,7 @@ half = { version = "2.1", default-features = false } num_cpus = "1.13.0" object_store = { version = "0.7.0", default-features = false, optional = true } parquet = { workspace = true, optional = true } -pyo3 = { version = "0.19.0", optional = true } +pyo3 = { version = "0.20.0", optional = true } sqlparser = { workspace = true } [dev-dependencies] diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index b69e778431cc..fef4a1d21b4b 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -37,13 +37,6 @@ pub struct CsvWriterOptions { /// Compression to apply after ArrowWriter serializes RecordBatches. /// This compression is applied by DataFusion not the ArrowWriter itself. pub compression: CompressionTypeVariant, - /// Indicates whether WriterBuilder.has_header() is set to true. - /// This is duplicative as WriterBuilder also stores this information. - /// However, WriterBuilder does not allow public read access to the - /// has_header parameter. - pub has_header: bool, - // TODO: expose a way to read has_header in arrow create - // https://github.com/apache/arrow-rs/issues/4735 } impl CsvWriterOptions { @@ -54,7 +47,6 @@ impl CsvWriterOptions { Self { writer_options, compression, - has_header: true, } } } @@ -65,29 +57,20 @@ impl TryFrom<(&ConfigOptions, &StatementOptions)> for CsvWriterOptions { fn try_from(value: (&ConfigOptions, &StatementOptions)) -> Result { let _configs = value.0; let statement_options = value.1; - let mut has_header = true; let mut builder = WriterBuilder::default(); let mut compression = CompressionTypeVariant::UNCOMPRESSED; for (option, value) in &statement_options.options { builder = match option.to_lowercase().as_str(){ "header" => { - has_header = value.parse() + let has_header = value.parse() .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - builder.has_headers(has_header) + builder.with_header(has_header) }, "date_format" => builder.with_date_format(value.to_owned()), "datetime_format" => builder.with_datetime_format(value.to_owned()), "timestamp_format" => builder.with_timestamp_format(value.to_owned()), "time_format" => builder.with_time_format(value.to_owned()), - "rfc3339" => { - let value_bool = value.parse() - .map_err(|_| DataFusionError::Configuration(format!("Unable to parse {value} as bool as required for {option}!")))?; - if value_bool{ - builder.with_rfc3339() - } else{ - builder - } - }, + "rfc3339" => builder, // No-op "null_value" => builder.with_null(value.to_owned()), "compression" => { compression = CompressionTypeVariant::from_str(value.replace('\'', "").as_str())?; @@ -112,7 +95,6 @@ impl TryFrom<(&ConfigOptions, &StatementOptions)> for CsvWriterOptions { } } Ok(CsvWriterOptions { - has_header, writer_options: builder, compression, }) diff --git a/datafusion/common/src/file_options/mod.rs b/datafusion/common/src/file_options/mod.rs index 45b105dfadae..b7c1341e3046 100644 --- a/datafusion/common/src/file_options/mod.rs +++ b/datafusion/common/src/file_options/mod.rs @@ -523,9 +523,9 @@ mod tests { let csv_options = CsvWriterOptions::try_from((&config, &options))?; let builder = csv_options.writer_options; + assert!(builder.header()); let buff = Vec::new(); let _properties = builder.build(buff); - assert!(csv_options.has_header); assert_eq!(csv_options.compression, CompressionTypeVariant::GZIP); // TODO expand unit test if csv::WriterBuilder allows public read access to properties diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index bc01b29ba04b..939d3b0b5581 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -431,7 +431,7 @@ impl CsvSerializer { impl BatchSerializer for CsvSerializer { async fn serialize(&mut self, batch: RecordBatch) -> Result { let builder = self.builder.clone(); - let mut writer = builder.has_headers(self.header).build(&mut self.buffer); + let mut writer = builder.with_header(self.header).build(&mut self.buffer); writer.write(&batch)?; drop(writer); self.header = false; @@ -508,7 +508,7 @@ impl CsvSink { } else { CsvSerializer::new() .with_builder(inner_clone) - .with_header(options_clone.has_header) + .with_header(options_clone.writer_options.header()) }); serializer }; @@ -540,7 +540,7 @@ impl CsvSink { let serializer: Box = Box::new( CsvSerializer::new() .with_builder(inner_clone) - .with_header(options_clone.has_header), + .with_header(options_clone.writer_options.header()), ); serializer }; diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index ebfb589f179e..e74bf6fa6499 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -179,10 +179,9 @@ impl TableProviderFactory for ListingTableFactory { FileType::CSV => { let mut csv_writer_options = file_type_writer_options.try_into_csv()?.clone(); - csv_writer_options.has_header = cmd.has_header; csv_writer_options.writer_options = csv_writer_options .writer_options - .has_headers(cmd.has_header) + .with_header(cmd.has_header) .with_delimiter(cmd.delimiter.try_into().map_err(|_| { DataFusionError::Internal( "Unable to convert CSV delimiter into u8".into(), diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index f3b2fa9de7a9..8cb4811bb0b0 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -538,7 +538,7 @@ pub async fn plan_to_csv( let mut write_headers = true; while let Some(batch) = stream.next().await.transpose()? { let mut writer = csv::WriterBuilder::new() - .has_headers(write_headers) + .with_header(write_headers) .build(buffer); writer.write(&batch)?; buffer = writer.into_inner(); diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml index 691031866af2..e1a9a5d41a5a 100644 --- a/datafusion/wasmtest/Cargo.toml +++ b/datafusion/wasmtest/Cargo.toml @@ -46,5 +46,5 @@ datafusion-sql = { path = "../sql" } # getrandom must be compiled with js feature getrandom = { version = "0.2.8", features = ["js"] } -parquet = { version = "47.0.0", default-features = false } +parquet = { version = "48.0.0", default-features = false } wasm-bindgen = "0.2.87" From 66e819e6d7cab6feb609b6a86034f69ee20b3c7f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 18 Oct 2023 13:42:16 +0100 Subject: [PATCH 2/5] Fix for pyo3 --- datafusion/common/src/pyarrow.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/pyarrow.rs b/datafusion/common/src/pyarrow.rs index d78aa8b988f7..59a8b811e3c8 100644 --- a/datafusion/common/src/pyarrow.rs +++ b/datafusion/common/src/pyarrow.rs @@ -94,10 +94,11 @@ mod tests { Some(locals), ) .expect("Couldn't get python info"); - let executable: String = - locals.get_item("executable").unwrap().extract().unwrap(); - let python_path: Vec<&str> = - locals.get_item("python_path").unwrap().extract().unwrap(); + let executable = locals.get_item("executable").unwrap().unwrap(); + let executable: String = executable.extract().unwrap(); + + let python_path = locals.get_item("python_path").unwrap().unwrap(); + let python_path: Vec<&str> = python_path.extract().unwrap(); panic!("pyarrow not found\nExecutable: {executable}\nPython path: {python_path:?}\n\ HINT: try `pip install pyarrow`\n\ From 5245de3d2356cd0eea4c9f27bee630d2ac029b46 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 18 Oct 2023 13:46:52 +0100 Subject: [PATCH 3/5] Update json.slt --- datafusion/sqllogictest/test_files/json.slt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index 69902f2982dc..db1311e0bad4 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -58,8 +58,10 @@ AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)] ------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 --------JsonExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/2.json]]}, projection=[a] -query error DataFusion error: Schema error: No field named mycol\. +query ? SELECT mycol FROM single_nan +---- +NULL statement ok DROP TABLE json_test From 18345c634b21b58873a87626d3946e398baa52e1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 23 Oct 2023 16:10:51 +0100 Subject: [PATCH 4/5] Update pin and fix clippy --- Cargo.toml | 8 ---- datafusion-cli/Cargo.lock | 45 ++++++++++++------- datafusion-cli/Cargo.toml | 7 --- .../core/src/datasource/physical_plan/csv.rs | 2 +- datafusion/core/tests/fifo.rs | 2 +- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2ad0c57f5800..71088e7fc7ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,11 +74,3 @@ opt-level = 3 overflow-checks = false panic = 'unwind' rpath = false - -[patch.crates-io] -arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-array = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-flight = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4e7aefe2ab58..b83088f94c57 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -130,7 +130,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edb738d83750ec705808f6d44046d165e6bb8623f64e29a4d53fcb136ab22dfb" dependencies = [ "ahash", "arrow-arith", @@ -151,7 +152,8 @@ dependencies = [ [[package]] name = "arrow-arith" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c3d17fc5b006e7beeaebfb1d2edfc92398b981f82d9744130437909b72a468" dependencies = [ "arrow-array", "arrow-buffer", @@ -165,7 +167,8 @@ dependencies = [ [[package]] name = "arrow-array" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55705ada5cdde4cb0f202ffa6aa756637e33fea30e13d8d0d0fd6a24ffcee1e3" dependencies = [ "ahash", "arrow-buffer", @@ -181,7 +184,8 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a722f90a09b94f295ab7102542e97199d3500128843446ef63e410ad546c5333" dependencies = [ "bytes", "half", @@ -191,7 +195,8 @@ dependencies = [ [[package]] name = "arrow-cast" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af01fc1a06f6f2baf31a04776156d47f9f31ca5939fe6d00cd7a059f95a46ff1" dependencies = [ "arrow-array", "arrow-buffer", @@ -208,7 +213,8 @@ dependencies = [ [[package]] name = "arrow-csv" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83cbbfde86f9ecd3f875c42a73d8aeab3d95149cd80129b18d09e039ecf5391b" dependencies = [ "arrow-array", "arrow-buffer", @@ -226,7 +232,8 @@ dependencies = [ [[package]] name = "arrow-data" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a547195e607e625e7fafa1a7269b8df1a4a612c919efd9b26bd86e74538f3a" dependencies = [ "arrow-buffer", "arrow-schema", @@ -237,7 +244,8 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e36bf091502ab7e37775ff448413ef1ffff28ff93789acb669fffdd51b394d51" dependencies = [ "arrow-array", "arrow-buffer", @@ -250,7 +258,8 @@ dependencies = [ [[package]] name = "arrow-json" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ac346bc84846ab425ab3c8c7b6721db90643bc218939677ed7e071ccbfb919d" dependencies = [ "arrow-array", "arrow-buffer", @@ -269,7 +278,8 @@ dependencies = [ [[package]] name = "arrow-ord" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4502123d2397319f3a13688432bc678c61cb1582f2daa01253186da650bf5841" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,7 +293,8 @@ dependencies = [ [[package]] name = "arrow-row" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "249fc5a07906ab3f3536a6e9f118ec2883fbcde398a97a5ba70053f0276abda4" dependencies = [ "ahash", "arrow-array", @@ -297,12 +308,14 @@ dependencies = [ [[package]] name = "arrow-schema" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d7a8c3f97f5ef6abd862155a6f39aaba36b029322462d72bbcfa69782a50614" [[package]] name = "arrow-select" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f868f4a5001429e20f7c1994b5cd1aa68b82e3db8cf96c559cdb56dc8be21410" dependencies = [ "ahash", "arrow-array", @@ -315,7 +328,8 @@ dependencies = [ [[package]] name = "arrow-string" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27fdf8fc70040a2dee78af2e217479cb5b263bd7ab8711c7999e74056eb688a" dependencies = [ "arrow-array", "arrow-buffer", @@ -2339,7 +2353,8 @@ dependencies = [ [[package]] name = "parquet" version = "48.0.0" -source = "git+https://github.com/tustvold/arrow-rs.git?rev=27c795d5e4dc2351eb57f7225cd9dd051d3651c3#27c795d5e4dc2351eb57f7225cd9dd051d3651c3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "239229e6a668ab50c61de3dce61cf0fa1069345f7aa0f4c934491f92205a4945" dependencies = [ "ahash", "arrow-array", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 2a50ef04ba51..64e094437c5f 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -50,10 +50,3 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.17" - -[patch.crates-io] -arrow = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-array = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-buffer = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -arrow-schema = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } -parquet = { git = "https://github.com/tustvold/arrow-rs.git", rev = "27c795d5e4dc2351eb57f7225cd9dd051d3651c3" } diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 8cb4811bb0b0..99d08022e64b 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -289,7 +289,7 @@ impl CsvConfig { let mut builder = csv::ReaderBuilder::new(self.file_schema.clone()) .with_delimiter(self.delimiter) .with_batch_size(self.batch_size) - .has_header(self.has_header) + .with_header(self.has_header) .with_quote(self.quote); if let Some(proj) = &self.file_projection { diff --git a/datafusion/core/tests/fifo.rs b/datafusion/core/tests/fifo.rs index 2c8b0b784f26..7d9ea97f7b5b 100644 --- a/datafusion/core/tests/fifo.rs +++ b/datafusion/core/tests/fifo.rs @@ -377,7 +377,7 @@ mod unix_test { ])); let mut reader = ReaderBuilder::new(schema) - .has_header(true) + .with_header(true) .with_batch_size(TEST_BATCH_SIZE) .build(file) .map_err(|e| DataFusionError::Internal(e.to_string())) From f16fa15383966efc82dc9c622f82e351343b79db Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 23 Oct 2023 16:53:22 +0100 Subject: [PATCH 5/5] More clippy --- datafusion-cli/src/print_format.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-cli/src/print_format.rs b/datafusion-cli/src/print_format.rs index e2994bc14034..0738bf6f9b47 100644 --- a/datafusion-cli/src/print_format.rs +++ b/datafusion-cli/src/print_format.rs @@ -59,7 +59,7 @@ fn print_batches_with_sep(batches: &[RecordBatch], delimiter: u8) -> Result