diff --git a/.cargo/audit.toml b/.cargo/audit.toml index aa5492c1beb..fa5991233ee 100644 --- a/.cargo/audit.toml +++ b/.cargo/audit.toml @@ -10,4 +10,8 @@ ignore = [ # Therefore, this advisory does not affect us. "RUSTSEC-2020-0071", "RUSTSEC-2020-0159", # same as previous + + # this cannot be addressed, only mitigated. + # See [.github/workflows/security.yml] for details on how we mitigate this. + "RUSTSEC-2021-0122", ] diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index a1d19e6f8c6..1d6099a4553 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -8,3 +8,25 @@ jobs: - uses: actions-rs/audit-check@v1 with: token: ${{ secrets.GITHUB_TOKEN }} + + # mitigation for RUSTSEC-2021-0122 + # flatbuffers' usage of `unsafe` is problematic and a risk. + # This performs a round-trip over IPC (that uses flatbuffers) for some arrow types + # using miri, which hits much of `flatbuffers` usage in this crate. + miri-checks: + name: RUSTSEC-2021-0122 mitigation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: nightly-2021-10-24 + override: true + - uses: Swatinem/rust-cache@v1 + - name: Install Miri + run: | rustup component add miri + cargo miri setup + + - name: Run + run: MIRIFLAGS="-Zmiri-disable-stacked-borrows -Zmiri-disable-isolation" cargo miri test --tests --features io_ipc,io_ipc_compression,io_json_integration io::ipc::write::file::write_100_nested diff --git a/tests/it/io/ipc/write/file.rs b/tests/it/io/ipc/write/file.rs index e00936203c4..8dcbc7423cb 100644 --- a/tests/it/io/ipc/write/file.rs +++ b/tests/it/io/ipc/write/file.rs @@ -14,7 +14,7 @@ fn round_trip(batch: RecordBatch) -> Result<()> { // write IPC version 5 let written_result = { let options = WriteOptions { - compression: Some(Compression::ZSTD), + compression: Some(Compression::LZ4), }; let mut writer = FileWriter::try_new(result, batch.schema(), options)?;
writer.write(&batch)?; @@ -85,7 +85,12 @@ fn test_file(version: &str, file_name: &str, compressed: bool) -> Result<()> { #[test] fn write_100_primitive() -> Result<()> { test_file("1.0.0-littleendian", "generated_primitive", false)?; - test_file("1.0.0-bigendian", "generated_primitive", false)?; + test_file("1.0.0-bigendian", "generated_primitive", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_primitive() -> Result<()> { test_file("1.0.0-littleendian", "generated_primitive", true)?; test_file("1.0.0-bigendian", "generated_primitive", true) } @@ -93,7 +98,12 @@ fn write_100_primitive() -> Result<()> { #[test] fn write_100_datetime() -> Result<()> { test_file("1.0.0-littleendian", "generated_datetime", false)?; - test_file("1.0.0-bigendian", "generated_datetime", false)?; + test_file("1.0.0-bigendian", "generated_datetime", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_datetime() -> Result<()> { test_file("1.0.0-littleendian", "generated_datetime", true)?; test_file("1.0.0-bigendian", "generated_datetime", true) } @@ -101,7 +111,12 @@ fn write_100_datetime() -> Result<()> { #[test] fn write_100_dictionary_unsigned() -> Result<()> { test_file("1.0.0-littleendian", "generated_dictionary_unsigned", false)?; - test_file("1.0.0-bigendian", "generated_dictionary_unsigned", false)?; + test_file("1.0.0-bigendian", "generated_dictionary_unsigned", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_dictionary_unsigned() -> Result<()> { test_file("1.0.0-littleendian", "generated_dictionary_unsigned", true)?; test_file("1.0.0-bigendian", "generated_dictionary_unsigned", true) } @@ -109,7 +124,12 @@ fn write_100_dictionary_unsigned() -> Result<()> { #[test] fn write_100_dictionary() -> Result<()> { test_file("1.0.0-littleendian", 
"generated_dictionary", false)?; - test_file("1.0.0-bigendian", "generated_dictionary", false)?; + test_file("1.0.0-bigendian", "generated_dictionary", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_dictionary() -> Result<()> { test_file("1.0.0-littleendian", "generated_dictionary", true)?; test_file("1.0.0-bigendian", "generated_dictionary", true) } @@ -117,7 +137,12 @@ fn write_100_dictionary() -> Result<()> { #[test] fn write_100_interval() -> Result<()> { test_file("1.0.0-littleendian", "generated_interval", false)?; - test_file("1.0.0-bigendian", "generated_interval", false)?; + test_file("1.0.0-bigendian", "generated_interval", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_interval() -> Result<()> { test_file("1.0.0-littleendian", "generated_interval", true)?; test_file("1.0.0-bigendian", "generated_interval", true) } @@ -132,7 +157,12 @@ fn write_100_large_batch() -> Result<()> { #[test] fn write_100_nested() -> Result<()> { test_file("1.0.0-littleendian", "generated_nested", false)?; - test_file("1.0.0-bigendian", "generated_nested", false)?; + test_file("1.0.0-bigendian", "generated_nested", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_nested() -> Result<()> { test_file("1.0.0-littleendian", "generated_nested", true)?; test_file("1.0.0-bigendian", "generated_nested", true) } @@ -144,7 +174,12 @@ fn write_100_nested_large_offsets() -> Result<()> { "generated_nested_large_offsets", false, )?; - test_file("1.0.0-bigendian", "generated_nested_large_offsets", false)?; + test_file("1.0.0-bigendian", "generated_nested_large_offsets", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_nested_large_offsets() -> Result<()> { test_file("1.0.0-littleendian", 
"generated_nested_large_offsets", true)?; test_file("1.0.0-bigendian", "generated_nested_large_offsets", true) } @@ -152,7 +187,12 @@ fn write_100_nested_large_offsets() -> Result<()> { #[test] fn write_100_null_trivial() -> Result<()> { test_file("1.0.0-littleendian", "generated_null_trivial", false)?; - test_file("1.0.0-bigendian", "generated_null_trivial", false)?; + test_file("1.0.0-bigendian", "generated_null_trivial", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_null_trivial() -> Result<()> { test_file("1.0.0-littleendian", "generated_null_trivial", true)?; test_file("1.0.0-bigendian", "generated_null_trivial", true) } @@ -160,7 +200,12 @@ fn write_100_null_trivial() -> Result<()> { #[test] fn write_100_null() -> Result<()> { test_file("1.0.0-littleendian", "generated_null", false)?; - test_file("1.0.0-bigendian", "generated_null", false)?; + test_file("1.0.0-bigendian", "generated_null", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_null() -> Result<()> { test_file("1.0.0-littleendian", "generated_null", true)?; test_file("1.0.0-bigendian", "generated_null", true) } @@ -176,7 +221,12 @@ fn write_100_primitive_large_offsets() -> Result<()> { "1.0.0-bigendian", "generated_primitive_large_offsets", false, - )?; + ) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_primitive_large_offsets() -> Result<()> { test_file( "1.0.0-littleendian", "generated_primitive_large_offsets", @@ -192,7 +242,12 @@ fn write_100_primitive_no_batches() -> Result<()> { "generated_primitive_no_batches", false, )?; - test_file("1.0.0-bigendian", "generated_primitive_no_batches", false)?; + test_file("1.0.0-bigendian", "generated_primitive_no_batches", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn 
write_100_compressed_primitive_no_batches() -> Result<()> { test_file("1.0.0-littleendian", "generated_primitive_no_batches", true)?; test_file("1.0.0-bigendian", "generated_primitive_no_batches", true) } @@ -204,7 +259,12 @@ fn write_100_primitive_zerolength() -> Result<()> { test_file( "1.0.0-littleendian", "generated_primitive_zerolength", false, )?; - test_file("1.0.0-bigendian", "generated_primitive_zerolength", false)?; + test_file("1.0.0-bigendian", "generated_primitive_zerolength", false) +} + +#[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support +fn write_100_compressed_primitive_zerolength() -> Result<()> { test_file("1.0.0-littleendian", "generated_primitive_zerolength", true)?; test_file("1.0.0-bigendian", "generated_primitive_zerolength", true) } @@ -262,6 +322,7 @@ fn write_generated_017_union() -> Result<()> { } #[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support fn write_boolean() -> Result<()> { use std::sync::Arc; let array = Arc::new(BooleanArray::from([ @@ -275,6 +336,7 @@ fn write_boolean() -> Result<()> { } #[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support fn write_sliced_utf8() -> Result<()> { use std::sync::Arc; let array = Arc::new(Utf8Array::<i32>::from_slice(["aa", "bb"]).slice(1, 1)) as Arc<dyn Array>; @@ -283,6 +345,7 @@ fn write_sliced_utf8() -> Result<()> { } #[test] +#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support fn write_sliced_list() -> Result<()> { let data = vec![ Some(vec![Some(1i32), Some(2), Some(3)]), diff --git a/tests/it/test_util.rs b/tests/it/test_util.rs index edabe6b88f9..3a9ea1b52e9 100644 --- a/tests/it/test_util.rs +++ b/tests/it/test_util.rs @@ -1,89 +1,3 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership.
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Utils to make testing easier - -use std::{env, error::Error, path::PathBuf}; - -/// Returns the arrow test data directory, which is by default stored -/// in a git submodule rooted at `arrow/testing/data`. -/// -/// The default can be overridden by the optional environment -/// variable `ARROW_TEST_DATA` -/// -/// panics when the directory can not be found. -/// -/// Example: -/// ``` -/// let testdata = arrow::util::test_util::arrow_test_data(); -/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata); -/// assert!(std::path::PathBuf::from(csvdata).exists()); -/// ``` pub fn arrow_test_data() -> String { - match get_data_dir("ARROW_TEST_DATA", "testing/arrow-testing/data") { - Ok(pb) => pb.display().to_string(), - Err(err) => panic!("failed to get arrow data dir: {}", err), - } -} - -/// Returns a directory path for finding test data. -/// -/// udf_env: name of an environment variable -/// -/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR) -/// -/// Returns either: -/// The path referred to in `udf_env` if that variable is set and refers to a directory -/// The submodule_data directory relative to CARGO_MANIFEST_PATH -fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> { - // Try user defined env.
- if let Ok(dir) = env::var(udf_env) { - let trimmed = dir.trim().to_string(); - if !trimmed.is_empty() { - let pb = PathBuf::from(trimmed); - if pb.is_dir() { - return Ok(pb); - } else { - return Err(format!( - "the data dir `{}` defined by env {} not found", - pb.display().to_string(), - udf_env - ) - .into()); - } - } - } - - // The env is undefined or its value is trimmed to empty, let's try default dir. - - // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package", - // set by `cargo run` or `cargo test`, see: - // https://doc.rust-lang.org/cargo/reference/environment-variables.html - let dir = env!("CARGO_MANIFEST_DIR"); - - let pb = PathBuf::from(dir).join(submodule_data); - if pb.is_dir() { - Ok(pb) - } else { - Err(format!( - "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\ - HINT: try running `git submodule update --init`", - udf_env, - pb.display().to_string(), - ).into()) - } + "testing/arrow-testing/data".to_string() }