
Mitigated RUSTSEC-2021-0122 #595

Merged 1 commit on Nov 10, 2021
4 changes: 4 additions & 0 deletions .cargo/audit.toml
@@ -10,4 +10,8 @@ ignore = [
# Therefore, this advisory does not affect us.
"RUSTSEC-2020-0071",
"RUSTSEC-2020-0159", # same as previous

# this cannot be addressed, only mitigated.
# See [.github/workflows/security.yml] for details on how we mitigate this.
"RUSTSEC-2021-0122",
]
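
For reference, this is the shape `cargo-audit` expects in `.cargo/audit.toml`. A minimal sketch, assuming the `[advisories]` header sits above the elided lines of this hunk:

```toml
[advisories]
ignore = [
    "RUSTSEC-2020-0071",
    "RUSTSEC-2020-0159", # same as previous
    "RUSTSEC-2021-0122", # cannot be fixed here, only mitigated (see security.yml)
]
```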
24 changes: 24 additions & 0 deletions .github/workflows/security.yml
@@ -8,3 +8,27 @@ jobs:
- uses: actions-rs/audit-check@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}

  # mitigation for RUSTSEC-2021-0122:
  # flatbuffers' use of `unsafe` is problematic and a risk.
  # This job performs a round-trip over IPC (which uses flatbuffers) for some arrow types
  # under miri, which exercises much of the `flatbuffers` usage in this crate.
miri-checks:
name: RUSTSEC-2021-0122 mitigation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
          submodules: true # needed to test IPC; the test files are located in a submodule
- uses: actions-rs/toolchain@v1
with:
toolchain: nightly-2021-10-24
override: true
- uses: Swatinem/rust-cache@v1
- name: Install Miri
run: |
rustup component add miri
cargo miri setup
- name: Run
run: MIRIFLAGS="-Zmiri-disable-stacked-borrows -Zmiri-disable-isolation" cargo miri test --tests --features io_ipc,io_ipc_compression,io_json_integration io::ipc::write::file::write_100_nested
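
The same command the workflow runs can reproduce the mitigation locally, assuming the pinned nightly toolchain is installed:

```sh
rustup toolchain install nightly-2021-10-24
rustup component add miri --toolchain nightly-2021-10-24
cargo +nightly-2021-10-24 miri setup
MIRIFLAGS="-Zmiri-disable-stacked-borrows -Zmiri-disable-isolation" \
  cargo +nightly-2021-10-24 miri test --tests \
  --features io_ipc,io_ipc_compression,io_json_integration \
  io::ipc::write::file::write_100_nested
```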
89 changes: 76 additions & 13 deletions tests/it/io/ipc/write/file.rs
@@ -14,7 +14,7 @@ fn round_trip(batch: RecordBatch) -> Result<()> {
// write IPC version 5
let written_result = {
let options = WriteOptions {
compression: Some(Compression::ZSTD),
compression: Some(Compression::LZ4),
};
let mut writer = FileWriter::try_new(result, batch.schema(), options)?;
writer.write(&batch)?;
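
For context, the write half of `round_trip` uses the crate's IPC `FileWriter` as shown in this hunk. A minimal sketch assembled from the lines above; the trailing `finish` call is an assumption, since the rest of the function is elided here:

```rust
use arrow2::io::ipc::write::{Compression, FileWriter, WriteOptions};

// Serialize one batch into an IPC file, compressing buffers with LZ4.
let options = WriteOptions {
    compression: Some(Compression::LZ4),
};
let mut writer = FileWriter::try_new(result, batch.schema(), options)?;
writer.write(&batch)?;
writer.finish()?; // assumed: writes the IPC footer (elided from this hunk)
```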
@@ -85,39 +85,64 @@ fn test_file(version: &str, file_name: &str, compressed: bool) -> Result<()> {
#[test]
fn write_100_primitive() -> Result<()> {
test_file("1.0.0-littleendian", "generated_primitive", false)?;
test_file("1.0.0-bigendian", "generated_primitive", false)?;
test_file("1.0.0-bigendian", "generated_primitive", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_primitive() -> Result<()> {
test_file("1.0.0-littleendian", "generated_primitive", true)?;
test_file("1.0.0-bigendian", "generated_primitive", true)
}
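
The `#[cfg_attr(miri, ignore)]` pattern used throughout this file relies on miri compiling tests with `cfg(miri)` set, so the attribute expands to `#[ignore]` under `cargo miri test` and to nothing under a plain `cargo test`. A minimal sketch (the test name here is hypothetical):

```rust
#[test]
#[cfg_attr(miri, ignore)] // compression goes through FFI, which miri cannot execute
fn hypothetical_compressed_round_trip() -> Result<()> {
    test_file("1.0.0-littleendian", "generated_primitive", true)
}
```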

#[test]
fn write_100_datetime() -> Result<()> {
test_file("1.0.0-littleendian", "generated_datetime", false)?;
test_file("1.0.0-bigendian", "generated_datetime", false)?;
test_file("1.0.0-bigendian", "generated_datetime", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_datetime() -> Result<()> {
test_file("1.0.0-littleendian", "generated_datetime", true)?;
test_file("1.0.0-bigendian", "generated_datetime", true)
}

#[test]
fn write_100_dictionary_unsigned() -> Result<()> {
test_file("1.0.0-littleendian", "generated_dictionary_unsigned", false)?;
test_file("1.0.0-bigendian", "generated_dictionary_unsigned", false)?;
test_file("1.0.0-bigendian", "generated_dictionary_unsigned", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_dictionary_unsigned() -> Result<()> {
test_file("1.0.0-littleendian", "generated_dictionary_unsigned", true)?;
test_file("1.0.0-bigendian", "generated_dictionary_unsigned", true)
}

#[test]
fn write_100_dictionary() -> Result<()> {
test_file("1.0.0-littleendian", "generated_dictionary", false)?;
test_file("1.0.0-bigendian", "generated_dictionary", false)?;
test_file("1.0.0-bigendian", "generated_dictionary", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_dictionary() -> Result<()> {
test_file("1.0.0-littleendian", "generated_dictionary", true)?;
test_file("1.0.0-bigendian", "generated_dictionary", true)
}

#[test]
fn write_100_interval() -> Result<()> {
test_file("1.0.0-littleendian", "generated_interval", false)?;
test_file("1.0.0-bigendian", "generated_interval", false)?;
test_file("1.0.0-bigendian", "generated_interval", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_interval() -> Result<()> {
test_file("1.0.0-littleendian", "generated_interval", true)?;
test_file("1.0.0-bigendian", "generated_interval", true)
}
@@ -132,7 +157,12 @@ fn write_100_large_batch() -> Result<()> {
#[test]
fn write_100_nested() -> Result<()> {
test_file("1.0.0-littleendian", "generated_nested", false)?;
test_file("1.0.0-bigendian", "generated_nested", false)?;
test_file("1.0.0-bigendian", "generated_nested", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_nested() -> Result<()> {
test_file("1.0.0-littleendian", "generated_nested", true)?;
test_file("1.0.0-bigendian", "generated_nested", true)
}
@@ -144,23 +174,38 @@ fn write_100_nested_large_offsets() -> Result<()> {
"generated_nested_large_offsets",
false,
)?;
test_file("1.0.0-bigendian", "generated_nested_large_offsets", false)?;
test_file("1.0.0-bigendian", "generated_nested_large_offsets", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_nested_large_offsets() -> Result<()> {
test_file("1.0.0-littleendian", "generated_nested_large_offsets", true)?;
test_file("1.0.0-bigendian", "generated_nested_large_offsets", true)
}

#[test]
fn write_100_null_trivial() -> Result<()> {
test_file("1.0.0-littleendian", "generated_null_trivial", false)?;
test_file("1.0.0-bigendian", "generated_null_trivial", false)?;
test_file("1.0.0-bigendian", "generated_null_trivial", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_null_trivial() -> Result<()> {
test_file("1.0.0-littleendian", "generated_null_trivial", true)?;
test_file("1.0.0-bigendian", "generated_null_trivial", true)
}

#[test]
fn write_100_null() -> Result<()> {
test_file("1.0.0-littleendian", "generated_null", false)?;
test_file("1.0.0-bigendian", "generated_null", false)?;
test_file("1.0.0-bigendian", "generated_null", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_null() -> Result<()> {
test_file("1.0.0-littleendian", "generated_null", true)?;
test_file("1.0.0-bigendian", "generated_null", true)
}
@@ -176,7 +221,12 @@ fn write_100_primitive_large_offsets() -> Result<()> {
"1.0.0-bigendian",
"generated_primitive_large_offsets",
false,
)?;
)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_primitive_large_offsets() -> Result<()> {
test_file(
"1.0.0-littleendian",
"generated_primitive_large_offsets",
@@ -192,7 +242,12 @@ fn write_100_primitive_no_batches() -> Result<()> {
"generated_primitive_no_batches",
false,
)?;
test_file("1.0.0-bigendian", "generated_primitive_no_batches", false)?;
test_file("1.0.0-bigendian", "generated_primitive_no_batches", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_primitive_no_batches() -> Result<()> {
test_file("1.0.0-littleendian", "generated_primitive_no_batches", true)?;
test_file("1.0.0-bigendian", "generated_primitive_no_batches", true)
}
@@ -204,7 +259,12 @@ fn write_100_primitive_zerolength() -> Result<()> {
"generated_primitive_zerolength",
false,
)?;
test_file("1.0.0-bigendian", "generated_primitive_zerolength", false)?;
test_file("1.0.0-bigendian", "generated_primitive_zerolength", false)
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_100_compressed_primitive_zerolength() -> Result<()> {
test_file("1.0.0-littleendian", "generated_primitive_zerolength", true)?;
test_file("1.0.0-bigendian", "generated_primitive_zerolength", true)
}
@@ -262,6 +322,7 @@ fn write_generated_017_union() -> Result<()> {
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_boolean() -> Result<()> {
use std::sync::Arc;
let array = Arc::new(BooleanArray::from([
@@ -275,6 +336,7 @@ fn write_boolean() -> Result<()> {
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_sliced_utf8() -> Result<()> {
use std::sync::Arc;
let array = Arc::new(Utf8Array::<i32>::from_slice(["aa", "bb"]).slice(1, 1)) as Arc<dyn Array>;
@@ -283,6 +345,7 @@ fn write_sliced_utf8() -> Result<()> {
}

#[test]
#[cfg_attr(miri, ignore)] // compression uses FFI, which miri does not support
fn write_sliced_list() -> Result<()> {
let data = vec![
Some(vec![Some(1i32), Some(2), Some(3)]),
88 changes: 1 addition & 87 deletions tests/it/test_util.rs
@@ -1,89 +1,3 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Utils to make testing easier
use std::{env, error::Error, path::PathBuf};

/// Returns the arrow test data directory, which is by default stored
/// in a git submodule rooted at `arrow/testing/data`.
///
/// The default can be overridden by the optional environment
/// variable `ARROW_TEST_DATA`
///
/// panics when the directory can not be found.
///
/// Example:
/// ```
/// let testdata = arrow::util::test_util::arrow_test_data();
/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
/// assert!(std::path::PathBuf::from(csvdata).exists());
/// ```
pub fn arrow_test_data() -> String {
match get_data_dir("ARROW_TEST_DATA", "testing/arrow-testing/data") {
Ok(pb) => pb.display().to_string(),
Err(err) => panic!("failed to get arrow data dir: {}", err),
}
}

/// Returns a directory path for finding test data.
///
/// udf_env: name of an environment variable
///
/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR)
///
/// Returns either:
/// The path referred to in `udf_env` if that variable is set and refers to a directory
/// The submodule_data directory relative to CARGO_MANIFEST_PATH
fn get_data_dir(udf_env: &str, submodule_data: &str) -> Result<PathBuf, Box<dyn Error>> {
// Try user defined env.
if let Ok(dir) = env::var(udf_env) {
let trimmed = dir.trim().to_string();
if !trimmed.is_empty() {
let pb = PathBuf::from(trimmed);
if pb.is_dir() {
return Ok(pb);
} else {
return Err(format!(
"the data dir `{}` defined by env {} not found",
pb.display().to_string(),
udf_env
)
.into());
}
}
}

// The env is undefined or its value is trimmed to empty, let's try default dir.

// env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package",
// set by `cargo run` or `cargo test`, see:
// https://doc.rust-lang.org/cargo/reference/environment-variables.html
let dir = env!("CARGO_MANIFEST_DIR");

let pb = PathBuf::from(dir).join(submodule_data);
if pb.is_dir() {
Ok(pb)
} else {
Err(format!(
"env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\
HINT: try running `git submodule update --init`",
udf_env,
pb.display().to_string(),
).into())
}
"testing/arrow-testing/data".to_string()
}
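
After this change, the helper no longer consults `ARROW_TEST_DATA` or verifies that the directory exists; reassembled from the retained lines above, the entire file reduces to:

```rust
pub fn arrow_test_data() -> String {
    "testing/arrow-testing/data".to_string()
}
```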