From 5e100f67b2c2e8ba0fa4481212365554c0208ecc Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 24 Oct 2023 16:04:02 +0200 Subject: [PATCH] PVF worker: Add seccomp restrictions (block networking) We're already working on sandboxing by [blocking all unneeded syscalls](https://github.com/paritytech/polkadot-sdk/issues/882). However, due to the wide scope it will take a while longer. This PR starts with a much smaller scope, only blocking network-related syscalls until the above is ready. For security we block the following with `seccomp`: - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without affecting consensus. - `io_uring` - as discussed [here](https://github.com/paritytech/polkadot/pull/7334#discussion_r1281611134), io_uring allows for networking and needs to be blocked. See below for a discussion on the safety of doing this. - `connect`ing to sockets - the above two points are enough for networking and is what birdcage does (or [used to do](https://github.com/phylum-dev/birdcage/pull/47)) to restrict networking. However, it is possible to [connect to abstract unix sockets](https://lore.kernel.org/landlock/20231023.ahphah4Wii4v@digikod.net/T/#u) to do some kinds of sandbox escapes, so we also block the `connect` syscall. (Intentionally left out of implementer's guide because it felt like too much detail.) `io_uring` is just a way of issuing system calls in an async manner, and there is nothing stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally, not many applications use `io_uring` in production yet, because of the numerous kernel CVEs discovered. It's still under a lot of development. Android outright banned `io_uring` for these reasons. Considering `io_uring`'s status, and that it very likely would get detected either by our [recently-added static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by testing, I think it is fairly safe to block it. If execution hits an edge case code path unique to a given machine, it's already taken a non-deterministic branch anyway. After all, we just care that the majority of validators reach the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can always admit fault and refund the wrong validator. On the other hand, if all validators take the code path that results in a seccomp violation, then they would all vote against the current candidate, which is also fine. The violation would get logged (in big scary letters) and hopefully some validator reports it to us. Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is no consensus. But so many things would have to go wrong for that to happen: 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for IO-heavy applications) 2. The new syscall is not detected by our static analysis 3. It is never triggered in any of our tests 4. It then gets triggered on some super edge case in production on 50% of validators causing a stall (bad but very unlikely) 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad?) Considering how many things would have to go wrong here, we believe it's safe to block `io_uring`. Closes https://github.com/paritytech/polkadot-sdk/issues/619 Original PR in Polkadot repo: https://github.com/paritytech/polkadot/pull/7334 --- Cargo.lock | 11 + .../node/core/candidate-validation/src/lib.rs | 3 +- polkadot/node/core/pvf/common/Cargo.toml | 2 + polkadot/node/core/pvf/common/src/lib.rs | 2 + .../node/core/pvf/common/src/worker/mod.rs | 30 +- .../core/pvf/common/src/worker/security.rs | 512 ------------------ .../common/src/worker/security/landlock.rs | 327 +++++++++++ .../pvf/common/src/worker/security/mod.rs | 206 +++++++ .../pvf/common/src/worker/security/seccomp.rs | 197 +++++++ polkadot/node/core/pvf/src/host.rs | 79 ++- polkadot/node/core/pvf/src/worker_intf.rs | 3 + polkadot/node/core/pvf/tests/it/adder.rs | 17 +- polkadot/node/core/pvf/tests/it/main.rs | 20 +- .../src/node/utility/pvf-host-and-workers.md | 11 + 14 files changed, 875 insertions(+), 545 deletions(-) delete mode 100644 polkadot/node/core/pvf/common/src/worker/security.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/landlock.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/mod.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/seccomp.rs diff --git a/Cargo.lock b/Cargo.lock index a8d679c6ce8b..e57e0265457c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12251,11 +12251,13 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", + "seccompiler", "sp-core", "sp-externalities", "sp-io", "sp-tracing", "tempfile", + "thiserror", "tokio", "tracing-gum", ] @@ -16100,6 +16102,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "seccompiler" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f6575e3c2b3a0fe2ef3e53855b6a8dead7c29f783da5e123d378c8c6a89017e" +dependencies = [ + "libc", +] + [[package]] name = "secp256k1" version = "0.24.3" diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 21a7121d47bd..93db7d11cee8 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -149,7 +149,8 @@ async fn run( exec_worker_path, ), pvf_metrics, - ); + ) + .await; ctx.spawn_blocking("pvf-validation-host", task.boxed())?; loop { diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 4bdacca72f42..e4ca20ffdb29 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -30,6 +30,8 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.3.0" +seccompiler = "0.3.0" +thiserror = "1.0.31" [dev-dependencies] assert_matches = "1.4.0" diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 53c287ea9709..813dccadf055 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -50,6 +50,8 @@ pub mod tests { pub struct SecurityStatus { /// Whether the landlock features we use are fully available on this system. pub can_enable_landlock: bool, + /// Whether the seccomp features we use are fully available on this system. + pub can_enable_seccomp: bool, // Whether we are able to unshare the user namespace and change the filesystem root. pub can_unshare_user_namespace_and_change_root: bool, } diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index d0bd5b6bd7c5..d1e17bac24ff 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -75,6 +75,13 @@ macro_rules! decl_worker_main { let status = -1; std::process::exit(status) }, + "--check-can-enable-seccomp" => { + #[cfg(target_os = "linux")] + let status = if security::seccomp::check_is_fully_enabled() { 0 } else { -1 }; + #[cfg(not(target_os = "linux"))] + let status = -1; + std::process::exit(status) + }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] let status = if let Err(err) = security::unshare_user_namespace_and_change_root( @@ -119,6 +126,7 @@ macro_rules! decl_worker_main { let mut worker_dir_path = None; let mut node_version = None; let mut can_enable_landlock = false; + let mut can_enable_seccomp = false; let mut can_unshare_user_namespace_and_change_root = false; let mut i = 2; @@ -137,6 +145,7 @@ macro_rules! decl_worker_main { i += 1 }, "--can-enable-landlock" => can_enable_landlock = true, + "--can-enable-seccomp" => can_enable_seccomp = true, "--can-unshare-user-namespace-and-change-root" => can_unshare_user_namespace_and_change_root = true, arg => panic!("Unexpected argument found: {}", arg), @@ -151,6 +160,7 @@ macro_rules! decl_worker_main { let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned(); let security_status = $crate::SecurityStatus { can_enable_landlock, + can_enable_seccomp, can_unshare_user_namespace_and_change_root, }; @@ -307,6 +317,22 @@ pub fn worker_event_loop( let landlock_status = security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { + // We previously were able to enable, so this should never happen. + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", + landlock_status + ); + } + } + + #[cfg(target_os = "linux")] + if security_status.can_enable_seccomp { + let seccomp_status = + security::seccomp::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); + if !matches!(seccomp_status, Ok(())) { // We previously were able to enable, so this should never happen. // // TODO: Make this a real error in secure-mode. See: @@ -315,8 +341,8 @@ pub fn worker_event_loop( target: LOG_TARGET, %worker_kind, %worker_pid, - "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", - landlock_status + "could not fully enable seccomp: {:?}. This should not happen, please report to the Polkadot devs", + seccomp_status ); } } diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs deleted file mode 100644 index 1b7614177448..000000000000 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ /dev/null @@ -1,512 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! Functionality for securing workers. -//! -//! This is needed because workers are used to compile and execute untrusted code (PVFs). -//! -//! We currently employ the following security measures: -//! -//! - Restrict filesystem -//! - Use Landlock to remove all unnecessary FS access rights. -//! - Unshare the user and mount namespaces. -//! - Change the root directory to a worker-specific temporary directory. -//! - Remove env vars - -use crate::{worker::WorkerKind, LOG_TARGET}; - -/// Unshare the user namespace and change root to be the artifact directory. -/// -/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: -/// "CLONE_NEWUSER requires that the calling process is not threaded." -#[cfg(target_os = "linux")] -pub fn unshare_user_namespace_and_change_root( - worker_kind: WorkerKind, - worker_pid: u32, - worker_dir_path: &std::path::Path, -) -> Result<(), String> { - use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; - - // The following was copied from the `cstr_core` crate. - // - // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 - #[inline] - #[doc(hidden)] - const fn cstr_is_valid(bytes: &[u8]) -> bool { - if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { - return false - } - - let mut index = 0; - while index < bytes.len() - 1 { - if bytes[index] == 0 { - return false - } - index += 1; - } - true - } - - macro_rules! cstr { - ($e:expr) => {{ - const STR: &[u8] = concat!($e, "\0").as_bytes(); - const STR_VALID: bool = cstr_is_valid(STR); - let _ = [(); 0 - (!(STR_VALID) as usize)]; - #[allow(unused_unsafe)] - unsafe { - core::ffi::CStr::from_bytes_with_nul_unchecked(STR) - } - }} - } - - gum::debug!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?worker_dir_path, - "unsharing the user namespace and calling pivot_root", - ); - - let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) - .expect("on unix; the path will never contain 0 bytes; qed"); - - // Wrapper around all the work to prevent repetitive error handling. - // - // # Errors - // - // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does - // not give the context of which call failed, so we return a &str error. - || -> Result<(), &'static str> { - // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps - // (2) and (3) are adapted from the example in pivot_root(2), with the additional - // change described in the `pivot_root(".", ".")` section. - unsafe { - // 1. `unshare` the user and the mount namespaces. - if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { - return Err("unshare user and mount namespaces") - } - - // 2. Setup mounts. - // - // Ensure that new root and its parent mount don't have shared propagation (which would - // cause pivot_root() to return an error), and prevent propagation of mount events to - // the initial mount namespace. - if libc::mount( - ptr::null(), - cstr!("/").as_ptr(), - ptr::null(), - libc::MS_REC | libc::MS_PRIVATE, - ptr::null(), - ) < 0 - { - return Err("mount MS_PRIVATE") - } - // Ensure that the new root is a mount point. - let additional_flags = - if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { - libc::MS_RDONLY - } else { - 0 - }; - if libc::mount( - worker_dir_path_c.as_ptr(), - worker_dir_path_c.as_ptr(), - ptr::null(), // ignored when MS_BIND is used - libc::MS_BIND | - libc::MS_REC | libc::MS_NOEXEC | - libc::MS_NODEV | libc::MS_NOSUID | - libc::MS_NOATIME | additional_flags, - ptr::null(), // ignored when MS_BIND is used - ) < 0 - { - return Err("mount MS_BIND") - } - - // 3. `pivot_root` to the artifact directory. - if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { - return Err("chdir to worker dir path") - } - if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { - return Err("pivot_root") - } - if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { - return Err("umount the old root mount point") - } - } - - Ok(()) - }() - .map_err(|err_ctx| { - let err = std::io::Error::last_os_error(); - format!("{}: {}", err_ctx, err) - })?; - - // Do some assertions. - if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { - return Err("expected current dir after pivot_root to be `/`".into()) - } - env::set_current_dir("..").map_err(|err| err.to_string())?; - if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { - return Err("expected not to be able to break out of new root by doing `..`".into()) - } - - Ok(()) -} - -/// Require env vars to have been removed when spawning the process, to prevent malicious code from -/// accessing them. -pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { - let mut ok = true; - - for (key, value) in std::env::vars_os() { - // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of - // randomness for malicious code. In the future we can remove it also and log in the host; - // see . - if key == "RUST_LOG" { - continue - } - // An exception for MacOS. This is not a secure platform anyway, so we let it slide. - #[cfg(target_os = "macos")] - if key == "__CF_USER_TEXT_ENCODING" { - continue - } - - gum::error!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?key, - ?value, - "env var was present that should have been removed", - ); - - ok = false; - } - - ok -} - -/// The [landlock] docs say it best: -/// -/// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict -/// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security -/// sandboxes as new security layers in addition to the existing system-wide access-controls. This -/// kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or -/// malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, -/// to securely restrict themselves." -/// -/// [landlock]: https://docs.rs/landlock/latest/landlock/index.html -#[cfg(target_os = "linux")] -pub mod landlock { - pub use landlock::RulesetStatus; - - use crate::{worker::WorkerKind, LOG_TARGET}; - use landlock::*; - use std::{ - fmt, - path::{Path, PathBuf}, - }; - - /// Landlock ABI version. We use ABI V1 because: - /// - /// 1. It is supported by our reference kernel version. - /// 2. Later versions do not (yet) provide additional security that would benefit us. - /// - /// # Versions (as of October 2023) - /// - /// - Polkadot reference kernel version: 5.16+ - /// - /// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads. - /// - /// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During - /// execution an attacker can only affect the name of a symlinked artifact and not the - /// original one. - /// - /// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can - /// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we - /// plan to check for file integrity anyway; see - /// . - /// - /// # Determinism - /// - /// You may wonder whether we could always use the latest ABI instead of only the ABI supported - /// by the reference kernel version. It seems plausible, since landlock provides a best-effort - /// approach to enabling sandboxing. For example, if the reference version only supported V1 and - /// we were on V2, then landlock would use V2 if it was supported on the current machine, and - /// just fall back to V1 if not. - /// - /// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, - /// they may have different semantics on some PVFs. So a malicious PVF now has a new attack - /// vector: they can exploit this indeterminism between landlock ABIs! - /// - /// On the other hand we do want validators to be as secure as possible and protect their keys - /// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy - /// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version - /// supports it or if it introduces some new feature that is beneficial to security. - pub const LANDLOCK_ABI: ABI = ABI::V1; - - #[derive(Debug)] - pub enum TryRestrictError { - InvalidExceptionPath(PathBuf), - RulesetError(RulesetError), - } - - impl From for TryRestrictError { - fn from(err: RulesetError) -> Self { - Self::RulesetError(err) - } - } - - impl fmt::Display for TryRestrictError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path), - Self::RulesetError(err) => write!(f, "ruleset error: {}", err.to_string()), - } - } - } - - impl std::error::Error for TryRestrictError {} - - /// Try to enable landlock for the given kind of worker. - pub fn enable_for_worker( - worker_kind: WorkerKind, - worker_pid: u32, - worker_dir_path: &Path, - ) -> Result> { - let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { - WorkerKind::Prepare => { - vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] - }, - WorkerKind::Execute => { - vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] - }, - WorkerKind::CheckPivotRoot => - panic!("this should only be passed for checking pivot_root; qed"), - }; - - gum::debug!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?worker_dir_path, - "enabling landlock with exceptions: {:?}", - exceptions, - ); - - Ok(try_restrict(exceptions)?) - } - - // TODO: - /// Runs a check for landlock and returns a single bool indicating whether the given landlock - /// ABI is fully enabled on the current Linux environment. - pub fn check_is_fully_enabled() -> bool { - let status_from_thread: Result> = - match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())) - .join() - { - Ok(Ok(status)) => Ok(status), - Ok(Err(ruleset_err)) => Err(ruleset_err.into()), - Err(_err) => Err("a panic occurred in try_restrict".into()), - }; - - matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) - } - - /// Tries to restrict the current thread (should only be called in a process' main thread) with - /// the following landlock access controls: - /// - /// 1. all global filesystem access restricted, with optional exceptions - /// 2. ... more sandbox types (e.g. networking) may be supported in the future. - /// - /// If landlock is not supported in the current environment this is simply a noop. - /// - /// # Returns - /// - /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). - fn try_restrict(fs_exceptions: I) -> Result - where - I: IntoIterator, - P: AsRef, - A: Into>, - { - let mut ruleset = - Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; - for (fs_path, access_bits) in fs_exceptions { - let paths = &[fs_path.as_ref().to_owned()]; - let mut rules = path_beneath_rules(paths, access_bits).peekable(); - if rules.peek().is_none() { - // `path_beneath_rules` silently ignores missing paths, so check for it manually. - return Err(TryRestrictError::InvalidExceptionPath(fs_path.as_ref().to_owned())) - } - ruleset = ruleset.add_rules(rules)?; - } - let status = ruleset.restrict_self()?; - Ok(status.ruleset) - } - - #[cfg(test)] - mod tests { - use super::*; - use std::{fs, io::ErrorKind, thread}; - - #[test] - fn restricted_thread_cannot_read_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread cannot read from FS. - let handle = - thread::spawn(|| { - // Create, write, and read two tmp files. This should succeed before any - // landlock restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); - let path1 = tmpfile1.path(); - let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); - let path2 = tmpfile2.path(); - - fs::write(path1, TEXT).unwrap(); - let s = fs::read_to_string(path1).unwrap(); - assert_eq!(s, TEXT); - fs::write(path2, TEXT).unwrap(); - let s = fs::read_to_string(path2).unwrap(); - assert_eq!(s, TEXT); - - // Apply Landlock with a read exception for only one of the files. - let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read from both files, only tmpfile1 should succeed. - let result = fs::read_to_string(path1); - assert!(matches!( - result, - Ok(s) if s == TEXT - )); - let result = fs::read_to_string(path2); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - - // Apply Landlock for all files. - let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read from tmpfile1 after landlock, it should fail. - let result = fs::read_to_string(path1); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } - - #[test] - fn restricted_thread_cannot_write_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread cannot write to FS. - let handle = - thread::spawn(|| { - // Create and write two tmp files. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); - let path1 = tmpfile1.path(); - let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); - let path2 = tmpfile2.path(); - - fs::write(path1, TEXT).unwrap(); - fs::write(path2, TEXT).unwrap(); - - // Apply Landlock with a write exception for only one of the files. - let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to write to both files, only tmpfile1 should succeed. - let result = fs::write(path1, TEXT); - assert!(matches!(result, Ok(_))); - let result = fs::write(path2, TEXT); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - - // Apply Landlock for all files. - let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to write to tmpfile1 after landlock, it should fail. - let result = fs::write(path1, TEXT); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } - - // Test that checks whether landlock under our ABI version is able to truncate files. - #[test] - fn restricted_thread_can_truncate_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread can truncate file. - let handle = - thread::spawn(|| { - // Create and write a file. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile = tempfile::NamedTempFile::new().unwrap(); - let path = tmpfile.path(); - - fs::write(path, TEXT).unwrap(); - - // Apply Landlock with all exceptions under the current ABI. - let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to truncate the file. - let result = tmpfile.as_file().set_len(0); - assert!(result.is_ok()); - }); - - assert!(handle.join().is_ok()); - } - } -} diff --git a/polkadot/node/core/pvf/common/src/worker/security/landlock.rs b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs new file mode 100644 index 000000000000..5444b1933b23 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs @@ -0,0 +1,327 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! The [landlock] docs say it best: +//! +//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict +//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security +//! sandboxes as new security layers in addition to the existing system-wide access-controls. This +//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or +//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, +//! to securely restrict themselves." +//! +//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html + +pub use landlock::RulesetStatus; + +use crate::{ + worker::{stringify_panic_payload, WorkerKind}, + LOG_TARGET, +}; +use landlock::*; +use std::{ + path::{Path, PathBuf}, +}; + +/// Landlock ABI version. We use ABI V1 because: +/// +/// 1. It is supported by our reference kernel version. +/// 2. Later versions do not (yet) provide additional security that would benefit us. +/// +/// # Versions (as of October 2023) +/// +/// - Polkadot reference kernel version: 5.16+ +/// +/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads. +/// +/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During +/// execution an attacker can only affect the name of a symlinked artifact and not the original +/// one. +/// +/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can +/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we +/// plan to check for file integrity anyway; see +/// . +/// +/// # Determinism +/// +/// You may wonder whether we could always use the latest ABI instead of only the ABI supported +/// by the reference kernel version. It seems plausible, since landlock provides a best-effort +/// approach to enabling sandboxing. For example, if the reference version only supported V1 and +/// we were on V2, then landlock would use V2 if it was supported on the current machine, and +/// just fall back to V1 if not. +/// +/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, +/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack +/// vector: they can exploit this indeterminism between landlock ABIs! +/// +/// On the other hand we do want validators to be as secure as possible and protect their keys +/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy +/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version +/// supports it or if it introduces some new feature that is beneficial to security. +pub const LANDLOCK_ABI: ABI = ABI::V1; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Invalid exception path: {0:?}")] + InvalidExceptionPath(PathBuf), + #[error(transparent)] + RulesetError(#[from] RulesetError), + #[error("A panic occurred in try_restrict: {0}")] + Panic(String), +} + +pub type Result = std::result::Result; + +/// Try to enable landlock for the given kind of worker. +pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &Path, +) -> Result { + let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { + WorkerKind::Prepare => { + vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] + }, + WorkerKind::Execute => { + vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] + }, + WorkerKind::CheckPivotRoot => + panic!("this should only be passed for checking pivot_root; qed"), + }; + + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling landlock with exceptions: {:?}", + exceptions, + ); + + try_restrict(exceptions) +} + +// TODO: +/// Runs a check for landlock and returns a single bool indicating whether the given landlock +/// ABI is fully enabled on the current Linux environment. +pub fn check_is_fully_enabled() -> bool { + let status_from_thread: Result = + match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() + { + Ok(Ok(status)) => Ok(status), + Ok(Err(ruleset_err)) => Err(ruleset_err.into()), + Err(err) => Err(Error::Panic(stringify_panic_payload(err))), + }; + + matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) +} + +/// Tries to restrict the current thread (should only be called in a process' main thread) with +/// the following landlock access controls: +/// +/// 1. all global filesystem access restricted, with optional exceptions +/// 2. ... more sandbox types (e.g. networking) may be supported in the future. +/// +/// If landlock is not supported in the current environment this is simply a noop. +/// +/// # Returns +/// +/// The status of the restriction (whether it was fully, partially, or not-at-all enforced). +fn try_restrict(fs_exceptions: I) -> Result +where + I: IntoIterator, + P: AsRef, + A: Into>, +{ + let mut ruleset = + Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; + for (fs_path, access_bits) in fs_exceptions { + let paths = &[fs_path.as_ref().to_owned()]; + let mut rules = path_beneath_rules(paths, access_bits).peekable(); + if rules.peek().is_none() { + // `path_beneath_rules` silently ignores missing paths, so check for it manually. + return Err(Error::InvalidExceptionPath(fs_path.as_ref().to_owned())) + } + ruleset = ruleset.add_rules(rules)?; + } + let status = ruleset.restrict_self()?; + Ok(status.ruleset) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{fs, io::ErrorKind, thread}; + + #[test] + fn restricted_thread_cannot_read_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread cannot read from FS. + let handle = thread::spawn(|| { + // Create, write, and read two tmp files. This should succeed before any + // landlock restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + let s = fs::read_to_string(path1).unwrap(); + assert_eq!(s, TEXT); + fs::write(path2, TEXT).unwrap(); + let s = fs::read_to_string(path2).unwrap(); + assert_eq!(s, TEXT); + + // Apply Landlock with a read exception for only one of the files. + let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to read from both files, only tmpfile1 should succeed. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Ok(s) if s == TEXT + )); + let result = fs::read_to_string(path2); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to read from tmpfile1 after landlock, it should fail. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + #[test] + fn restricted_thread_cannot_write_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread cannot write to FS. + let handle = thread::spawn(|| { + // Create and write two tmp files. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + fs::write(path2, TEXT).unwrap(); + + // Apply Landlock with a write exception for only one of the files. + let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to write to both files, only tmpfile1 should succeed. + let result = fs::write(path1, TEXT); + assert!(matches!(result, Ok(_))); + let result = fs::write(path2, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to write to tmpfile1 after landlock, it should fail. + let result = fs::write(path1, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + // Test that checks whether landlock under our ABI version is able to truncate files. + #[test] + fn restricted_thread_can_truncate_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread can truncate file. + let handle = thread::spawn(|| { + // Create and write a file. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let path = tmpfile.path(); + + fs::write(path, TEXT).unwrap(); + + // Apply Landlock with all exceptions under the current ABI. + let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to truncate the file. + let result = tmpfile.as_file().set_len(0); + assert!(result.is_ok()); + }); + + assert!(handle.join().is_ok()); + } +} diff --git a/polkadot/node/core/pvf/common/src/worker/security/mod.rs b/polkadot/node/core/pvf/common/src/worker/security/mod.rs new file mode 100644 index 000000000000..0f74df148707 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/mod.rs @@ -0,0 +1,206 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for securing workers. +//! +//! This is needed because workers are used to compile and execute untrusted code (PVFs). +//! +//! We currently employ the following security measures: +//! +//! - Restrict filesystem +//! - Use Landlock to remove all unnecessary FS access rights. +//! - Unshare the user and mount namespaces. +//! - Change the root directory to a worker-specific temporary directory. +//! - Remove env vars + +use crate::{worker::WorkerKind, LOG_TARGET}; + +#[cfg(target_os = "linux")] +pub mod landlock; + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub mod seccomp; + +/// Unshare the user namespace and change root to be the artifact directory. +/// +/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: +/// "CLONE_NEWUSER requires that the calling process is not threaded." +#[cfg(target_os = "linux")] +pub fn unshare_user_namespace_and_change_root( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &std::path::Path, +) -> Result<(), String> { + use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; + + // The following was copied from the `cstr_core` crate. + // + // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 + #[inline] + #[doc(hidden)] + const fn cstr_is_valid(bytes: &[u8]) -> bool { + if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { + return false + } + + let mut index = 0; + while index < bytes.len() - 1 { + if bytes[index] == 0 { + return false + } + index += 1; + } + true + } + + macro_rules! cstr { + ($e:expr) => {{ + const STR: &[u8] = concat!($e, "\0").as_bytes(); + const STR_VALID: bool = cstr_is_valid(STR); + let _ = [(); 0 - (!(STR_VALID) as usize)]; + #[allow(unused_unsafe)] + unsafe { + core::ffi::CStr::from_bytes_with_nul_unchecked(STR) + } + }} + } + + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "unsharing the user namespace and calling pivot_root", + ); + + let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) + .expect("on unix; the path will never contain 0 bytes; qed"); + + // Wrapper around all the work to prevent repetitive error handling. + // + // # Errors + // + // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does + // not give the context of which call failed, so we return a &str error. + || -> Result<(), &'static str> { + // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps + // (2) and (3) are adapted from the example in pivot_root(2), with the additional + // change described in the `pivot_root(".", ".")` section. + unsafe { + // 1. `unshare` the user and the mount namespaces. + if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { + return Err("unshare user and mount namespaces") + } + + // 2. Setup mounts. + // + // Ensure that new root and its parent mount don't have shared propagation (which would + // cause pivot_root() to return an error), and prevent propagation of mount events to + // the initial mount namespace. + if libc::mount( + ptr::null(), + cstr!("/").as_ptr(), + ptr::null(), + libc::MS_REC | libc::MS_PRIVATE, + ptr::null(), + ) < 0 + { + return Err("mount MS_PRIVATE") + } + // Ensure that the new root is a mount point. + let additional_flags = + if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { + libc::MS_RDONLY + } else { + 0 + }; + if libc::mount( + worker_dir_path_c.as_ptr(), + worker_dir_path_c.as_ptr(), + ptr::null(), // ignored when MS_BIND is used + libc::MS_BIND | + libc::MS_REC | libc::MS_NOEXEC | + libc::MS_NODEV | libc::MS_NOSUID | + libc::MS_NOATIME | additional_flags, + ptr::null(), // ignored when MS_BIND is used + ) < 0 + { + return Err("mount MS_BIND") + } + + // 3. `pivot_root` to the artifact directory. + if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { + return Err("chdir to worker dir path") + } + if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { + return Err("pivot_root") + } + if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { + return Err("umount the old root mount point") + } + } + + Ok(()) + }() + .map_err(|err_ctx| { + let err = std::io::Error::last_os_error(); + format!("{}: {}", err_ctx, err) + })?; + + // Do some assertions. + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected current dir after pivot_root to be `/`".into()) + } + env::set_current_dir("..").map_err(|err| err.to_string())?; + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected not to be able to break out of new root by doing `..`".into()) + } + + Ok(()) +} + +/// Require env vars to have been removed when spawning the process, to prevent malicious code from +/// accessing them. +pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { + let mut ok = true; + + for (key, value) in std::env::vars_os() { + // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of + // randomness for malicious code. In the future we can remove it also and log in the host; + // see . + if key == "RUST_LOG" { + continue + } + // An exception for MacOS. This is not a secure platform anyway, so we let it slide. + #[cfg(target_os = "macos")] + if key == "__CF_USER_TEXT_ENCODING" { + continue + } + + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?key, + ?value, + "env var was present that should have been removed", + ); + + ok = false; + } + + ok +} diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs new file mode 100644 index 000000000000..f3c312cc4116 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -0,0 +1,197 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for sandboxing workers by restricting their capabilities by blocking certain +//! syscalls with seccomp. +//! +//! For security we block the following: +//! +//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without +//! affecting consensus. +//! +//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the +//! safety of doing this. +//! +//! - `connect`ing to sockets - the above two points are enough for networking. However, it is +//! possible to [connect to abstract unix +//! sockets](https://lore.kernel.org/landlock/20231023.ahphah4Wii4v@digikod.net/T/#u) to do some +//! kinds of sandbox escapes, so we also block the `connect` syscall. +//! +//! # Safety of blocking io_uring +//! +//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing +//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally, +//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs +//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these +//! reasons. +//! +//! Considering `io_uring`'s status discussed above, and that it very likely would get detected +//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by +//! testing, we think it is safe to block it. +//! +//! ## Consensus analysis +//! +//! If execution hits an edge case code path unique to a given machine, it's already taken a +//! non-deterministic branch anyway. After all, we just care that the majority of validators reach +//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can +//! always admit fault and refund the wrong validator. On the other hand, if all validators take the +//! code path that results in a seccomp violation, then they would all vote against the current +//! candidate, which is also fine. The violation would get logged (in big scary letters) and +//! hopefully some validator reports it to us. +//! +//! Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is +//! no consensus. But so many things would have to go wrong for that to happen: +//! +//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for +//! IO-heavy applications) +//! +//! 2. The new syscall is not detected by our static analysis +//! +//! 3. It is never triggered in any of our tests +//! +//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a +//! stall (bad but very unlikely) +//! +//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad) +//! +//! Considering how many things would have to go wrong here, we believe it's safe to block +//! `io_uring`. +//! +//! # Action on caught syscall +//! +//! TODO + +use crate::{ + worker::{stringify_panic_payload, WorkerKind}, + LOG_TARGET, +}; +use seccompiler::*; +use std::{collections::BTreeMap, path::Path}; + +/// The action to take on caught syscalls. +#[cfg(not(test))] +const CAUGHT_ACTION: SeccompAction = SeccompAction::KillProcess; +/// Don't kill the process when testing. +#[cfg(test)] +const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32); + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error(transparent)] + Seccomp(#[from] seccompiler::Error), + #[error(transparent)] + Backend(#[from] seccompiler::BackendError), + #[error("A panic occurred in try_restrict: {0}")] + Panic(String), +} + +pub type Result = std::result::Result; + +/// Try to enable seccomp for the given kind of worker. +pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &Path, +) -> Result<()> { + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling seccomp", + ); + + try_restrict() +} + +/// Runs a check for seccomp and returns a single bool indicating whether seccomp with our rules is +/// fully enabled on the current Linux environment. +pub fn check_is_fully_enabled() -> bool { + let status_from_thread: Result<()> = match std::thread::spawn(|| try_restrict()).join() { + Ok(Ok(())) => Ok(()), + Ok(Err(err)) => Err(err.into()), + Err(err) => Err(Error::Panic(stringify_panic_payload(err))), + }; + + matches!(status_from_thread, Ok(())) +} + +/// Applies a `seccomp` filter to disable networking for the PVF threads. +pub fn try_restrict() -> Result<()> { + // Build a `seccomp` filter which by default allows all syscalls except those blocked in the + // blacklist. + let mut blacklisted_rules = BTreeMap::default(); + + // Restrict the creation of sockets. + blacklisted_rules.insert(libc::SYS_socketpair, vec![]); + blacklisted_rules.insert(libc::SYS_socket, vec![]); + + // Restrict io_uring. + blacklisted_rules.insert(libc::SYS_io_uring_setup, vec![]); + blacklisted_rules.insert(libc::SYS_io_uring_enter, vec![]); + blacklisted_rules.insert(libc::SYS_io_uring_register, vec![]); + + // Restrict connecting to abstract unix sockets. + blacklisted_rules.insert(libc::SYS_connect, vec![]); + + let filter = SeccompFilter::new( + blacklisted_rules, + // Mismatch action: what to do if not in rule list. + SeccompAction::Allow, + // Match action: what to do if in rule list. + CAUGHT_ACTION, + TargetArch::x86_64, + )?; + + let bpf_prog: BpfProgram = filter.try_into()?; + + // Applies filter (runs seccomp) to the calling thread. + seccompiler::apply_filter(&bpf_prog)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{io::ErrorKind, net::TcpListener, thread}; + + #[test] + fn sandboxed_thread_cannot_use_sockets() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + let handle = thread::spawn(|| { + // Open a socket, this should succeed before seccomp is applied. + TcpListener::bind("127.0.0.1:0").unwrap(); + + let status = try_restrict(); + if !matches!(status, Ok(())) { + panic!("Ruleset should be enforced since we checked if seccomp is enabled"); + } + + // Try to open a socket after seccomp. + assert!(matches!( + TcpListener::bind("127.0.0.1:0"), + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } +} diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 81695829122b..e3fcc294298e 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -29,7 +29,7 @@ use crate::{ use always_assert::never; use futures::{ channel::{mpsc, oneshot}, - Future, FutureExt, SinkExt, StreamExt, + join, Future, FutureExt, SinkExt, StreamExt, }; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, @@ -200,15 +200,21 @@ impl Config { /// The future should not return normally but if it does then that indicates an unrecoverable error. /// In that case all pending requests will be canceled, dropping the result senders and new ones /// will be rejected. -pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { +pub async fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host"); // Run checks for supported security features once per host startup. Warn here if not enabled. let security_status = { - let can_enable_landlock = check_landlock(&config.prepare_worker_program_path); - let can_unshare_user_namespace_and_change_root = - check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path); - SecurityStatus { can_enable_landlock, can_unshare_user_namespace_and_change_root } + let (can_enable_landlock, can_enable_seccomp, can_unshare_user_namespace_and_change_root) = join!( + check_landlock(&config.prepare_worker_program_path), + check_seccomp(&config.prepare_worker_program_path), + check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path) + ); + SecurityStatus { + can_enable_landlock, + can_enable_seccomp, + can_unshare_user_namespace_and_change_root, + } }; let (to_host_tx, to_host_rx) = mpsc::channel(10); @@ -887,17 +893,17 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream /// We do this check by spawning a new process and trying to sandbox it. To get as close as possible /// to running the check in a worker, we try it... in a worker. The expected return status is 0 on /// success and -1 on failure. -fn check_can_unshare_user_namespace_and_change_root( +async fn check_can_unshare_user_namespace_and_change_root( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, ) -> bool { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { - let output = std::process::Command::new(prepare_worker_program_path) + match tokio::process::Command::new(prepare_worker_program_path) .arg("--check-can-unshare-user-namespace-and-change-root") - .output(); - - match output { + .output() + .await + { Ok(output) if output.status.success() => true, Ok(output) => { let stderr = std::str::from_utf8(&output.stderr) @@ -938,15 +944,16 @@ fn check_can_unshare_user_namespace_and_change_root( /// We do this check by spawning a new process and trying to sandbox it. To get as close as possible /// to running the check in a worker, we try it... in a worker. The expected return status is 0 on /// success and -1 on failure. -fn check_landlock( +async fn check_landlock( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, ) -> bool { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { - match std::process::Command::new(prepare_worker_program_path) + match tokio::process::Command::new(prepare_worker_program_path) .arg("--check-can-enable-landlock") .status() + .await { Ok(status) if status.success() => true, Ok(status) => { @@ -981,6 +988,52 @@ fn check_landlock( } } +/// Check if seccomp is supported and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +async fn check_seccomp( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-seccomp") + .status() + .await + { + Ok(status) if status.success() => true, + Ok(status) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + "Cannot fully enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with seccomp support for maximum security." + ); + false + } + } +} + #[cfg(test)] pub(crate) mod tests { use super::*; diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index bd85d84055ce..906ef8401168 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -270,6 +270,9 @@ impl WorkerHandle { if security_status.can_enable_landlock { args.push("--can-enable-landlock".to_string()); } + if security_status.can_enable_seccomp { + args.push("--can-enable-seccomp".to_string()); + } if security_status.can_unshare_user_namespace_and_change_root { args.push("--can-unshare-user-namespace-and-change-root".to_string()); } diff --git a/polkadot/node/core/pvf/tests/it/adder.rs b/polkadot/node/core/pvf/tests/it/adder.rs index 8bdd09db208a..e8d8a9a6b63e 100644 --- a/polkadot/node/core/pvf/tests/it/adder.rs +++ b/polkadot/node/core/pvf/tests/it/adder.rs @@ -28,7 +28,7 @@ async fn execute_good_block_on_parent() { let block_data = BlockData { state: 0, add: 512 }; - let host = TestHost::new(); + let host = TestHost::new().await; let ret = host .validate_candidate( @@ -56,7 +56,7 @@ async fn execute_good_chain_on_parent() { let mut parent_hash = [0; 32]; let mut last_state = 0; - let host = TestHost::new(); + let host = TestHost::new().await; for (number, add) in (0..10).enumerate() { let parent_head = @@ -98,7 +98,7 @@ async fn execute_bad_block_on_parent() { add: 256, }; - let host = TestHost::new(); + let host = TestHost::new().await; let _err = host .validate_candidate( @@ -117,7 +117,7 @@ async fn execute_bad_block_on_parent() { #[tokio::test] async fn stress_spawn() { - let host = std::sync::Arc::new(TestHost::new()); + let host = std::sync::Arc::new(TestHost::new().await); async fn execute(host: std::sync::Arc) { let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; @@ -149,9 +149,12 @@ async fn stress_spawn() { // With one worker, run multiple execution jobs serially. They should not conflict. #[tokio::test] async fn execute_can_run_serially() { - let host = std::sync::Arc::new(TestHost::new_with_config(|cfg| { - cfg.execute_workers_max_num = 1; - })); + let host = std::sync::Arc::new( + TestHost::new_with_config(|cfg| { + cfg.execute_workers_max_num = 1; + }) + .await, + ); async fn execute(host: std::sync::Arc) { let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index f699b5840d8f..c3b338538361 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -42,11 +42,11 @@ struct TestHost { } impl TestHost { - fn new() -> Self { - Self::new_with_config(|_| ()) + async fn new() -> Self { + Self::new_with_config(|_| ()).await } - fn new_with_config(f: F) -> Self + async fn new_with_config(f: F) -> Self where F: FnOnce(&mut Config), { @@ -65,7 +65,7 @@ impl TestHost { execute_worker_path, ); f(&mut config); - let (host, task) = start(config, Metrics::default()); + let (host, task) = start(config, Metrics::default()).await; let _ = tokio::task::spawn(task); Self { cache_dir, host: Mutex::new(host) } } @@ -131,7 +131,7 @@ impl TestHost { #[tokio::test] async fn terminates_on_timeout() { - let host = TestHost::new(); + let host = TestHost::new().await; let start = std::time::Instant::now(); let result = host @@ -161,7 +161,7 @@ async fn terminates_on_timeout() { #[tokio::test] async fn ensure_parallel_execution() { // Run some jobs that do not complete, thus timing out. - let host = TestHost::new(); + let host = TestHost::new().await; let execute_pvf_future_1 = host.validate_candidate( halt::wasm_binary_unwrap(), ValidationParams { @@ -208,7 +208,7 @@ async fn ensure_parallel_execution() { async fn execute_queue_doesnt_stall_if_workers_died() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 5; - }); + }).await; // Here we spawn 8 validation jobs for the `halt` PVF and share those between 5 workers. The // first five jobs should timeout and the workers killed. For the next 3 jobs a new batch of @@ -245,7 +245,7 @@ async fn execute_queue_doesnt_stall_if_workers_died() { async fn execute_queue_doesnt_stall_with_varying_executor_params() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 2; - }); + }).await; let executor_params_1 = ExecutorParams::default(); let executor_params_2 = ExecutorParams::from(&[ExecutorParam::StackLogicalMax(1024)][..]); @@ -293,7 +293,7 @@ async fn execute_queue_doesnt_stall_with_varying_executor_params() { // Test that deleting a prepared artifact does not lead to a dispute when we try to execute it. #[tokio::test] async fn deleting_prepared_artifact_does_not_dispute() { - let host = TestHost::new(); + let host = TestHost::new().await; let cache_dir = host.cache_dir.path(); let result = host @@ -354,7 +354,7 @@ async fn deleting_prepared_artifact_does_not_dispute() { async fn prepare_can_run_serially() { let host = TestHost::new_with_config(|cfg| { cfg.prepare_workers_hard_max_num = 1; - }); + }).await; let _stats = host .precheck_pvf(::adder::wasm_binary_unwrap(), Default::default()) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index 6a14a3a013d4..41ee02028b0d 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -126,6 +126,17 @@ with untrusted code does not have unnecessary access to the file-system. This provides some protection against attackers accessing sensitive data or modifying data on the host machine. +*Currently this is only supported on Linux.* + +### Restricting networking + +We also disable networking on PVF threads by disabling certain syscalls, such as +the creation of sockets. This prevents attackers from either downloading +payloads or communicating sensitive data from the validator's machine to the +outside world. + +*Currently this is only supported on Linux.* + ### Clearing env vars We clear environment variables before handling untrusted code, because why give