From f4debc8e946efc02b622660c8ebb8ef369edd52d Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Wed, 27 Jan 2021 12:16:03 -0800 Subject: [PATCH] Resolve DLL imports at CRT startup, not on demand On Windows, libstd uses GetProcAddress to locate some DLL imports, so that libstd can run on older versions of Windows. If a given DLL import is not present, then libstd uses other behavior (such as fallback implementations). This commit uses a feature of the Windows CRT to do these DLL imports during module initialization, before main() (or DllMain()) is called. This is the ideal time to resolve imports, because the module is effectively single-threaded at that point; no other threads can touch the data or code of the module that is being initialized. This avoids several problems. First, it makes the cost of performing the DLL import lookups deterministic. Right now, the DLL imports are done on demand, which means that application threads _might_ have to do the DLL import during some time-sensitive operation. This is a small source of unpredictability. Since threads can race, it's even possible to have more than one thread running the same redundant DLL lookup. This commit also removes the use of heap-allocated strings during the DLL lookups. 
--- library/std/src/sys/windows/c.rs | 1 + library/std/src/sys/windows/compat.rs | 153 +++++++++++-------- library/std/src/sys/windows/thread_parker.rs | 12 +- 3 files changed, 95 insertions(+), 71 deletions(-) diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs index f43a19d91b657..dec886208103d 100644 --- a/library/std/src/sys/windows/c.rs +++ b/library/std/src/sys/windows/c.rs @@ -975,6 +975,7 @@ extern "system" { pub fn freeaddrinfo(res: *mut ADDRINFOA); pub fn GetProcAddress(handle: HMODULE, name: LPCSTR) -> *mut c_void; + pub fn GetModuleHandleA(lpModuleName: LPCSTR) -> HMODULE; pub fn GetModuleHandleW(lpModuleName: LPCWSTR) -> HMODULE; pub fn GetSystemTimeAsFileTime(lpSystemTimeAsFileTime: LPFILETIME); diff --git a/library/std/src/sys/windows/compat.rs b/library/std/src/sys/windows/compat.rs index e9588e2975825..017a4bbe97cc5 100644 --- a/library/std/src/sys/windows/compat.rs +++ b/library/std/src/sys/windows/compat.rs @@ -1,93 +1,116 @@ -//! A "compatibility layer" for spanning XP and Windows 7 +//! A "compatibility layer" for supporting older versions of Windows //! -//! The standard library currently binds many functions that are not available -//! on Windows XP, but we would also like to support building executables that -//! run on XP. To do this we specify all non-XP APIs as having a fallback -//! implementation to do something reasonable. +//! The standard library uses some Windows API functions that are not present +//! on older versions of Windows. (Note that the oldest version of Windows +//! that Rust supports is Windows 7 (client) and Windows Server 2008 (server).) +//! This module implements a form of delayed DLL import binding, using +//! `GetModuleHandle` and `GetProcAddress` to look up DLL entry points at +//! runtime. //! -//! This dynamic runtime detection of whether a function is available is -//! implemented with `GetModuleHandle` and `GetProcAddress` paired with a -//! 
static-per-function which caches the result of the first check. In this -//! manner we pay a semi-large one-time cost up front for detecting whether a -//! function is available but afterwards it's just a load and a jump. - -use crate::ffi::CString; -use crate::sys::c; - -pub fn lookup(module: &str, symbol: &str) -> Option<usize> { - let mut module: Vec<u16> = module.encode_utf16().collect(); - module.push(0); - let symbol = CString::new(symbol).unwrap(); - unsafe { - let handle = c::GetModuleHandleW(module.as_ptr()); - match c::GetProcAddress(handle, symbol.as_ptr()) as usize { - 0 => None, - n => Some(n), - } - } -} +//! This implementation uses a static initializer to look up the DLL entry +//! points. The CRT (C runtime) executes static initializers before `main` +//! is called (for binaries) and before `DllMain` is called (for DLLs). +//! This is the ideal time to look up DLL imports, because we are guaranteed +//! that no other threads will attempt to call these entry points. Thus, +//! we can look up the imports and store them in `static mut` fields +//! without any synchronization. +//! +//! This has an additional advantage: Because the DLL import lookup happens +//! at module initialization, the cost of these lookups is deterministic, +//! and is removed from the code paths that actually call the DLL imports. +//! That is, there is no unpredictable "cache miss" that occurs when calling +//! a DLL import. For applications that benefit from predictable delays, +//! this is a benefit. This also eliminates the comparison-and-branch +//! from the hot path. +//! +//! Currently, the standard library uses only a small number of dynamic +//! DLL imports. If this number grows substantially, then the cost of +//! performing all of the lookups at initialization time might become +//! substantial. +//! +//! The mechanism of registering a static initializer with the CRT is +//! documented in +//! 
[CRT Initialization](https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-initialization?view=msvc-160). +//! It works by contributing a global symbol to the `.CRT$XCU` section. +//! The linker builds a table of all static initializer functions. +//! The CRT startup code then iterates that table, calling each +//! initializer function. +//! +//! # **WARNING!!** +//! The environment that a static initializer function runs in is highly +//! constrained. There are **many** restrictions on what static initializers +//! can safely do. Static initializer functions **MUST NOT** do any of the +//! following (this list is not comprehensive): +//! * touch any other static field that is used by a different static +//! initializer, because the order that static initializers run in +//! is not defined. +//! * call `LoadLibrary` or any other function that acquires the DLL +//! loader lock. +//! * call any Rust function or CRT function that touches any static +//! (global) state. macro_rules! compat_fn { ($module:literal: $( $(#[$meta:meta])* - pub fn $symbol:ident($($argname:ident: $argtype:ty),*) -> $rettype:ty $body:block + pub fn $symbol:ident($($argname:ident: $argtype:ty),*) -> $rettype:ty $fallback_body:block )*) => ($( $(#[$meta])* pub mod $symbol { #[allow(unused_imports)] use super::*; - use crate::sync::atomic::{AtomicUsize, Ordering}; use crate::mem; type F = unsafe extern "system" fn($($argtype),*) -> $rettype; - static PTR: AtomicUsize = AtomicUsize::new(0); - - #[allow(unused_variables)] - unsafe extern "system" fn fallback($($argname: $argtype),*) -> $rettype $body - - /// This address is stored in `PTR` to incidate an unavailable API. - /// - /// This way, call() will end up calling fallback() if it is unavailable. - /// - /// This is a `static` to avoid rustc duplicating `fn fallback()` - /// into both load() and is_available(), which would break - /// is_available()'s comparison. 
By using the same static variable - /// in both places, they'll refer to the same (copy of the) - /// function. + /// Points to the DLL import, or is `None` if the import is unavailable. /// - /// LLVM merging the address of fallback with other functions - /// (because of unnamed_addr) is fine, since it's only compared to - /// an address from GetProcAddress from an external dll. - static FALLBACK: F = fallback; + /// This static can be an ordinary, unsynchronized, mutable static because + /// we guarantee that all of the writes finish during CRT initialization, + /// and all of the reads occur after CRT initialization. + static mut PTR: Option<F> = None; - #[cold] - fn load() -> usize { - // There is no locking here. It's okay if this is executed by multiple threads in - // parallel. `lookup` will result in the same value, and it's okay if they overwrite - // eachothers result as long as they do so atomically. We don't need any guarantees - // about memory ordering, as this involves just a single atomic variable which is - // not used to protect or order anything else. - let addr = crate::sys::compat::lookup($module, stringify!($symbol)) - .unwrap_or(FALLBACK as usize); - PTR.store(addr, Ordering::Relaxed); - addr - } + /// This symbol is what allows the CRT to find the `init` function and call it. + /// It is marked `#[used]` because otherwise Rust would assume that it was not + /// used, and would remove it. + #[used] + #[link_section = ".CRT$XCU"] + static INIT_TABLE_ENTRY: fn() = init; - fn addr() -> usize { - match PTR.load(Ordering::Relaxed) { - 0 => load(), - addr => addr, + fn init() { + // There is no locking here. This code is executed before main() is entered, and + // is guaranteed to be single-threaded. + // + // DO NOT do anything interesting or complicated in this function! DO NOT call + // any Rust functions or CRT functions, if those functions touch any global state, + // because this function runs during global initialization. 
For example, DO NOT + // do any dynamic allocation, don't call LoadLibrary, etc. + unsafe { + let module_name: *const u8 = concat!($module, "\0").as_ptr(); + let symbol_name: *const u8 = concat!(stringify!($symbol), "\0").as_ptr(); + let module_handle = $crate::sys::c::GetModuleHandleA(module_name as *const i8); + if !module_handle.is_null() { + match $crate::sys::c::GetProcAddress(module_handle, symbol_name as *const i8) as usize { + 0 => {} + n => { + PTR = Some(mem::transmute::<usize, F>(n)); + } + } + } } } #[allow(dead_code)] - pub fn is_available() -> bool { - addr() != FALLBACK as usize + pub fn option() -> Option<F> { + unsafe { PTR } } + #[allow(dead_code)] pub unsafe fn call($($argname: $argtype),*) -> $rettype { - mem::transmute::<usize, F>(addr())($($argname),*) + if let Some(ptr) = PTR { + ptr($($argname),*) + } else { + $fallback_body + } } } diff --git a/library/std/src/sys/windows/thread_parker.rs b/library/std/src/sys/windows/thread_parker.rs index 9e4c9aa0a512c..4f59d4dd452be 100644 --- a/library/std/src/sys/windows/thread_parker.rs +++ b/library/std/src/sys/windows/thread_parker.rs @@ -108,10 +108,10 @@ impl Parker { return; } - if c::WaitOnAddress::is_available() { + if let Some(wait_on_address) = c::WaitOnAddress::option() { loop { // Wait for something to happen, assuming it's still set to PARKED. - c::WaitOnAddress(self.ptr(), &PARKED as *const _ as c::LPVOID, 1, c::INFINITE); + wait_on_address(self.ptr(), &PARKED as *const _ as c::LPVOID, 1, c::INFINITE); // Change NOTIFIED=>EMPTY but leave PARKED alone. if self.state.compare_exchange(NOTIFIED, EMPTY, Acquire, Acquire).is_ok() { // Actually woken up by unpark(). @@ -140,9 +140,9 @@ impl Parker { return; } - if c::WaitOnAddress::is_available() { + if let Some(wait_on_address) = c::WaitOnAddress::option() { // Wait for something to happen, assuming it's still set to PARKED. 
- c::WaitOnAddress(self.ptr(), &PARKED as *const _ as c::LPVOID, 1, dur2timeout(timeout)); + wait_on_address(self.ptr(), &PARKED as *const _ as c::LPVOID, 1, dur2timeout(timeout)); // Set the state back to EMPTY (from either PARKED or NOTIFIED). // Note that we don't just write EMPTY, but use swap() to also // include an acquire-ordered read to synchronize with unpark()'s @@ -192,9 +192,9 @@ impl Parker { // purpose, to make sure every unpark() has a release-acquire ordering // with park(). if self.state.swap(NOTIFIED, Release) == PARKED { - if c::WakeByAddressSingle::is_available() { + if let Some(wake_by_address_single) = c::WakeByAddressSingle::option() { unsafe { - c::WakeByAddressSingle(self.ptr()); + wake_by_address_single(self.ptr()); } } else { // If we run NtReleaseKeyedEvent before the waiting thread runs