From 206251b04f5b0f8623209af6caab2a515c71eb9f Mon Sep 17 00:00:00 2001
From: Mark Rousskov
Date: Sun, 5 May 2024 19:57:24 -0400
Subject: [PATCH] [WIP] Improve VecCache under parallel frontend

This replaces the single Vec allocation with a series of progressively
larger buckets. With the cfg for parallel enabled but -Zthreads=1, this
looks like a slight regression in i-count and cycle counts (<0.1%).

With the parallel frontend at -Zthreads=4, this is an improvement over
our current Lock-based approach (-5% wall-time, 5.788 -> 5.4688 seconds
on libcore), likely due to reduced bouncing of the cache line holding
the lock. At -Zthreads=32 it is a huge improvement (-46%: 8.829 ->
4.7319 seconds).

FIXME: Extract the internals to rustc_data_structures, fill in safety
comments, etc.
---
 .../rustc_query_system/src/query/caches.rs | 259 +++++++++++++++---
 1 file changed, 226 insertions(+), 33 deletions(-)

diff --git a/compiler/rustc_query_system/src/query/caches.rs b/compiler/rustc_query_system/src/query/caches.rs
index acc29b67cccec..30fea5b5ddf10 100644
--- a/compiler/rustc_query_system/src/query/caches.rs
+++ b/compiler/rustc_query_system/src/query/caches.rs
@@ -2,13 +2,16 @@ use crate::dep_graph::DepNodeIndex;
 
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sharded::{self, Sharded};
-use rustc_data_structures::sync::{Lock, OnceLock};
+use rustc_data_structures::sync::{AtomicUsize, OnceLock};
 use rustc_hir::def_id::LOCAL_CRATE;
-use rustc_index::{Idx, IndexVec};
+use rustc_index::Idx;
 use rustc_span::def_id::DefId;
 use rustc_span::def_id::DefIndex;
 use std::fmt::Debug;
 use std::hash::Hash;
+use std::marker::PhantomData;
+use std::mem::offset_of;
+use std::sync::atomic::{AtomicPtr, AtomicU32, Ordering};
 
 pub trait QueryCache: Sized {
     type Key: Hash + Eq + Copy + Debug;
@@ -100,16 +103,173 @@ where
     }
 }
 
-pub struct VecCache<K: Idx, V> {
-    cache: Lock<IndexVec<K, Option<(V, DepNodeIndex)>>>,
+struct Slot<V> {
+    // We never construct &Slot<V> so it's fine for this to not be in an UnsafeCell.
+    value: V,
+    // This is both an index and a once-lock.
+    //
+    // 0: not yet initialized.
+    // 1: lock held, initializing.
+    // 2..u32::MAX - 2: initialized.
+    index_and_lock: AtomicU32,
 }
 
 impl<K: Idx, V> Default for VecCache<K, V> {
     fn default() -> Self {
-        VecCache { cache: Default::default() }
+        VecCache {
+            buckets: Default::default(),
+            key: PhantomData,
+            len: Default::default(),
+            present: Default::default(),
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug)]
+struct SlotIndex {
+    bucket_idx: usize,
+    entries: usize,
+    index_in_bucket: usize,
+}
+
+impl SlotIndex {
+    #[inline]
+    fn from_index(idx: u32) -> Self {
+        let mut bucket = idx.checked_ilog2().unwrap_or(0) as usize;
+        let entries;
+        let running_sum;
+        if bucket <= 11 {
+            entries = 1 << 12;
+            running_sum = 0;
+            bucket = 0;
+        } else {
+            entries = 1 << bucket;
+            running_sum = entries;
+            bucket = bucket - 11;
+        }
+        SlotIndex { bucket_idx: bucket, entries, index_in_bucket: idx as usize - running_sum }
+    }
+
+    #[inline]
+    unsafe fn get<V: Copy>(&self, buckets: &[AtomicPtr<Slot<V>>; 21]) -> Option<(V, u32)> {
+        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 20, i.e.,
+        // in-bounds of buckets.
+        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
+        let ptr = bucket.load(Ordering::Acquire);
+        // Bucket is not yet initialized: then we obviously won't find this entry in that bucket.
+        if ptr.is_null() {
+            return None;
+        }
+        // SAFETY: Follows from preconditions on `buckets` and `self`.
+        let slot = unsafe { ptr.add(self.index_in_bucket) };
+
+        // SAFETY:
+        let index_and_lock =
+            unsafe { &*slot.byte_add(offset_of!(Slot<V>, index_and_lock)).cast::<AtomicU32>() };
+        let current = index_and_lock.load(Ordering::Acquire);
+        let index = match current {
+            0 => return None,
+            // Treat "initializing" as actually just not initialized at all.
+            // The only reason this is a separate state is that `complete` calls could race and
+            // we can't allow that, but from load perspective there's no difference.
+            1 => return None,
+            _ => current - 2,
+        };
+
+        // SAFETY:
+        // * slot is a valid pointer (buckets are always valid for the index we get).
+        // * value is initialized since we saw a >= 2 index above.
+        // * `V: Copy`, so safe to read.
+        let value = unsafe { slot.byte_add(offset_of!(Slot<V>, value)).cast::<V>().read() };
+        Some((value, index))
+    }
+
+    /// Returns true if this successfully put into the map.
+    #[inline]
+    unsafe fn put<V>(&self, buckets: &[AtomicPtr<Slot<V>>; 21], value: V, extra: u32) -> bool {
+        static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+        // SAFETY: `bucket_idx` is ilog2(u32).saturating_sub(11), which is at most 20, i.e.,
+        // in-bounds of buckets.
+        let bucket = unsafe { buckets.get_unchecked(self.bucket_idx) };
+        let mut ptr = bucket.load(Ordering::Acquire);
+        let _allocator_guard;
+        if ptr.is_null() {
+            // If we load null, then acquire the global lock; this path is quite cold, so it's cheap
+            // to use a global lock.
+            _allocator_guard = LOCK.lock();
+            // And re-load the value.
+            ptr = bucket.load(Ordering::Acquire);
+        }
+
+        // OK, now under the allocator lock, if we're still null then it's definitely us that will
+        // initialize this bucket.
+        if ptr.is_null() {
+            let bucket_layout =
+                std::alloc::Layout::array::<Slot<V>>(self.entries as usize).unwrap();
+            // SAFETY: Always >0 entries in each bucket.
+            let allocated = unsafe { std::alloc::alloc_zeroed(bucket_layout).cast::<Slot<V>>() };
+            if allocated.is_null() {
+                std::alloc::handle_alloc_error(bucket_layout);
+            }
+            bucket.store(allocated, Ordering::Release);
+            ptr = allocated;
+        }
+        assert!(!ptr.is_null());
+
+        // SAFETY: `index_in_bucket` is always in-bounds of the allocated array.
+        let slot = unsafe { ptr.add(self.index_in_bucket) };
+
+        let index_and_lock =
+            unsafe { &*slot.byte_add(offset_of!(Slot<V>, index_and_lock)).cast::<AtomicU32>() };
+        match index_and_lock.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed) {
+            Ok(_) => {
+                // We have acquired the initialization lock. It is our job to write `value` and
+                // then set the lock to the real index.
+
+                unsafe {
+                    slot.byte_add(offset_of!(Slot<V>, value)).cast::<V>().write(value);
+                }
+
+                index_and_lock.store(extra.checked_add(2).unwrap(), Ordering::Release);
+
+                true
+            }
+
+            // Treat "initializing" as actually initialized: we lost the race and should skip
+            // any updates to this slot.
+            //
+            // FIXME: Is that OK? Some of our other implementations replace the entry instead of
+            // preserving the earliest write. We could do a parking-lot or spinlock here...
+            Err(1) => false,
+
+            // This slot was already populated. Also ignore, currently this is the same as
+            // "initializing".
+            Err(_) => false,
+        }
     }
 }
 
+pub struct VecCache<K: Idx, V> {
+    // Entries per bucket:
+    //   Bucket  0:       4096 (2^12)
+    //   Bucket  1:       4096 (2^12)
+    //   Bucket  2:       8192
+    //   Bucket  3:      16384
+    //   ...
+    //   Bucket 19: 1073741824
+    //   Bucket 20: 2147483648
+    // The total number of entries if all buckets are initialized is u32::MAX-1.
+    buckets: [AtomicPtr<Slot<V>>; 21],
+
+    // Present and len are only used during incremental and self-profiling.
+    // They are an optimization over iterating the full buckets array.
+    present: [AtomicPtr<Slot<()>>; 21],
+    len: AtomicUsize,
+
+    key: PhantomData<K>,
+}
+
 impl<K, V> QueryCache for VecCache<K, V>
 where
     K: Eq + Idx + Copy + Debug,
@@ -120,20 +280,41 @@ where
     #[inline(always)]
     fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> {
-        let lock = self.cache.lock();
-        if let Some(Some(value)) = lock.get(*key) { Some(*value) } else { None }
+        let key = u32::try_from(key.index()).unwrap();
+        let slot_idx = SlotIndex::from_index(key);
+        match unsafe { slot_idx.get(&self.buckets) } {
+            Some((value, idx)) => Some((value, DepNodeIndex::from_u32(idx))),
+            None => None,
+        }
     }
 
     #[inline]
     fn complete(&self, key: K, value: V, index: DepNodeIndex) {
-        let mut lock = self.cache.lock();
-        lock.insert(key, (value, index));
+        let key = u32::try_from(key.index()).unwrap();
+        let slot_idx = SlotIndex::from_index(key);
+        if unsafe { slot_idx.put(&self.buckets, value, index.index() as u32) } {
+            let present_idx = self.len.fetch_add(1, Ordering::Relaxed);
+            let slot = SlotIndex::from_index(present_idx as u32);
+            // We should always be uniquely putting due to `len` fetch_add returning unique values.
+            assert!(unsafe { slot.put(&self.present, (), key) });
+        }
     }
 
     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        for (k, v) in self.cache.lock().iter_enumerated() {
-            if let Some(v) = v {
-                f(&k, &v.0, v.1);
+        for idx in 0..self.len.load(Ordering::Relaxed) {
+            let key = SlotIndex::from_index(idx as u32);
+            match unsafe { key.get(&self.present) } {
+                // This shouldn't happen in our current usage (iter is really only
+                // used long after queries are done running), but if we hit this in practice it's
+                // probably fine to just break early.
+                None => unreachable!(),
+                Some(((), key)) => {
+                    let key = K::new(key as usize);
+                    // unwrap() is OK: present entries are always written only after we put the real
+                    // entry.
+                    let value = self.lookup(&key).unwrap();
+                    f(&key, &value.0, value.1);
+                }
             }
         }
     }
@@ -142,10 +323,7 @@ where
 pub struct DefIdCache<V> {
     /// Stores the local DefIds in a dense map. Local queries are much more often dense, so this is
     /// a win over hashing query keys at marginal memory cost (~5% at most) compared to FxHashMap.
-    ///
-    /// The second element of the tuple is the set of keys actually present in the IndexVec, used
-    /// for faster iteration in `iter()`.
-    local: Lock<(IndexVec<DefIndex, Option<(V, DepNodeIndex)>>, Vec<DefIndex>)>,
+    local: VecCache<DefIndex, V>,
     foreign: DefaultCache<DefId, V>,
 }
 
@@ -165,8 +343,7 @@ where
     #[inline(always)]
     fn lookup(&self, key: &DefId) -> Option<(V, DepNodeIndex)> {
         if key.krate == LOCAL_CRATE {
-            let cache = self.local.lock();
-            cache.0.get(key.index).and_then(|v| *v)
+            self.local.lookup(&key.index)
         } else {
             self.foreign.lookup(key)
         }
@@ -175,27 +352,43 @@ where
     #[inline]
     fn complete(&self, key: DefId, value: V, index: DepNodeIndex) {
         if key.krate == LOCAL_CRATE {
-            let mut cache = self.local.lock();
-            let (cache, present) = &mut *cache;
-            let slot = cache.ensure_contains_elem(key.index, Default::default);
-            if slot.is_none() {
-                // FIXME: Only store the present set when running in incremental mode. `iter` is not
-                // used outside of saving caches to disk and self-profile.
-                present.push(key.index);
-            }
-            *slot = Some((value, index));
+            self.local.complete(key.index, value, index)
         } else {
             self.foreign.complete(key, value, index)
         }
     }
 
     fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) {
-        let guard = self.local.lock();
-        let (cache, present) = &*guard;
-        for &idx in present.iter() {
-            let value = cache[idx].unwrap();
-            f(&DefId { krate: LOCAL_CRATE, index: idx }, &value.0, value.1);
-        }
+        self.local.iter(&mut |key, value, index| {
+            f(&DefId { krate: LOCAL_CRATE, index: *key }, value, index);
+        });
         self.foreign.iter(f);
     }
 }
+
+#[test]
+fn slot_index_exhaustive() {
+    let mut buckets = [0u32; 21];
+    for idx in 0..=u32::MAX {
+        buckets[SlotIndex::from_index(idx).bucket_idx] += 1;
+    }
+    let mut prev = None::<SlotIndex>;
+    for idx in 0..u32::MAX {
+        let slot_idx = SlotIndex::from_index(idx);
+        if let Some(p) = prev {
+            if p.bucket_idx == slot_idx.bucket_idx {
+                assert_eq!(p.index_in_bucket + 1, slot_idx.index_in_bucket);
+            } else {
+                assert_eq!(slot_idx.index_in_bucket, 0);
+            }
+        } else {
+            assert_eq!(idx, 0);
+            assert_eq!(slot_idx.index_in_bucket, 0);
+            assert_eq!(slot_idx.bucket_idx, 0);
+        }
+
+        assert_eq!(buckets[slot_idx.bucket_idx], slot_idx.entries as u32);
+
+        prev = Some(slot_idx);
+    }
+}
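
Note on the bucket layout (not part of the patch): the arithmetic in SlotIndex::from_index can be
sanity-checked with a small standalone sketch. The helper name bucket_for and the main() driver
below are illustrative only, not the patch's API; the sketch assumes the same layout as the
"Entries per bucket" comment, i.e. bucket 0 covers indices 0..4096, and bucket N (N >= 1) holds
2^(N+11) entries starting at index 2^(N+11). That is also why 21 buckets are enough: ilog2 of a
u32 is at most 31, and everything below 2^12 is folded into bucket 0.

// Returns (bucket_idx, entries_in_bucket, index_in_bucket), mirroring SlotIndex::from_index.
fn bucket_for(idx: u32) -> (usize, usize, usize) {
    let log = idx.checked_ilog2().unwrap_or(0) as usize;
    if log <= 11 {
        // Indices 0..4096 all live in bucket 0.
        (0, 1 << 12, idx as usize)
    } else {
        // Bucket `log - 11` starts at index 2^log and holds 2^log entries.
        (log - 11, 1 << log, idx as usize - (1 << log))
    }
}

fn main() {
    assert_eq!(bucket_for(0), (0, 4096, 0));
    assert_eq!(bucket_for(4095), (0, 4096, 4095)); // last entry of bucket 0
    assert_eq!(bucket_for(4096), (1, 4096, 0)); // first entry of bucket 1
    assert_eq!(bucket_for(8192), (2, 8192, 0)); // bucket sizes double from here on
    assert_eq!(bucket_for(u32::MAX), (20, 1 << 31, (1 << 31) - 1)); // last entry overall
}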
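
The per-slot index_and_lock protocol (0 = empty, 1 = being initialized, n >= 2 = initialized,
carrying DepNodeIndex n - 2) is what makes reads lock-free. Below is a minimal single-slot sketch
of that protocol using only std atomics. OnceSlot and its methods are hypothetical names invented
for this note; the sketch keeps the value in an UnsafeCell rather than a zeroed bucket allocation,
and it is meant to show the compare_exchange plus Acquire/Release pairing, not to mirror the patch
exactly.

use std::cell::UnsafeCell;
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicU32, Ordering};

// A single once-initialized slot: 0 = empty, 1 = write in progress, n >= 2 = holds a value
// published together with the index `n - 2`.
struct OnceSlot<V> {
    value: UnsafeCell<MaybeUninit<V>>,
    index_and_lock: AtomicU32,
}

// SAFETY (sketch): `value` is written exactly once, before the Release store that makes
// `index_and_lock >= 2`, and it is only read after an Acquire load observes that store.
unsafe impl<V: Copy + Send> Sync for OnceSlot<V> {}

impl<V: Copy> OnceSlot<V> {
    fn new() -> Self {
        OnceSlot { value: UnsafeCell::new(MaybeUninit::uninit()), index_and_lock: AtomicU32::new(0) }
    }

    // Returns (value, index) once the slot has been fully initialized.
    fn get(&self) -> Option<(V, u32)> {
        match self.index_and_lock.load(Ordering::Acquire) {
            // Empty, or a writer has not finished yet: report "not present" either way.
            0 | 1 => None,
            n => {
                // SAFETY: the Acquire load saw n >= 2, so the write of `value` (which happened
                // before the corresponding Release store) is visible and complete here.
                let value = unsafe { self.value.get().read().assume_init() };
                Some((value, n - 2))
            }
        }
    }

    // Returns true if this call won the race and stored `value`.
    fn put(&self, value: V, index: u32) -> bool {
        // Try to move 0 -> 1; the winner writes the value, losers leave the slot untouched.
        if self.index_and_lock.compare_exchange(0, 1, Ordering::Relaxed, Ordering::Relaxed).is_ok() {
            // SAFETY: we hold the "initializing" state, so no other thread reads or writes
            // `value` until the Release store below publishes it.
            unsafe { self.value.get().write(MaybeUninit::new(value)) };
            self.index_and_lock.store(index.checked_add(2).unwrap(), Ordering::Release);
            true
        } else {
            false
        }
    }
}

fn main() {
    let slot = OnceSlot::new();
    assert_eq!(slot.get(), None);
    assert!(slot.put(42u32, 7));
    assert!(!slot.put(99, 8)); // a second writer loses the race and changes nothing
    assert_eq!(slot.get(), Some((42, 7)));
}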