From 624950c62cef42458f5f96752805284a41824eaa Mon Sep 17 00:00:00 2001 From: Vitaly Goldshteyn Date: Tue, 31 Dec 2024 06:44:37 +0100 Subject: [PATCH] Store hash in the probed_indices array in common/raw_hashtable.h to avoid its recomputation. (#4726) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Store hash in probed_indices array to avoid its recomputation. Benchmarks on ARM (altra, aarch64). ``` name old CYCLES/op new CYCLES/op delta BM_MapInsertSeq>/1 119 ± 2% 119 ± 1% ~ (p=0.961 n=55+54) BM_MapInsertSeq>/2 133 ± 1% 134 ± 1% ~ (p=0.342 n=56+57) BM_MapInsertSeq>/3 150 ± 1% 150 ± 1% ~ (p=0.856 n=56+57) BM_MapInsertSeq>/4 167 ± 2% 167 ± 2% ~ (p=0.430 n=56+57) BM_MapInsertSeq>/8 234 ± 5% 234 ± 3% ~ (p=0.957 n=57+57) BM_MapInsertSeq>/16 368 ± 4% 368 ± 4% ~ (p=0.762 n=57+57) BM_MapInsertSeq>/32 650 ± 4% 650 ± 4% ~ (p=0.955 n=57+57) BM_MapInsertSeq>/64 1.93k ± 4% 1.98k ± 4% +2.35% (p=0.000 n=57+57) BM_MapInsertSeq>/256 9.68k ± 5% 9.85k ± 3% +1.74% (p=0.000 n=57+57) BM_MapInsertSeq>/4096 177k ± 3% 163k ± 2% -8.17% (p=0.000 n=57+57) BM_MapInsertSeq>/65536 3.99M ± 3% 3.87M ± 4% -3.12% (p=0.000 n=56+56) BM_MapInsertSeq>/1048576 90.5M ± 5% 91.3M ± 6% +0.87% (p=0.025 n=55+55) BM_MapInsertSeq>/16777216 2.77G ± 8% 2.74G ± 9% ~ (p=0.076 n=57+57) BM_MapInsertSeq>/56 1.05k ± 5% 1.05k ± 5% ~ (p=0.727 n=57+57) BM_MapInsertSeq>/224 6.29k ± 5% 6.37k ± 4% +1.32% (p=0.000 n=57+57) BM_MapInsertSeq>/3584 124k ± 4% 109k ± 3% -12.46% (p=0.000 n=57+57) BM_MapInsertSeq>/57344 2.67M ± 4% 2.50M ± 4% -6.40% (p=0.000 n=57+57) BM_MapInsertSeq>/917504 65.3M ± 6% 65.8M ± 6% +0.89% (p=0.050 n=55+56) BM_MapInsertSeq>/14680064 2.17G ±10% 2.14G ± 9% -1.55% (p=0.032 n=57+57) BM_MapInsertSeq>/1 122 ± 1% 122 ± 1% ~ (p=0.415 n=56+56) BM_MapInsertSeq>/2 136 ± 1% 136 ± 1% ~ (p=0.861 n=56+57) BM_MapInsertSeq>/3 153 ± 1% 153 ± 1% ~ (p=0.607 n=56+57) BM_MapInsertSeq>/4 170 ± 2% 174 ± 3% +2.34% (p=0.001 n=56+57) BM_MapInsertSeq>/8 238 ± 4% 242 ± 3% +1.59% (p=0.000 n=57+57) BM_MapInsertSeq>/16 382 ± 4% 383 ± 4% ~ (p=0.977 n=57+57) BM_MapInsertSeq>/32 701 ± 7% 682 ± 5% -2.69% (p=0.000 n=57+57) BM_MapInsertSeq>/64 2.13k ± 6% 2.09k ± 3% -1.89% (p=0.000 n=57+57) BM_MapInsertSeq>/256 10.3k ± 3% 10.2k ± 3% -0.94% (p=0.000 n=57+57) BM_MapInsertSeq>/4096 184k ± 2% 179k ± 2% -2.62% (p=0.000 n=57+57) BM_MapInsertSeq>/65536 3.63M ± 2% 3.68M ± 3% +1.22% (p=0.000 n=54+57) BM_MapInsertSeq>/1048576 129M ±10% 129M ±10% ~ (p=0.874 n=57+57) BM_MapInsertSeq>/16777216 3.27G ±11% 3.24G ±10% ~ (p=0.451 n=57+57) BM_MapInsertSeq>/56 1.18k ± 9% 1.10k ± 5% -6.52% (p=0.000 n=57+57) BM_MapInsertSeq>/224 6.76k ± 5% 6.59k ± 4% -2.55% (p=0.000 n=57+57) BM_MapInsertSeq>/3584 117k ± 2% 115k ± 3% -1.93% (p=0.000 n=57+57) BM_MapInsertSeq>/57344 2.22M ± 3% 2.24M ± 2% +0.87% (p=0.000 n=57+57) BM_MapInsertSeq>/917504 95.0M ± 8% 94.8M ± 9% ~ (p=0.894 n=55+57) BM_MapInsertSeq>/14680064 2.42G ±14% 2.40G ±13% ~ (p=0.852 n=57+57) BM_MapInsertSeq>/1 124 ± 1% 124 ± 1% ~ (p=0.604 n=56+55) BM_MapInsertSeq>/2 140 ± 1% 140 ± 1% ~ (p=0.181 n=56+56) BM_MapInsertSeq>/3 158 ± 1% 158 ± 3% ~ (p=1.000 n=56+57) BM_MapInsertSeq>/4 176 ± 2% 176 ± 3% ~ (p=0.125 n=56+57) BM_MapInsertSeq>/8 247 ± 4% 247 ± 2% ~ (p=0.614 n=57+57) BM_MapInsertSeq>/16 391 ± 3% 391 ± 2% ~ (p=0.993 n=57+57) BM_MapInsertSeq>/32 690 ± 3% 691 ± 3% ~ (p=0.224 n=57+57) BM_MapInsertSeq>/64 2.17k ± 3% 2.22k ± 3% +1.94% (p=0.000 n=57+57) BM_MapInsertSeq>/256 11.1k ± 3% 11.3k ± 3% +1.58% (p=0.000 n=57+57) BM_MapInsertSeq>/4096 204k ± 2% 193k ± 2% -5.65% (p=0.000 n=57+57) BM_MapInsertSeq>/65536 5.19M ± 3% 5.09M ± 3% -2.05% (p=0.000 n=56+56) BM_MapInsertSeq>/1048576 124M ±10% 123M ± 6% ~ (p=0.626 n=57+57) BM_MapInsertSeq>/16777216 3.30G ± 9% 3.25G ± 8% -1.39% (p=0.019 n=57+57) BM_MapInsertSeq>/56 1.12k ± 3% 1.12k ± 3% ~ (p=0.482 n=57+57) BM_MapInsertSeq>/224 7.04k ± 4% 7.14k ± 3% +1.36% (p=0.000 n=57+57) BM_MapInsertSeq>/3584 138k ± 2% 126k ± 2% -8.89% (p=0.000 n=57+57) BM_MapInsertSeq>/57344 3.48M ± 4% 3.34M ± 4% -3.93% (p=0.000 n=56+56) BM_MapInsertSeq>/917504 84.4M ± 7% 84.9M ± 6% ~ (p=0.159 n=56+57) BM_MapInsertSeq>/14680064 2.42G ± 9% 2.40G ±10% ~ (p=0.300 n=57+57) BM_MapInsertSeq>/1 168 ± 0% 168 ± 0% ~ (p=0.555 n=56+55) BM_MapInsertSeq>/2 208 ± 0% 208 ± 0% ~ (p=0.722 n=52+53) BM_MapInsertSeq>/3 248 ± 0% 248 ± 0% ~ (p=0.248 n=53+54) BM_MapInsertSeq>/4 288 ± 0% 288 ± 0% ~ (p=0.185 n=54+55) BM_MapInsertSeq>/8 457 ± 0% 457 ± 0% ~ (p=0.665 n=53+53) BM_MapInsertSeq>/16 867 ± 1% 867 ± 1% ~ (p=0.174 n=47+52) BM_MapInsertSeq>/32 1.61k ± 3% 1.62k ± 4% ~ (p=0.402 n=57+57) BM_MapInsertSeq>/64 4.96k ± 9% 4.89k ± 5% -1.37% (p=0.046 n=57+54) BM_MapInsertSeq>/256 26.9k ± 8% 26.5k ± 8% -1.51% (p=0.004 n=56+55) BM_MapInsertSeq>/4096 600k ± 3% 588k ± 2% -2.07% (p=0.000 n=57+57) BM_MapInsertSeq>/65536 13.9M ± 3% 13.5M ± 2% -2.99% (p=0.000 n=55+56) BM_MapInsertSeq>/1048576 407M ± 7% 393M ± 5% -3.27% (p=0.000 n=56+57) BM_MapInsertSeq>/16777216 10.2G ± 8% 9.9G ± 5% -3.50% (p=0.000 n=57+57) BM_MapInsertSeq>/56 2.81k ± 5% 2.81k ± 4% ~ (p=0.809 n=56+56) BM_MapInsertSeq>/224 17.9k ± 6% 17.6k ± 5% -1.20% (p=0.035 n=57+52) BM_MapInsertSeq>/3584 374k ± 3% 367k ± 3% -1.80% (p=0.000 n=57+57) BM_MapInsertSeq>/57344 8.64M ± 3% 8.53M ± 2% -1.29% (p=0.000 n=55+55) BM_MapInsertSeq>/917504 247M ± 6% 244M ± 5% -1.19% (p=0.021 n=56+57) BM_MapInsertSeq>/14680064 6.81G ± 8% 6.64G ± 6% -2.46% (p=0.000 n=57+57) ``` Benchmarks on x86 ``` name old cpu/op new cpu/op delta BM_MapInsertSeq>/1 32.9ns ± 3% 32.6ns ± 3% -0.84% (p=0.027 n=54+51) BM_MapInsertSeq>/2 35.9ns ± 3% 35.7ns ± 4% ~ (p=0.123 n=54+54) BM_MapInsertSeq>/3 39.7ns ± 3% 47.4ns ± 4% +19.40% (p=0.000 n=55+56) BM_MapInsertSeq>/4 52.7ns ± 3% 52.1ns ± 4% -1.22% (p=0.000 n=57+57) BM_MapInsertSeq>/8 78.1ns ± 3% 78.3ns ± 3% ~ (p=0.141 n=50+57) BM_MapInsertSeq>/16 135ns ± 3% 135ns ± 4% ~ (p=0.936 n=53+57) BM_MapInsertSeq>/32 249ns ± 3% 241ns ± 3% -3.28% (p=0.000 n=55+57) BM_MapInsertSeq>/64 631ns ± 3% 618ns ± 3% -2.21% (p=0.000 n=57+57) BM_MapInsertSeq>/256 2.62µs ± 3% 2.36µs ± 4% -10.02% (p=0.000 n=52+53) BM_MapInsertSeq>/4096 39.2µs ± 3% 37.9µs ± 4% -3.40% (p=0.000 n=57+56) BM_MapInsertSeq>/65536 972µs ± 3% 955µs ± 3% -1.76% (p=0.000 n=57+57) BM_MapInsertSeq>/1048576 16.2ms ± 4% 16.3ms ± 5% ~ (p=0.231 n=52+54) BM_MapInsertSeq>/16777216 651ms ± 3% 648ms ± 2% -0.42% (p=0.048 n=57+56) BM_MapInsertSeq>/56 418ns ± 3% 401ns ± 3% -4.10% (p=0.000 n=54+57) BM_MapInsertSeq>/224 1.79µs ± 3% 1.61µs ± 3% -10.20% (p=0.000 n=57+57) BM_MapInsertSeq>/3584 26.0µs ± 3% 24.9µs ± 4% -4.13% (p=0.000 n=57+56) BM_MapInsertSeq>/57344 560µs ± 3% 549µs ± 3% -2.11% (p=0.000 n=56+57) BM_MapInsertSeq>/917504 10.4ms ± 3% 10.4ms ± 3% ~ (p=0.805 n=56+56) BM_MapInsertSeq>/14680064 422ms ± 2% 421ms ± 3% ~ (p=0.269 n=57+56) BM_MapInsertSeq>/1 33.7ns ± 3% 33.7ns ± 3% ~ (p=0.620 n=55+55) BM_MapInsertSeq>/2 36.7ns ± 3% 36.5ns ± 3% ~ (p=0.160 n=55+56) BM_MapInsertSeq>/3 41.1ns ± 2% 41.0ns ± 4% ~ (p=0.284 n=54+56) BM_MapInsertSeq>/4 45.0ns ± 3% 53.9ns ± 4% +19.70% (p=0.000 n=57+56) BM_MapInsertSeq>/8 77.1ns ± 3% 80.9ns ± 4% +4.98% (p=0.000 n=55+57) BM_MapInsertSeq>/16 130ns ± 3% 136ns ± 4% +4.42% (p=0.000 n=56+57) BM_MapInsertSeq>/32 244ns ± 3% 246ns ± 4% +0.95% (p=0.000 n=57+57) BM_MapInsertSeq>/64 620ns ± 3% 674ns ± 3% +8.83% (p=0.000 n=55+57) BM_MapInsertSeq>/256 2.93µs ± 3% 2.88µs ± 3% -1.73% (p=0.000 n=56+56) BM_MapInsertSeq>/4096 54.0µs ± 3% 50.8µs ± 4% -6.01% (p=0.000 n=57+57) BM_MapInsertSeq>/65536 1.18ms ± 2% 1.17ms ± 4% ~ (p=0.083 n=57+57) BM_MapInsertSeq>/1048576 28.9ms ± 4% 29.1ms ± 5% +0.91% (p=0.007 n=55+56) BM_MapInsertSeq>/16777216 914ms ± 2% 919ms ± 3% +0.56% (p=0.015 n=56+57) BM_MapInsertSeq>/56 404ns ± 3% 427ns ± 4% +5.60% (p=0.000 n=57+57) BM_MapInsertSeq>/224 1.88µs ± 3% 1.87µs ± 4% -0.68% (p=0.013 n=55+53) BM_MapInsertSeq>/3584 34.2µs ± 3% 32.9µs ± 4% -4.02% (p=0.000 n=56+57) BM_MapInsertSeq>/57344 768µs ± 3% 756µs ± 3% -1.53% (p=0.000 n=57+57) BM_MapInsertSeq>/917504 16.4ms ± 5% 16.5ms ± 7% ~ (p=0.303 n=56+57) BM_MapInsertSeq>/14680064 607ms ± 2% 613ms ± 3% +0.92% (p=0.000 n=57+57) BM_MapInsertSeq>/1 34.1ns ± 3% 34.2ns ± 4% ~ (p=0.288 n=57+57) BM_MapInsertSeq>/2 37.4ns ± 3% 37.5ns ± 3% ~ (p=0.316 n=57+57) BM_MapInsertSeq>/3 41.8ns ± 4% 49.1ns ± 3% +17.45% (p=0.000 n=57+56) BM_MapInsertSeq>/4 54.6ns ± 3% 53.9ns ± 5% -1.35% (p=0.000 n=57+57) BM_MapInsertSeq>/8 81.4ns ± 3% 81.4ns ± 4% ~ (p=0.956 n=56+57) BM_MapInsertSeq>/16 139ns ± 3% 139ns ± 3% ~ (p=0.754 n=57+56) BM_MapInsertSeq>/32 256ns ± 3% 250ns ± 4% -2.32% (p=0.000 n=57+57) BM_MapInsertSeq>/64 705ns ± 4% 687ns ± 3% -2.56% (p=0.000 n=53+57) BM_MapInsertSeq>/256 2.95µs ± 5% 3.05µs ± 3% +3.42% (p=0.000 n=52+55) BM_MapInsertSeq>/4096 49.6µs ± 3% 50.8µs ± 4% +2.44% (p=0.000 n=55+57) BM_MapInsertSeq>/65536 1.39ms ± 3% 1.40ms ± 3% +0.65% (p=0.004 n=57+56) BM_MapInsertSeq>/1048576 37.7ms ± 4% 38.1ms ± 4% +1.07% (p=0.001 n=57+57) BM_MapInsertSeq>/16777216 1.20s ± 3% 1.20s ± 3% +0.50% (p=0.040 n=57+57) BM_MapInsertSeq>/56 432ns ± 3% 414ns ± 3% -3.99% (p=0.000 n=57+57) BM_MapInsertSeq>/224 1.92µs ± 4% 1.89µs ± 4% -1.48% (p=0.000 n=52+55) BM_MapInsertSeq>/3584 31.5µs ± 4% 32.1µs ± 4% +1.89% (p=0.000 n=57+57) BM_MapInsertSeq>/57344 757µs ± 3% 748µs ± 3% -1.28% (p=0.000 n=57+57) BM_MapInsertSeq>/917504 21.9ms ± 4% 22.1ms ± 5% ~ (p=0.096 n=57+57) BM_MapInsertSeq>/14680064 735ms ± 3% 737ms ± 3% ~ (p=0.208 n=57+57) BM_MapInsertSeq>/1 41.5ns ± 3% 41.4ns ± 4% ~ (p=0.790 n=54+56) BM_MapInsertSeq>/2 50.6ns ± 4% 50.6ns ± 5% ~ (p=0.684 n=53+57) BM_MapInsertSeq>/3 59.7ns ± 4% 59.4ns ± 4% ~ (p=0.277 n=55+53) BM_MapInsertSeq>/4 68.5ns ± 5% 68.2ns ± 5% ~ (p=0.623 n=54+55) BM_MapInsertSeq>/8 107ns ± 5% 107ns ± 9% ~ (p=0.359 n=54+57) BM_MapInsertSeq>/16 200ns ± 6% 200ns ± 6% ~ (p=0.772 n=56+57) BM_MapInsertSeq>/32 373ns ± 8% 371ns ± 7% ~ (p=0.541 n=57+57) BM_MapInsertSeq>/64 1.11µs ± 9% 1.09µs ± 8% -2.09% (p=0.003 n=56+56) BM_MapInsertSeq>/256 5.61µs ± 5% 5.48µs ± 7% -2.42% (p=0.000 n=54+56) BM_MapInsertSeq>/4096 153µs ± 4% 147µs ± 6% -3.80% (p=0.000 n=54+57) BM_MapInsertSeq>/65536 3.24ms ± 3% 3.10ms ± 3% -4.19% (p=0.000 n=57+57) BM_MapInsertSeq>/1048576 100ms ± 2% 98ms ± 3% -1.97% (p=0.000 n=56+57) BM_MapInsertSeq>/16777216 2.45s ± 2% 2.40s ± 3% -2.09% (p=0.000 n=57+57) BM_MapInsertSeq>/56 637ns ± 8% 630ns ± 8% ~ (p=0.101 n=56+56) BM_MapInsertSeq>/224 3.77µs ± 6% 3.68µs ± 6% -2.42% (p=0.000 n=56+56) BM_MapInsertSeq>/3584 92.1µs ± 7% 88.4µs ± 6% -4.04% (p=0.000 n=57+56) BM_MapInsertSeq>/57344 1.99ms ± 4% 1.92ms ± 3% -3.47% (p=0.000 n=57+57) BM_MapInsertSeq>/917504 62.1ms ± 4% 60.9ms ± 3% -1.93% (p=0.000 n=57+57) BM_MapInsertSeq>/14680064 1.53s ± 3% 1.50s ± 3% -1.85% (p=0.000 n=57+57) ``` --------- Co-authored-by: Chandler Carruth Co-authored-by: Carbon Infra Bot --- common/raw_hashtable.h | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/common/raw_hashtable.h b/common/raw_hashtable.h index 8200117b2578b..9613f303c2b9f 100644 --- a/common/raw_hashtable.h +++ b/common/raw_hashtable.h @@ -570,16 +570,13 @@ class BaseImpl { auto CopySlotsFrom(const BaseImpl& arg) -> void; auto MoveFrom(BaseImpl&& arg, Storage* small_storage) -> void; - template - auto InsertIntoEmpty(LookupKeyT lookup_key, KeyContextT key_context) - -> EntryT*; + auto InsertIntoEmpty(HashCode hash) -> EntryT*; static auto ComputeNextAllocSize(ssize_t old_alloc_size) -> ssize_t; static auto GrowthThresholdForAllocSize(ssize_t alloc_size) -> ssize_t; auto GrowToNextAllocSize(KeyContextT key_context) -> void; - template - auto GrowAndInsert(LookupKeyT lookup_key, KeyContextT key_context) -> EntryT*; + auto GrowAndInsert(HashCode hash, KeyContextT key_context) -> EntryT*; ViewImplT view_impl_; int growth_budget_; @@ -974,7 +971,7 @@ auto BaseImpl::InsertImpl( // empty slot. Without the growth budget we'll have to completely rehash and // so we can just bail here. if (LLVM_UNLIKELY(growth_budget_ == 0)) { - return {GrowAndInsert(lookup_key, key_context), true}; + return {GrowAndInsert(hash, key_context), true}; } --growth_budget_; @@ -1029,8 +1026,9 @@ BaseImpl::GrowToAllocSizeImpl( for (ssize_t byte_index : present_matched_range) { ++count; ssize_t index = group_index + byte_index; - EntryT* new_entry = - InsertIntoEmpty(old_entries[index].key(), key_context); + HashCode hash = + key_context.HashKey(old_entries[index].key(), ComputeSeed()); + EntryT* new_entry = InsertIntoEmpty(hash); new_entry->MoveFrom(std::move(old_entries[index])); } } @@ -1291,11 +1289,8 @@ auto BaseImpl::MoveFrom( // these are true, typically just after growth, we can dramatically simplify the // insert position search. template -template -[[clang::noinline]] auto -BaseImpl::InsertIntoEmpty( - LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* { - HashCode hash = key_context.HashKey(lookup_key, ComputeSeed()); +auto BaseImpl::InsertIntoEmpty( + HashCode hash) -> EntryT* { auto [hash_index, tag] = hash.ExtractIndexAndTag<7>(); uint8_t* local_metadata = metadata(); EntryT* local_entries = entries(); @@ -1375,7 +1370,7 @@ auto BaseImpl::GrowToNextAllocSize( // the group walk rather than after the group walk. In practice, between the // statistical rareness and using a large small size buffer here on the stack, // we can handle this most efficiently with temporary, additional storage. - llvm::SmallVector probed_indices; + llvm::SmallVector, 128> probed_indices; // Create locals for the old state of the table. ssize_t old_size = alloc_size(); @@ -1449,7 +1444,7 @@ auto BaseImpl::GrowToNextAllocSize( ssize_t old_hash_index = hash.ExtractIndexAndTag<7>().first & ComputeProbeMaskFromSize(old_size); if (LLVM_UNLIKELY(old_hash_index != group_index)) { - probed_indices.push_back(old_index); + probed_indices.push_back({old_index, hash}); if constexpr (MetadataGroup::FastByteClear) { low_g.ClearByte(byte_index); high_g.ClearByte(byte_index); @@ -1510,9 +1505,8 @@ auto BaseImpl::GrowToNextAllocSize( // We then need to do a normal insertion for anything that was probed before // growth, but we know we'll find an empty slot, so leverage that. - for (ssize_t old_index : probed_indices) { - EntryT* new_entry = - InsertIntoEmpty(old_entries[old_index].key(), key_context); + for (auto [old_index, hash] : probed_indices) { + EntryT* new_entry = InsertIntoEmpty(hash); new_entry->MoveFrom(std::move(old_entries[old_index])); } CARBON_DCHECK(count == @@ -1538,16 +1532,15 @@ auto BaseImpl::GrowToNextAllocSize( // that this function can be directly called and the result returned from // `InsertImpl`. template -template [[clang::noinline]] auto BaseImpl::GrowAndInsert( - LookupKeyT lookup_key, KeyContextT key_context) -> EntryT* { + HashCode hash, KeyContextT key_context) -> EntryT* { GrowToNextAllocSize(key_context); // And insert the lookup_key into an index in the newly grown map and return // that index for use. --growth_budget_; - return InsertIntoEmpty(lookup_key, key_context); + return InsertIntoEmpty(hash); } template