From 6a85c17e50ecb4e57ab57a5074efa9904480b09a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 29 Jun 2022 18:46:06 +0200 Subject: [PATCH] #1830: TemperedWMin: allow all nodes to be potential recipients of work --- .../balance/temperedlb/temperedlb.cc | 28 +++++++++---------- .../balance/temperedlb/temperedlb.h | 6 ++-- .../balance/temperedwmin/temperedwmin.cc | 11 ++++++++ .../balance/temperedwmin/temperedwmin.h | 1 + 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc index 3b2e20a319..8b0e33b7c3 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.cc @@ -995,7 +995,7 @@ NodeType TemperedLB::sampleFromCMF( return selected_node; } -std::vector TemperedLB::makeUnderloaded() const { +std::vector TemperedLB::getPotentialRecipients() const { std::vector under = {}; for (auto&& elm : load_info_) { if (isUnderloaded(elm.second)) { @@ -1203,10 +1203,10 @@ void TemperedLB::decide() { int n_transfers = 0, n_rejected = 0; if (canMigrate()) { - std::vector under = makeUnderloaded(); + std::vector potential_recipients = getPotentialRecipients(); std::unordered_map migrate_objs; - if (under.size() > 0) { + if (potential_recipients.size() > 0) { std::vector ordered_obj_ids = orderObjects( obj_ordering_, cur_objs_, this_new_load_, target_max_load_ ); @@ -1218,24 +1218,24 @@ void TemperedLB::decide() { if (cmf_type_ == CMFTypeEnum::Original) { // Rebuild the relaxed underloaded set based on updated load of this node - under = makeUnderloaded(); - if (under.size() == 0) { + potential_recipients = getPotentialRecipients(); + if (potential_recipients.size() == 0) { break; } } else if (cmf_type_ == CMFTypeEnum::NormByMaxExcludeIneligible) { // Rebuild the underloaded set and eliminate processors that will // fail the Criterion for this object - under = makeSufficientlyUnderloaded(obj_load); - if (under.size() == 0) { + potential_recipients = makeSufficientlyUnderloaded(obj_load); + if (potential_recipients.size() == 0) { ++n_rejected; iter++; continue; } } // Rebuild the CMF with the new loads taken into account - auto cmf = createCMF(under); + auto cmf = createCMF(potential_recipients); // Select a node using the CMF - auto const selected_node = sampleFromCMF(under, cmf); + auto const selected_node = sampleFromCMF(potential_recipients, cmf); vt_debug_print( verbose, temperedlb, @@ -1255,13 +1255,13 @@ void TemperedLB::decide() { vt_debug_print( verbose, temperedlb, - "TemperedLB::decide: trial={}, iter={}, under.size()={}, " - "selected_node={}, selected_load={:e}, obj_id={:x}, home={}, " - "obj_load={}, target_max_load={}, this_new_load_={}, " - "criterion={}\n", + "TemperedLB::decide: trial={}, iter={}, " + "potential_recipients.size()={}, selected_node={}, " + "selected_load={:e}, obj_id={:x}, home={}, obj_load={}, " + "target_max_load={}, this_new_load_={}, criterion={}\n", trial_, iter_, - under.size(), + potential_recipients.size(), selected_node, selected_load, obj_id.id, diff --git a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h index be07298b25..4ea7c7e019 100644 --- a/src/vt/vrt/collection/balance/temperedlb/temperedlb.h +++ b/src/vt/vrt/collection/balance/temperedlb/temperedlb.h @@ -103,6 +103,7 @@ struct TemperedLB : BaseLB { * TemperedLB restricts this to underloaded ranks */ virtual bool canPropagate() const { return is_underloaded_; } + bool isDeterministic() const { return deterministic_; } void propagateRound(uint8_t k_cur_async, bool sync, EpochType epoch = no_epoch); void propagateIncomingAsync(LoadMsgAsync* msg); @@ -113,7 +114,7 @@ struct TemperedLB : BaseLB { std::vector createCMF(NodeSetType const& under); NodeType sampleFromCMF(NodeSetType const& under, std::vector const& cmf); - std::vector makeUnderloaded() const; + virtual std::vector getPotentialRecipients() const; std::vector makeSufficientlyUnderloaded( LoadType load_to_accommodate ) const; @@ -130,6 +131,8 @@ struct TemperedLB : BaseLB { void setupDone(ReduceMsgType* msg); + std::unordered_map load_info_ = {}; + private: uint16_t f_ = 0; uint8_t k_max_ = 0; @@ -168,7 +171,6 @@ struct TemperedLB : BaseLB { */ bool target_pole_ = false; std::random_device seed_; - std::unordered_map load_info_ = {}; std::unordered_map new_load_info_ = {}; objgroup::proxy::Proxy proxy_ = {}; bool is_overloaded_ = false; diff --git a/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.cc b/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.cc index e904309f26..ecdf42ce7c 100644 --- a/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.cc +++ b/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.cc @@ -96,6 +96,17 @@ void TemperedWMin::inputParams(balance::SpecEntry* spec) { ); } +std::vector TemperedWMin::getPotentialRecipients() const { + std::vector nodes = {}; + for (auto&& elm : load_info_) { + nodes.push_back(elm.first); + } + if (isDeterministic()) { + std::sort(nodes.begin(), nodes.end()); + } + return nodes; +} + TimeType TemperedWMin::getModeledWork(const elm::ElementIDStruct& obj) const { balance::PhaseOffset when = {balance::PhaseOffset::NEXT_PHASE, balance::PhaseOffset::WHOLE_PHASE}; diff --git a/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.h b/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.h index e0cdc03dfa..bf16e2bf0c 100644 --- a/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.h +++ b/src/vt/vrt/collection/balance/temperedwmin/temperedwmin.h @@ -66,6 +66,7 @@ struct TemperedWMin : TemperedLB { */ bool canPropagate() const override { return true; } + std::vector getPotentialRecipients() const override; TimeType getModeledWork(const elm::ElementIDStruct& obj) const override; private: