Skip to content

Commit

Permalink
Add spilling and memory related stats counters (#7793)
Browse files Browse the repository at this point in the history
Summary:
Add stats counters to track the number of times that (1) memory reclaim fails
because of a non-reclaimable section; (2) the max spill level is exceeded.
Also rename the stats counter enum to follow the Velox coding convention.

Pull Request resolved: #7793

Reviewed By: bikramSingh91, mbasmanova

Differential Revision: D51674971

Pulled By: xiaoxmeng

fbshipit-source-id: 9cfaace6d75f03ad97b189378d2f06ed32c641db
  • Loading branch information
xiaoxmeng authored and facebook-github-bot committed Nov 30, 2023
1 parent 5483151 commit fbbc278
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 10 deletions.
24 changes: 17 additions & 7 deletions velox/common/base/Counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,36 +20,46 @@
namespace facebook::velox {

// Declares, via the REPORT_ADD_* macros, the export type (for plain stats) or
// the histogram range and exported percentiles (for latency histograms) of
// each Velox counter listed below.
void registerVeloxCounters() {
// Tracks hive handle generation latency in range of [0, 100s] and reports
// P50, P90, P99, and P100.
REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(
kCounterHiveFileHandleGenerateLatencyMs, 10, 0, 100000, 50, 90, 99, 100);

// Exports kCounterCacheShrinkCount as a COUNT stat.
REPORT_ADD_STAT_EXPORT_TYPE(
kCounterCacheShrinkCount, facebook::velox::StatType::COUNT);

// Tracks cache shrink latency in range of [0, 100s] and reports P50, P90,
// P99, and P100.
REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(
kCounterCacheShrinkTimeMs, 10, 0, 100'000, 50, 90, 99, 100);

// Tracks memory reclaim exec time in range of [0, 600s] and reports
// P50, P90, P99, and P100.
REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(
kCounterMemoryReclaimExecTimeMs, 20, 0, 600'000, 50, 90, 99, 100);

// Tracks memory reclaim task wait time in range of [0, 60s] and reports
// P50, P90, P99, and P100.
REPORT_ADD_HISTOGRAM_EXPORT_PERCENTILE(
kCounterMemoryReclaimWaitTimeMs, 10, 0, 60'000, 50, 90, 99, 100);

// Tracks memory reclaim bytes.
REPORT_ADD_STAT_EXPORT_TYPE(
kCounterMemoryReclaimedBytes, facebook::velox::StatType::SUM);

// Tracks the number of times that the memory reclaim wait times out.
REPORT_ADD_STAT_EXPORT_TYPE(
kCounterMemoryReclaimWaitTimeoutCount, facebook::velox::StatType::SUM);

// Tracks the number of times that the memory reclaim fails because of
// non-reclaimable section which is an indicator that the memory reservation
// is not sufficient.
REPORT_ADD_STAT_EXPORT_TYPE(
kCounterMemoryNonReclaimableCount, facebook::velox::StatType::COUNT);

// Tracks the number of times that we hit the max spill level limit.
REPORT_ADD_STAT_EXPORT_TYPE(
kCounterMaxSpillLevelExceededCount, facebook::velox::StatType::COUNT);
}

} // namespace facebook::velox
6 changes: 6 additions & 0 deletions velox/common/base/Counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,10 @@ constexpr folly::StringPiece kCounterMemoryReclaimWaitTimeMs{

// Name of the counter tracking the number of times that the memory reclaim
// wait times out.
constexpr folly::StringPiece kCounterMemoryReclaimWaitTimeoutCount{
"velox.memory_reclaim_wait_timeout_count"};

// Name of the counter tracking the number of times that the memory reclaim
// fails because of a non-reclaimable section.
constexpr folly::StringPiece kCounterMemoryNonReclaimableCount{
"velox.memory_non_reclaimable_count"};

// Name of the counter tracking the number of times that the max spill level
// limit is hit.
constexpr folly::StringPiece kCounterMaxSpillLevelExceededCount{
"velox.spill_max_level_exceeded_count"};
} // namespace facebook::velox
4 changes: 4 additions & 0 deletions velox/common/base/StatsReporter.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,13 @@
namespace facebook::velox {

enum class StatType {
  /// Average of the inserted values.
  AVG = 0,
  /// Sum of the inserted values.
  SUM = 1,
  /// Sum of the inserted values per second.
  RATE = 2,
  /// Count of the inserted values.
  COUNT = 3,
};

Expand Down
8 changes: 5 additions & 3 deletions velox/connectors/hive/HiveDataSink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,17 @@

#include "velox/connectors/hive/HiveDataSink.h"

#include "velox/common/base/Counters.h"
#include "velox/common/base/Fs.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/testutil/TestValue.h"
#include "velox/connectors/hive/HiveConfig.h"
#include "velox/connectors/hive/HivePartitionFunction.h"
#include "velox/connectors/hive/TableHandle.h"
#include "velox/core/ITypedExpr.h"
#include "velox/dwio/common/SortingWriter.h"
#include "velox/exec/SortBuffer.h"

#include "velox/connectors/hive/TableHandle.h"
#include "velox/exec/OperatorUtils.h"
#include "velox/exec/SortBuffer.h"

#include <boost/lexical_cast.hpp>
#include <boost/uuid/uuid_generators.hpp>
Expand Down Expand Up @@ -876,6 +877,7 @@ uint64_t HiveDataSink::WriterReclaimer::reclaim(
}

if (*writerInfo_->nonReclaimableSectionHolder.get()) {
REPORT_ADD_STAT_VALUE(kCounterMemoryNonReclaimableCount);
LOG(WARNING) << "Can't reclaim from hive writer pool " << pool->name()
<< " which is under non-reclaimable section, "
<< " used memory: " << succinctBytes(pool->currentBytes())
Expand Down
3 changes: 3 additions & 0 deletions velox/dwio/dwrf/writer/Writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

#include <folly/ScopeGuard.h>

#include "velox/common/base/Counters.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/memory/MemoryArbitrator.h"
#include "velox/common/testutil/TestValue.h"
#include "velox/common/time/CpuWallTimer.h"
Expand Down Expand Up @@ -726,6 +728,7 @@ uint64_t Writer::MemoryReclaimer::reclaim(
}

if (*writer_->nonReclaimableSection_) {
REPORT_ADD_STAT_VALUE(kCounterMemoryNonReclaimableCount);
LOG(WARNING)
<< "Can't reclaim from dwrf writer which is under non-reclaimable section: "
<< pool->name();
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/HashBuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/

#include "velox/exec/HashBuild.h"
#include "velox/common/base/Counters.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/testutil/TestValue.h"
#include "velox/exec/OperatorUtils.h"
#include "velox/exec/Task.h"
Expand Down Expand Up @@ -220,6 +222,7 @@ void HashBuild::setupSpiller(SpillPartition* spillPartition) {
// Disable spilling if exceeding the max spill level and the query might run
// out of memory if the restored partition still can't fit in memory.
if (spillConfig.exceedJoinSpillLevelLimit(startBit)) {
REPORT_ADD_STAT_VALUE(kCounterMaxSpillLevelExceededCount);
LOG(WARNING) << "Exceeded spill level limit: "
<< spillConfig.maxSpillLevel
<< ", and disable spilling for memory pool: "
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/Operator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/
#include "velox/exec/Operator.h"
#include "velox/common/base/Counters.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/base/SuccinctPrinter.h"
#include "velox/common/testutil/TestValue.h"
#include "velox/exec/Driver.h"
Expand Down Expand Up @@ -605,6 +607,7 @@ uint64_t Operator::MemoryReclaimer::reclaim(
if (op_->nonReclaimableSection_) {
// TODO: reduce the log frequency if it is too verbose.
++stats.numNonReclaimableAttempts;
REPORT_ADD_STAT_VALUE(kCounterMemoryNonReclaimableCount);
LOG(WARNING) << "Can't reclaim from memory pool " << pool->name()
<< " which is under non-reclaimable section, memory usage: "
<< succinctBytes(pool->currentBytes())
Expand Down

0 comments on commit fbbc278

Please sign in to comment.