From da04f5007edd85f35d1af5ba8c2c5a4eb96d2149 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Wed, 2 Oct 2024 11:20:07 +0200 Subject: [PATCH] feat: mixin / add loki compaction not successful alert (#14239) Co-authored-by: Ashwanth --- .../loki-mixin-compiled-ssd/alerts.yaml | 33 +++++++++++++ production/loki-mixin-compiled/alerts.yaml | 33 +++++++++++++ production/loki-mixin/alerts.libsonnet | 47 +++++++++++++++++++ 3 files changed, 113 insertions(+) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 09b9b6f543412..e33c010ff0560 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -43,3 +43,36 @@ groups: for: 5m labels: severity: warning + - alert: LokiCompactorHasNotSuccessfullyRunCompaction + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor. + summary: Loki compaction has not run in the last 3 hours since the last compaction. + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + min ( + time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0) + ) + by (cluster, namespace) + > 60 * 60 * 3 + for: 1h + labels: + severity: critical + - alert: LokiCompactorHasNotSuccessfullyRunCompaction + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor. + summary: Loki compaction has not run in the last 3h since startup. + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. 
+ max( + max_over_time( + loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h] + ) + ) by (cluster, namespace) + == 0 + for: 1h + labels: + severity: critical diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index 09b9b6f543412..e33c010ff0560 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -43,3 +43,36 @@ groups: for: 5m labels: severity: warning + - alert: LokiCompactorHasNotSuccessfullyRunCompaction + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor. + summary: Loki compaction has not run in the last 3 hours since the last compaction. + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + min ( + time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0) + ) + by (cluster, namespace) + > 60 * 60 * 3 + for: 1h + labels: + severity: critical + - alert: LokiCompactorHasNotSuccessfullyRunCompaction + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor. + summary: Loki compaction has not run in the last 3h since startup. + expr: | + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. 
+ max( + max_over_time( + loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h] + ) + ) by (cluster, namespace) + == 0 + for: 1h + labels: + severity: critical diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index 9261dbccecf99..02fb2a0ee5662 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -70,6 +70,53 @@ |||, 'cluster', $._config.per_cluster_label), }, }, + { + // Alert if the compactor has not successfully run compaction in the last 3h since the last compaction. + alert: 'LokiCompactorHasNotSuccessfullyRunCompaction', + expr: ||| + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. + min ( + time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0) + ) + by (%s, namespace) + > 60 * 60 * 3 + ||| % $._config.per_cluster_label, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Loki compaction has not run in the last 3 hours since the last compaction.', + description: std.strReplace(||| + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor. + |||, 'cluster', $._config.per_cluster_label), + }, + }, + { + // Alert if the compactor has not successfully run compaction in the last 3h since startup. + alert: 'LokiCompactorHasNotSuccessfullyRunCompaction', + expr: ||| + # The "last successful run" metric is updated even if the compactor owns no tenants, + # so this alert correctly doesn't fire if compactor has nothing to do. 
+ max( + max_over_time( + loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h] + ) + ) by (%s, namespace) + == 0 + ||| % $._config.per_cluster_label, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Loki compaction has not run in the last 3h since startup.', + description: std.strReplace(||| + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor. + |||, 'cluster', $._config.per_cluster_label), + }, + }, ], }, ],