From 638a1eab732b977c7c5d9ab69c4ba0b7adeba551 Mon Sep 17 00:00:00 2001 From: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com> Date: Fri, 15 Dec 2023 07:47:00 +0100 Subject: [PATCH 1/9] parameterize alert rules --- production/loki-mixin/alerts.libsonnet | 18 +++++++++--------- production/loki-mixin/config.libsonnet | 17 +++++++++++++++++ production/loki-mixin/mixin-ssd.libsonnet | 4 +--- production/loki-mixin/mixin.libsonnet | 1 + 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index 0045cc194ba3a..f10fc8e415f4e 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -7,11 +7,11 @@ { alert: 'LokiRequestErrors', expr: ||| - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (%(group_by_cluster)s, job, route) / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + sum(rate(loki_request_duration_seconds_count[2m])) by (%(group_by_cluster)s, job, route) > 10 - |||, + ||| % $._config, 'for': '15m', labels: { severity: 'critical', @@ -25,8 +25,8 @@ { alert: 'LokiRequestPanics', expr: ||| - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - |||, + sum(increase(loki_panic_total[10m])) by (%(group_by_cluster)s, job) > 0 + ||| % $._config, labels: { severity: 'critical', }, @@ -39,8 +39,8 @@ { alert: 'LokiRequestLatency', expr: ||| - %s_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - ||| % $._config.per_cluster_label, + %(per_cluster_label)s_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + ||| % $._config, 'for': '15m', labels: { severity: 'critical', @@ -54,8 +54,8 @@ { 
alert: 'LokiTooManyCompactorsRunning', expr: ||| - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - |||, + sum(loki_boltdb_shipper_compactor_running) by (%(group_by_cluster)s) > 1 + ||| % $._config, 'for': '5m', labels: { severity: 'warning', diff --git a/production/loki-mixin/config.libsonnet b/production/loki-mixin/config.libsonnet index 1fa22f566cc69..17801de78b82e 100644 --- a/production/loki-mixin/config.libsonnet +++ b/production/loki-mixin/config.libsonnet @@ -1,4 +1,7 @@ { + local makePrefix(groups) = std.join('_', groups), + local makeGroupBy(groups) = std.join(', ', groups), + _config+:: { // Tags for dashboards. tags: ['loki'], @@ -11,6 +14,20 @@ // The label used to differentiate between different clusters. per_cluster_label: 'cluster', + per_namespace_label: 'namespace', + per_job_label: 'job', + + // Grouping labels, to uniquely identify and group by {jobs, clusters} + job_labels: [$._config.per_cluster_label, $._config.per_namespace_label, $._config.per_job_label], + cluster_labels: [$._config.per_cluster_label, $._config.per_namespace_label], + + // Each group prefix is composed of `_`-separated labels + group_prefix_jobs: makePrefix($._config.job_labels), + group_prefix_clusters: makePrefix($._config.cluster_labels), + + // Each group-by label list is `, `-separated and uniquely identifies a job or cluster + group_by_job: makeGroupBy($._config.job_labels), + group_by_cluster: makeGroupBy($._config.cluster_labels), // Enable dashboard and panels for Grafana Labs internal components. 
internal_components: false, diff --git a/production/loki-mixin/mixin-ssd.libsonnet b/production/loki-mixin/mixin-ssd.libsonnet index 01c59bb6ab7cc..273777ebeda9e 100644 --- a/production/loki-mixin/mixin-ssd.libsonnet +++ b/production/loki-mixin/mixin-ssd.libsonnet @@ -1,6 +1,4 @@ -(import 'dashboards.libsonnet') + -(import 'alerts.libsonnet') + -(import 'recording_rules.libsonnet') + { +(import 'mixin.libsonnet') + { grafanaDashboardFolder: 'Loki SSD', _config+:: { diff --git a/production/loki-mixin/mixin.libsonnet b/production/loki-mixin/mixin.libsonnet index 7e21657b2e61d..7e73476d2c59a 100644 --- a/production/loki-mixin/mixin.libsonnet +++ b/production/loki-mixin/mixin.libsonnet @@ -1,5 +1,6 @@ (import 'dashboards.libsonnet') + (import 'alerts.libsonnet') + +(import 'config.libsonnet') + (import 'recording_rules.libsonnet') + { grafanaDashboardFolder: 'Loki', } From 7d1297ace5e1c8cabaf96159d7ba717474fc626b Mon Sep 17 00:00:00 2001 From: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com> Date: Fri, 15 Dec 2023 08:24:19 +0100 Subject: [PATCH 2/9] parameterize recording rules --- production/loki-mixin/recording_rules.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/production/loki-mixin/recording_rules.libsonnet b/production/loki-mixin/recording_rules.libsonnet index 2feda5cac6e8a..fe666c58482cc 100644 --- a/production/loki-mixin/recording_rules.libsonnet +++ b/production/loki-mixin/recording_rules.libsonnet @@ -7,7 +7,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; rules: utils.histogramRules('loki_request_duration_seconds', [$._config.per_cluster_label, 'job']) + utils.histogramRules('loki_request_duration_seconds', [$._config.per_cluster_label, 'job', 'route']) + - utils.histogramRules('loki_request_duration_seconds', [$._config.per_cluster_label, 'namespace', 'job', 'route']), + utils.histogramRules('loki_request_duration_seconds', $._config.job_labels + ['route']), }], }, } From 
943abab7d4f5a6da051e18db21df76f2c8b750e9 Mon Sep 17 00:00:00 2001 From: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com> Date: Mon, 18 Dec 2023 07:37:38 +0100 Subject: [PATCH 3/9] Use group_prefix_jobs var for metric name --- production/loki-mixin/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index f10fc8e415f4e..0bbc1101a8102 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -39,7 +39,7 @@ { alert: 'LokiRequestLatency', expr: ||| - %(per_cluster_label)s_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + %(group_prefix_jobs)s_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 ||| % $._config, 'for': '15m', labels: { From 2e436704b44fe97e61089e3dc1f9889d1cfa44db Mon Sep 17 00:00:00 2001 From: Alexander Soelberg Heidarsson <89837986+alex5517@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:29:04 +0100 Subject: [PATCH 4/9] Remove config.libsonnet import from dashboards since imported in mixin.libsonnet --- production/loki-mixin/dashboards.libsonnet | 1 - 1 file changed, 1 deletion(-) diff --git a/production/loki-mixin/dashboards.libsonnet b/production/loki-mixin/dashboards.libsonnet index 9ea6c2ca82d53..20fd835bd45ec 100644 --- a/production/loki-mixin/dashboards.libsonnet +++ b/production/loki-mixin/dashboards.libsonnet @@ -1,4 +1,3 @@ -(import 'config.libsonnet') + (import 'dashboards/loki-retention.libsonnet') + (import 'dashboards/loki-chunks.libsonnet') + (import 'dashboards/loki-logs.libsonnet') + From 7754cf35a149b3b35415320c64ae130c1edd7f4c Mon Sep 17 00:00:00 2001 From: "Alexander Soelberg Heidarsson [Netic]" <89837986+alex5517@users.noreply.github.com> Date: Mon, 26 Feb 2024 07:10:29 +0100 Subject: [PATCH 5/9] Lint 
error - remove 2 spaces --- production/loki-mixin/config.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/production/loki-mixin/config.libsonnet b/production/loki-mixin/config.libsonnet index 17801de78b82e..e4198bda35ce6 100644 --- a/production/loki-mixin/config.libsonnet +++ b/production/loki-mixin/config.libsonnet @@ -1,7 +1,7 @@ { local makePrefix(groups) = std.join('_', groups), local makeGroupBy(groups) = std.join(', ', groups), - + _config+:: { // Tags for dashboards. tags: ['loki'], From 3a6142132778cef38ee1260b8172e90d7ad2b4a6 Mon Sep 17 00:00:00 2001 From: "Alexander Soelberg Heidarsson [Netic]" <89837986+alex5517@users.noreply.github.com> Date: Thu, 29 Feb 2024 12:07:55 +0100 Subject: [PATCH 6/9] Run make loki-mixin --- production/loki-mixin-compiled/alerts.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index 77f285b99c060..1b78ca6ace62a 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -6,9 +6,9 @@ groups: message: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) > 10 for: 15m labels: @@ -18,7 +18,7 @@ groups: message: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. 
expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency @@ -35,7 +35,7 @@ groups: message: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 for: 5m labels: severity: warning From de90f709567f793aede3949d862dd0be96a13f0a Mon Sep 17 00:00:00 2001 From: "Alexander Soelberg Heidarsson [Netic]" <89837986+alex5517@users.noreply.github.com> Date: Thu, 18 Apr 2024 07:54:53 +0200 Subject: [PATCH 7/9] Pull from main --- production/loki-mixin-compiled-ssd/alerts.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 77f285b99c060..1b78ca6ace62a 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -6,9 +6,9 @@ groups: message: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) > 10 for: 15m labels: @@ -18,7 +18,7 @@ groups: message: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. 
expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency @@ -35,7 +35,7 @@ groups: message: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 for: 5m labels: severity: warning From 88e5d7341247b1fcb34973633b86a4deeb3d00c7 Mon Sep 17 00:00:00 2001 From: "Alexander Soelberg Heidarsson [Netic]" <89837986+alex5517@users.noreply.github.com> Date: Thu, 18 Apr 2024 08:07:28 +0200 Subject: [PATCH 8/9] Run make loki-mixin --- .../loki-mixin-compiled-ssd/alerts.yaml | 88 +++++++++--------- production/loki-mixin-compiled-ssd/rules.yaml | 90 +++++++++++-------- production/loki-mixin-compiled/alerts.yaml | 59 ++---------- production/loki-mixin-compiled/rules.yaml | 90 +++++++++++-------- 4 files changed, 156 insertions(+), 171 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 7c0825d8580d6..313541fababb9 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -1,45 +1,45 @@ groups: - - name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: Loki request error rate is high. 
- expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - description: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - summary: Loki requests are causing code panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: Loki request error latency is high. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - description: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - summary: Loki deployment is running more than one compactor. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: 5m - labels: - severity: warning +- name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + description: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. 
+ expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + description: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. + expr: | + sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + description: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request error latency is high. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. 
+ expr: | + sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled-ssd/rules.yaml b/production/loki-mixin-compiled-ssd/rules.yaml index 5893770570f6e..2a54ed4fb2e5b 100644 --- a/production/loki-mixin-compiled-ssd/rules.yaml +++ b/production/loki-mixin-compiled-ssd/rules.yaml @@ -1,39 +1,53 @@ groups: - - name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: 
cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate +- name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, 
sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) 
+ record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index fb08fc6e4d609..313541fababb9 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -1,11 +1,11 @@ groups: -<<<<<<< HEAD - name: loki_alerts rules: - alert: LokiRequestErrors annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. expr: | 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) / @@ -16,16 +16,18 @@ groups: severity: critical - alert: LokiRequestPanics annotations: - message: | + description: | {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. 
expr: | sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 labels: severity: critical - alert: LokiRequestLatency annotations: - message: | + description: | {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request error latency is high. expr: | cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 for: 15m @@ -33,56 +35,11 @@ groups: severity: critical - alert: LokiTooManyCompactorsRunning annotations: - message: | + description: | {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. expr: | sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 for: 5m labels: severity: warning -======= - - name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: Loki request error rate is high. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - description: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - summary: Loki requests are causing code panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: Loki request error latency is high. 
- expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - description: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - summary: Loki deployment is running more than one compactor. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: 5m - labels: - severity: warning ->>>>>>> 985251c486f43178d01cd07c438f2ce269818d68 diff --git a/production/loki-mixin-compiled/rules.yaml b/production/loki-mixin-compiled/rules.yaml index 5893770570f6e..2a54ed4fb2e5b 100644 --- a/production/loki-mixin-compiled/rules.yaml +++ b/production/loki-mixin-compiled/rules.yaml @@ -1,39 +1,53 @@ groups: - - name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, 
sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) - 
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate +- name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: 
cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate From fdfa18cf637fefc51d142df36b1132c9644c872f Mon Sep 17 00:00:00 2001 From: "Alexander Soelberg Heidarsson [Netic]" <89837986+alex5517@users.noreply.github.com> Date: Wed, 29 May 2024 10:41:16 +0200 Subject: [PATCH 9/9] Build mixin --- .../loki-mixin-compiled-ssd/alerts.yaml | 88 +++++++++--------- production/loki-mixin-compiled-ssd/rules.yaml | 90 ++++++++----------- 
production/loki-mixin-compiled/alerts.yaml | 88 +++++++++--------- production/loki-mixin-compiled/rules.yaml | 90 ++++++++----------- 4 files changed, 164 insertions(+), 192 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 313541fababb9..7e8c4226d4364 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -1,45 +1,45 @@ groups: -- name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: Loki request error rate is high. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - description: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - summary: Loki requests are causing code panics. - expr: | - sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: Loki request error latency is high. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - description: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - summary: Loki deployment is running more than one compactor. 
- expr: | - sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 - for: 5m - labels: - severity: warning + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + description: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + description: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. + expr: | + sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + description: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: Loki request error latency is high. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. 
+ expr: | + sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled-ssd/rules.yaml b/production/loki-mixin-compiled-ssd/rules.yaml index 2a54ed4fb2e5b..5893770570f6e 100644 --- a/production/loki-mixin-compiled-ssd/rules.yaml +++ b/production/loki-mixin-compiled-ssd/rules.yaml @@ -1,53 +1,39 @@ groups: -- name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: 
cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, 
sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: 
cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index 313541fababb9..7e8c4226d4364 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -1,45 +1,45 @@ groups: -- name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - summary: Loki request error rate is high. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - description: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. 
- summary: Loki requests are causing code panics. - expr: | - sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - description: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - summary: Loki request error latency is high. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - description: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - summary: Loki deployment is running more than one compactor. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 - for: 5m - labels: - severity: warning + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + description: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: Loki request error rate is high. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + description: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + summary: Loki requests are causing code panics. + expr: | + sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + description: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 
+ summary: Loki request error latency is high. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + description: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + summary: Loki deployment is running more than one compactor. + expr: | + sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled/rules.yaml b/production/loki-mixin-compiled/rules.yaml index 2a54ed4fb2e5b..5893770570f6e 100644 --- a/production/loki-mixin-compiled/rules.yaml +++ b/production/loki-mixin-compiled/rules.yaml @@ -1,53 +1,39 @@ groups: -- name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, 
sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - 
job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: 
cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate