From 544958542f8a0b51e55e33fbd3bf495b584f2159 Mon Sep 17 00:00:00 2001 From: paulfantom Date: Sat, 23 Oct 2021 16:13:50 +0200 Subject: [PATCH] apps/monitoring: add grafana mixin --- apps/monitoring/jsonnet/jsonnetfile.json | 31 +- apps/monitoring/jsonnet/jsonnetfile.lock.json | 20 +- apps/monitoring/jsonnet/main.jsonnet | 22 +- .../grafana/dashboardDefinitions.yaml | 625 ++++++++++++++++++ .../manifests/grafana/prometheusRule.yaml | 31 + 5 files changed, 703 insertions(+), 26 deletions(-) create mode 100644 apps/monitoring/manifests/grafana/prometheusRule.yaml diff --git a/apps/monitoring/jsonnet/jsonnetfile.json b/apps/monitoring/jsonnet/jsonnetfile.json index 7e8817d3f..f46953adc 100644 --- a/apps/monitoring/jsonnet/jsonnetfile.json +++ b/apps/monitoring/jsonnet/jsonnetfile.json @@ -4,8 +4,8 @@ { "source": { "git": { - "remote": "https://github.com/prometheus-operator/kube-prometheus", - "subdir": "jsonnet/kube-prometheus" + "remote": "https://github.com/grafana/grafana.git", + "subdir": "grafana-mixin" } }, "version": "main" @@ -13,7 +13,7 @@ { "source": { "git": { - "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", "subdir": "" } }, @@ -22,7 +22,8 @@ { "source": { "git": { - "remote": "https://github.com/povilasv/coredns-mixin" + "remote": "https://github.com/povilasv/coredns-mixin.git", + "subdir": "" } }, "version": "master" @@ -30,8 +31,8 @@ { "source": { "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs", - "subdir": "apps/kube-events-exporter" + "remote": "https://github.com/prometheus-operator/kube-prometheus.git", + "subdir": "jsonnet/kube-prometheus" } }, "version": "main" @@ -39,8 +40,8 @@ { "source": { "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs", - "subdir": "apps/pagespeed" + "remote": "https://github.com/thaum-xyz/jsonnet-libs.git", + "subdir": "apps/kube-events-exporter" } }, "version": "main" @@ -48,8 +49,8 @@ { "source": { "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs", - "subdir": "apps/prometheus-exporter" + "remote": "https://github.com/thaum-xyz/jsonnet-libs.git", + "subdir": "apps/pagespeed" } }, "version": "main" @@ -57,8 +58,8 @@ { "source": { "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs", - "subdir": "apps/pushgateway" + "remote": "https://github.com/thaum-xyz/jsonnet-libs.git", + "subdir": "apps/prometheus-exporter" } }, "version": "main" @@ -66,8 +67,8 @@ { "source": { "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs", - "subdir": "apps/sloth" + "remote": "https://github.com/thaum-xyz/jsonnet-libs.git", + "subdir": "apps/pushgateway" } }, "version": "main" @@ -75,7 +76,7 @@ { "source": { "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs", + "remote": "https://github.com/thaum-xyz/jsonnet-libs.git", "subdir": "utils" } }, diff --git a/apps/monitoring/jsonnet/jsonnetfile.lock.json b/apps/monitoring/jsonnet/jsonnetfile.lock.json index 1a5762463..2485bb783 100644 --- a/apps/monitoring/jsonnet/jsonnetfile.lock.json +++ b/apps/monitoring/jsonnet/jsonnetfile.lock.json @@ -21,6 +21,16 @@ "version": "ef1f71a9f65a06b5810527731101de43a1b286d7", "sum": "cdKL5kPYfpWSpTCu4qctmh+gWQqL+4YWom6rw9qLYJU=" }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafana.git", + "subdir": "grafana-mixin" + } + }, + "version": "f496c31018cdb5ecc8b3c30ea96a235a5bcf470a", + "sum": "9MEef25py18Q3NVHaW4NJF+pEL1HDSdR8VJAFD++Efw=" + }, { "source": { "git": { @@ -205,16 +215,6 @@ "version": "376e28ea96207f72b81d145ee2c108ab1ec9e4a7", "sum": "LX92wD2SVhLG4ezLa/vNpO34kcd6/s2YCYXZBNfl7pE=" }, - { - "source": { - "git": { - "remote": "https://github.com/thaum-xyz/jsonnet-libs.git", - "subdir": "apps/sloth" - } - }, - "version": "376e28ea96207f72b81d145ee2c108ab1ec9e4a7", - "sum": "lSwta4y7YmiW8+WUKhjbOBrXz3JFnFI3CpQJtLd9ZrU=" - }, { "source": { "git": { diff --git a/apps/monitoring/jsonnet/main.jsonnet b/apps/monitoring/jsonnet/main.jsonnet index 5cdc73ca8..00d6dfeb6 100644 --- a/apps/monitoring/jsonnet/main.jsonnet +++ b/apps/monitoring/jsonnet/main.jsonnet @@ -113,7 +113,13 @@ local kp = // // TODO: figure out how to make this a JSON/YAML file! - values+:: (import '../config.jsonnet'), + values+:: (import '../config.jsonnet') + + // TODO: Remove this when https://github.com/prometheus-operator/kube-prometheus/pull/1458 is merged + { + grafana+: { + dashboards+: (import 'github.com/grafana/grafana/grafana-mixin/mixin.libsonnet').grafanaDashboards, + }, + }, // // Objects customization @@ -338,6 +344,20 @@ local kp = }, }, + // TODO: Remove PrometheusRule object when https://github.com/prometheus-operator/kube-prometheus/pull/1458 is merged + prometheusRule: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: $.grafana.deployment.metadata { + name: $.grafana.deployment.metadata.name + '-rules', + }, + spec: { + local r = std.parseYaml(importstr 'github.com/grafana/grafana/grafana-mixin/rules/rules.yaml').groups, + local a = std.parseYaml(importstr 'github.com/grafana/grafana/grafana-mixin/alerts/alerts.yaml').groups, + groups: a + r, + }, + }, + pvc: { kind: 'PersistentVolumeClaim', apiVersion: 'v1', diff --git a/apps/monitoring/manifests/grafana/dashboardDefinitions.yaml b/apps/monitoring/manifests/grafana/dashboardDefinitions.yaml index 246fabe62..e838d08b4 100644 --- a/apps/monitoring/manifests/grafana/dashboardDefinitions.yaml +++ b/apps/monitoring/manifests/grafana/dashboardDefinitions.yaml @@ -5372,6 +5372,631 @@ items: app.kubernetes.io/version: 8.2.1 name: grafana-dashboard-controller-manager namespace: monitoring +- apiVersion: v1 + data: + grafana-overview.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [ + + ], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 3085, + "iteration": 1631554945276, + "links": [ + + ], + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "mappings": [ + + ], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": { + + }, + "textMode": "auto" + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Firing Alerts", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": { + + }, + "textMode": "auto" + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Dashboards", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": null, + "displayMode": "auto" + }, + "mappings": [ + + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 10, + "options": { + "showHeader": true + }, + "pluginVersion": "8.1.3", + "targets": [ + { + "expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Build Info", + "transformations": [ + { + "id": "labelsToFields", + "options": { + + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "branch": true, + "container": true, + "goversion": true, + "namespace": true, + "pod": true, + "revision": true + }, + "indexByName": { + "Time": 7, + "Value": 11, + "branch": 4, + "container": 8, + "edition": 2, + "goversion": 6, + "instance": 1, + "job": 0, + "namespace": 9, + "pod": 10, + "revision": 5, + "version": 3 + }, + "renameByName": { + + } + } + } + ], + "type": "table" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ] + }, + "overrides": [ + + ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (status_code) (irate(grafana_http_request_duration_seconds_count{job=~\"$job\", instance=~\"$instance\"}[1m])) ", + "interval": "", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "$$hashKey": "object:157", + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:158", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "links": [ + + ] + }, + "overrides": [ + + ] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1", + "interval": "", + "legendFormat": "99th Percentile", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(irate(grafana_http_request_duration_seconds_bucket{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) by (le)) * 1", + "interval": "", + "legendFormat": "50th Percentile", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(irate(grafana_http_request_duration_seconds_sum{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval])) * 1 / sum(irate(grafana_http_request_duration_seconds_count{instance=~\"$instance\", job=~\"$job\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Average", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "$$hashKey": "object:210", + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:211", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 30, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "dev-cortex", + "value": "dev-cortex" + }, + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": [ + "default/grafana" + ], + "value": [ + "default/grafana" + ] + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, job)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [ + + ], + "query": { + "query": "label_values(grafana_build_info, job)", + "refId": "Billing Admin-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, instance)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [ + + ], + "query": { + "query": "label_values(grafana_build_info, instance)", + "refId": "Billing Admin-instance-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Grafana Overview", + "uid": "6be0s85Mk", + "version": 2 + } + kind: ConfigMap + metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.2.1 + name: grafana-dashboard-grafana-overview + namespace: monitoring - apiVersion: v1 data: k8s-resources-cluster.json: |- diff --git a/apps/monitoring/manifests/grafana/prometheusRule.yaml b/apps/monitoring/manifests/grafana/prometheusRule.yaml new file mode 100644 index 000000000..fc1d2b8d5 --- /dev/null +++ b/apps/monitoring/manifests/grafana/prometheusRule.yaml @@ -0,0 +1,31 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 8.2.1 + name: grafana-rules + namespace: monitoring +spec: + groups: + - name: GrafanaAlerts + rules: + - alert: GrafanaRequestsFailing + annotations: + message: '''{{ $labels.namespace }}'' / ''{{ $labels.job }}'' / ''{{ $labels.handler + }}'' is experiencing {{ $value | humanize }}% errors' + expr: | + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / + namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"} + > 0.5 + for: 5m + labels: + severity: warning + - name: grafana_rules + rules: + - expr: | + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) + record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m