From e3d6565af54595249a9b513cc5bb874c40dc0f80 Mon Sep 17 00:00:00 2001
From: Thomas Fan
Date: Wed, 7 Feb 2018 16:38:18 -0500
Subject: [PATCH] Adds compound alerts

---
 docs/usage.md         |   6 ++
 server/server.go      | 126 +++++++++++++++++++++++++++++++++++++-----
 server/server_test.go | 114 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 231 insertions(+), 15 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index e808b6c..3356d1e 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -62,6 +62,12 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
 !!! note
     I hope that the number of shortcuts will grow with time thanks to community contributions. Please create [an issue](https://github.com/vfarcic/docker-flow-monitor/issues) with the `alertIf` statement and the suggested shortcut and I'll add it to the code as soon as possible.
 
+### AlertIf Logical Operators
+
+The logical operators `and`, `unless`, and `or` can be used in combination with AlertIf Parameter Shortcuts. For example, to create an alert that triggers when response time is low unless response time is also high, set `alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`. This alert prevents `@resp_time_below` from firing while `@resp_time_above` is firing. The `summary` annotations of the combined shortcuts are joined with the same operator: "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1". When logical operators are used, no default alert labels are set; they have to be set manually through the `alertLabels` query parameter.
+
+More information on the logical operators can be found in the Prometheus querying [documentation](https://prometheus.io/docs/prometheus/latest/querying/operators/#logical-set-binary-operators).
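+
+For example, a reconfigure request that combines two shortcuts with `unless` could look as follows (the service name, alert name, and labels are illustrative):
+
+```
+/v1/docker-flow-monitor?serviceName=my-service&alertName=my-alert&alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99&alertLabels=receiver=system,service=my-service
+```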
+
 ## Remove
 
 !!! tip
diff --git a/server/server.go b/server/server.go
index 0d4e89f..5eceee0 100644
--- a/server/server.go
+++ b/server/server.go
@@ -324,40 +324,136 @@ var alertIfShortcutData = map[string]alertIfShortcut{
 func (s *serve) formatAlert(alert *prometheus.Alert) {
     alert.AlertNameFormatted = s.getNameFormatted(fmt.Sprintf("%s_%s", alert.ServiceName, alert.AlertName))
-    if strings.HasPrefix(alert.AlertIf, "@") {
+    if !strings.HasPrefix(alert.AlertIf, "@") {
+        return
+    }
+
+    _, bOp, _ := splitCompoundOp(alert.AlertIf)
+    if len(bOp) > 0 {
+        formatCompoundAlert(alert)
+    } else {
+        formatSingleAlert(alert)
+    }
+
+}
+
+func formatSingleAlert(alert *prometheus.Alert) {
+
+    value := ""
+    alertSplit := strings.Split(alert.AlertIf, ":")
+    shortcut := alertSplit[0]
+
+    if len(alertSplit) > 1 {
+        value = alertSplit[1]
+    }
+
+    data, ok := alertIfShortcutData[shortcut]
+    if !ok {
+        return
+    }
+
+    alert.AlertIf = replaceTags(data.expanded, alert, value)
+
+    if alert.AlertAnnotations == nil {
+        alert.AlertAnnotations = map[string]string{}
+    }
+    for k, v := range data.annotations {
+        if _, ok := alert.AlertAnnotations[k]; !ok {
+            alert.AlertAnnotations[k] = replaceTags(v, alert, value)
+        }
+    }
+
+    if alert.AlertLabels == nil {
+        alert.AlertLabels = map[string]string{}
+    }
+    for k, v := range data.labels {
+        if _, ok := alert.AlertLabels[k]; !ok {
+            alert.AlertLabels[k] = replaceTags(v, alert, value)
+        }
+    }
+}
+
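+// formatCompoundAlert expands each shortcut in a compound alertIf expression and joins
+// the expanded queries with the logical operator that separates them. For example,
+// "@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99" expands both
+// shortcuts and joins the resulting queries with " unless ". Shortcut annotations are
+// merged the same way, annotations set explicitly on the alert are left untouched, and
+// no default labels are applied.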
+func formatCompoundAlert(alert *prometheus.Alert) {
+    alertIfStr := alert.AlertIf
+    alertAnnotations := map[string]string{}
+    immutableAnnotations := map[string]struct{}{}
+
+    // record annotation keys that are already set so they are not overwritten below
+    if alert.AlertAnnotations != nil {
+        for k := range alert.AlertAnnotations {
+            immutableAnnotations[k] = struct{}{}
+        }
+    }
+
+    var alertIfFormattedBuffer bytes.Buffer
+
+    currentAlert, bOp, alertIfStr := splitCompoundOp(alertIfStr)
+
+    for len(currentAlert) > 0 {
         value := ""
-        alertSplit := strings.Split(alert.AlertIf, ":")
+        alertSplit := strings.Split(currentAlert, ":")
         shortcut := alertSplit[0]
 
         if len(alertSplit) > 1 {
             value = alertSplit[1]
         }
-
         data, ok := alertIfShortcutData[shortcut]
         if !ok {
             return
         }
 
-        alert.AlertIf = replaceTags(data.expanded, alert, value)
-
-        if alert.AlertAnnotations == nil {
-            alert.AlertAnnotations = map[string]string{}
+        alertIfFormattedBuffer.WriteString(replaceTags(data.expanded, alert, value))
+        if len(bOp) > 0 {
+            alertIfFormattedBuffer.WriteString(fmt.Sprintf(" %s ", bOp))
         }
+
         for k, v := range data.annotations {
-            if _, ok := alert.AlertAnnotations[k]; !ok {
-                alert.AlertAnnotations[k] = replaceTags(v, alert, value)
+            if _, ok := immutableAnnotations[k]; ok {
+                continue
+            }
+            alertAnnotations[k] += replaceTags(v, alert, value)
+            if len(bOp) > 0 {
+                alertAnnotations[k] += fmt.Sprintf(" %s ", bOp)
             }
         }
+        currentAlert, bOp, alertIfStr = splitCompoundOp(alertIfStr)
+    }
 
-        if alert.AlertLabels == nil {
-            alert.AlertLabels = map[string]string{}
+    alert.AlertIf = alertIfFormattedBuffer.String()
+
+    if alert.AlertAnnotations == nil {
+        alert.AlertAnnotations = map[string]string{}
+    }
+
+    for k, v := range alertAnnotations {
+        if _, ok := immutableAnnotations[k]; ok {
+            continue
         }
-        for k, v := range data.labels {
-            if _, ok := alert.AlertLabels[k]; !ok {
-                alert.AlertLabels[k] = replaceTags(v, alert, value)
-            }
+        alert.AlertAnnotations[k] = v
+    }
+
+}
+
+// splitCompoundOp splits a string into three pieces at the first (leftmost) occurrence
+// of _unless_, _and_, or _or_. For example, hello_and_world_or_earth returns
+// [hello, and, world_or_earth].
+func splitCompoundOp(s string) (string, string, string) {
+    binaryOps := []string{"unless", "and", "or"}
+
+    minIdx := len(s)
+    minOp := ""
+    for _, bOp := range binaryOps {
+        idx := strings.Index(s, fmt.Sprintf("_%s_", bOp))
+        if idx != -1 && idx < minIdx {
+            minIdx = idx
+            minOp = bOp
         }
     }
+
+    if len(minOp) > 0 {
+        return s[:minIdx], minOp, s[minIdx+len(minOp)+2:]
+    }
+    return s, "", ""
+
 }
 
 func replaceTags(tag string, alert *prometheus.Alert, value string) string {
diff --git a/server/server_test.go b/server/server_test.go
index 94d3a81..a517dd9 100644
--- a/server/server_test.go
+++ b/server/server_test.go
@@ -7,6 +7,7 @@ import (
     "net/http/httptest"
     "net/url"
     "os"
+    "strings"
     "testing"
     "time"
 
@@ -310,6 +311,119 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts() {
     }
 }
 
+func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts_CompoundOps() {
+    testData := []struct {
+        expected    string
+        shortcut    string
+        annotations map[string]string
+        labels      map[string]string
+    }{
+        {
+            `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
+            `@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
+            map[string]string{"summary": "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
+            map[string]string{},
+        },
+        {
+            `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
+            `@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
+            map[string]string{"summary": "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
+            map[string]string{"receiver": "system", "service": "my-service", "type": "service"},
+        },
+        {
+            `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99 and container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`,
+            `@resp_time_above:0.1,5m,0.99_and_@service_mem_limit:0.8`,
+            map[string]string{"summary": "Response time of the service my-service is above 0.1 and Memory of the service my-service is over 0.8"},
+            map[string]string{"receiver": "system", "service": "my-service"},
+        },
+        {
+            `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99 or container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`,
+            `@resp_time_above:0.1,5m,0.99_or_@service_mem_limit:0.8`,
+            map[string]string{"summary": "Response time of the service my-service is above 0.1 or Memory of the service my-service is over 0.8"},
+            map[string]string{"receiver": "system"},
+        },
+        {
+            `container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8 and sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
+            `@service_mem_limit:0.8_and_@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
+            map[string]string{"summary": "Memory of the service my-service is over 0.8 and Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
+            map[string]string{"receiver": "system"},
+        },
+    }
+
+    for _, data := range testData {
+        expected := prometheus.Alert{
+            AlertAnnotations:   data.annotations,
+            AlertFor:           "my-for",
+            AlertIf:            data.expected,
+            AlertLabels:        data.labels,
+            AlertName:          "my-alert",
+            AlertNameFormatted: "myservice_myalert",
+            ServiceName:        "my-service",
+            Replicas:           3,
+        }
+        rwMock := ResponseWriterMock{}
+        alertQueries := []string{}
+        for k, v := range data.labels {
+            alertQueries = append(alertQueries, fmt.Sprintf("%s=%s", k, v))
+        }
+        alertQueryStr := strings.Join(alertQueries, ",")
+        addr := fmt.Sprintf(
+            "/v1/docker-flow-monitor?serviceName=%s&alertName=%s&alertIf=%s&alertFor=%s&replicas=3",
+            expected.ServiceName,
+            expected.AlertName,
+            data.shortcut,
+            expected.AlertFor,
+        )
+        if len(alertQueries) > 0 {
+            addr += fmt.Sprintf("&alertLabels=%s", alertQueryStr)
+        }
+        req, _ := http.NewRequest("GET", addr, nil)
+
+        serve := New()
+        serve.ReconfigureHandler(rwMock, req)
+
+        s.Equal(expected, serve.alerts[expected.AlertNameFormatted])
+    }
+}
+
+func (s *ServerTestSuite) Test_ReconfigureHandler_DoesNotExpandAnnotations_WhenTheyAreAlreadySet_CompoundOps() {
+    testData := struct {
+        expected    string
+        shortcut    string
+        annotations map[string]string
+        labels      map[string]string
+    }{
+        `sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
+        `@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
+        map[string]string{"summary": "not-again"},
+        map[string]string{"receiver": "system", "service": "ugly-service"},
+    }
+    expected := prometheus.Alert{
+        AlertAnnotations:   testData.annotations,
+        AlertFor:           "my-for",
+        AlertIf:            testData.expected,
+        AlertLabels:        testData.labels,
+        AlertName:          "my-alert",
+        AlertNameFormatted: "myservice_myalert",
+        ServiceName:        "my-service",
+        Replicas:           3,
+    }
+    rwMock := ResponseWriterMock{}
+    addr := fmt.Sprintf(
+        "/v1/docker-flow-monitor?serviceName=%s&alertName=%s&alertIf=%s&alertFor=%s&replicas=3&alertAnnotations=summary=not-again&alertLabels=service=ugly-service,receiver=system",
+        expected.ServiceName,
+        expected.AlertName,
+        testData.shortcut,
+        expected.AlertFor,
+    )
+    req, _ := http.NewRequest("GET", addr, nil)
+
+    serve := New()
+    serve.ReconfigureHandler(rwMock, req)
+
+    s.Equal(expected, serve.alerts[expected.AlertNameFormatted])
+}
+
 func (s *ServerTestSuite) Test_ReconfigureHandler_DoesNotExpandAnnotationsAndLabels_WhenTheyAreAlreadySet() {
     testData := struct {
         expected string