
Commit

Merge pull request #32 from thomasjpfan/compound_alerts
Adds compound alerts
vfarcic authored Feb 9, 2018
2 parents 2888083 + e3d6565 commit 03da12d
Showing 3 changed files with 231 additions and 15 deletions.
6 changes: 6 additions & 0 deletions docs/usage.md
@@ -62,6 +62,12 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
!!! note
I hope that the number of shortcuts will grow with time thanks to community contributions. Please create [an issue](https://github.com/vfarcic/docker-flow-monitor/issues) with the `alertIf` statement and the suggested shortcut and I'll add it to the code as soon as possible.

### AlertIf Logical Operators

The logical operators `and`, `unless`, and `or` can be used in combination with AlertIf Parameter Shortcuts. For example, to create an alert that triggers when response time is low unless response time is high, set `alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`. This alert prevents `@resp_time_below` from triggering while `@resp_time_above` is triggering. The `summary` annotations of the combined shortcuts are merged with the operator that joins them: "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1". When logical operators are used, no default alert labels are set, so the labels have to be set manually through the `alertLabels` query parameter.

More information on the logical operators can be found in the Prometheus querying [documentation](https://prometheus.io/docs/prometheus/latest/querying/operators/#logical-set-binary-operators).
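
The snippet below is a minimal sketch of how a compound alert could be registered against the reconfigure endpoint, mirroring the query format used by the tests in this commit; the address `monitor:8080`, the alert name, and the label values are illustrative assumptions, not documented defaults.

```go
package main

import (
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	params := url.Values{}
	params.Set("serviceName", "my-service")
	params.Set("alertName", "my-alert")
	params.Set("alertIf", "@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99")
	params.Set("alertFor", "30s")
	// Compound alerts get no default labels, so set them explicitly.
	params.Set("alertLabels", "receiver=system,service=my-service")

	// "monitor:8080" is a placeholder address for the monitor service.
	addr := "http://monitor:8080/v1/docker-flow-monitor?" + params.Encode()

	resp, err := http.Get(addr)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}
```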

## Remove

!!! tip
126 changes: 111 additions & 15 deletions server/server.go
@@ -324,40 +324,136 @@ var alertIfShortcutData = map[string]alertIfShortcut{

func (s *serve) formatAlert(alert *prometheus.Alert) {
alert.AlertNameFormatted = s.getNameFormatted(fmt.Sprintf("%s_%s", alert.ServiceName, alert.AlertName))
if !strings.HasPrefix(alert.AlertIf, "@") {
return
}

_, bOp, _ := splitCompoundOp(alert.AlertIf)
if len(bOp) > 0 {
formatCompoundAlert(alert)
} else {
formatSingleAlert(alert)
}

}

func formatSingleAlert(alert *prometheus.Alert) {

value := ""
alertSplit := strings.Split(alert.AlertIf, ":")
shortcut := alertSplit[0]

if len(alertSplit) > 1 {
value = alertSplit[1]
}

data, ok := alertIfShortcutData[shortcut]
if !ok {
return
}

alert.AlertIf = replaceTags(data.expanded, alert, value)

if alert.AlertAnnotations == nil {
alert.AlertAnnotations = map[string]string{}
}
for k, v := range data.annotations {
if _, ok := alert.AlertAnnotations[k]; !ok {
alert.AlertAnnotations[k] = replaceTags(v, alert, value)
}
}

if alert.AlertLabels == nil {
alert.AlertLabels = map[string]string{}
}
for k, v := range data.labels {
if _, ok := alert.AlertLabels[k]; !ok {
alert.AlertLabels[k] = replaceTags(v, alert, value)
}
}
}

func formatCompoundAlert(alert *prometheus.Alert) {
alertIfStr := alert.AlertIf
alertAnnotations := map[string]string{}
immutableAnnotations := map[string]struct{}{}

// record annotation keys that are already set so they are not overwritten later
if alert.AlertAnnotations != nil {
for k := range alert.AlertAnnotations {
immutableAnnotations[k] = struct{}{}
}
}

var alertIfFormattedBuffer bytes.Buffer

currentAlert, bOp, alertIfStr := splitCompoundOp(alertIfStr)

for len(currentAlert) > 0 {
value := ""
alertSplit := strings.Split(currentAlert, ":")
shortcut := alertSplit[0]

if len(alertSplit) > 1 {
value = alertSplit[1]
}

data, ok := alertIfShortcutData[shortcut]
if !ok {
return
}

alertIfFormattedBuffer.WriteString(replaceTags(data.expanded, alert, value))
if len(bOp) > 0 {
alertIfFormattedBuffer.WriteString(fmt.Sprintf(" %s ", bOp))
}

for k, v := range data.annotations {
if _, ok := immutableAnnotations[k]; ok {
continue
}
alertAnnotations[k] += replaceTags(v, alert, value)
if len(bOp) > 0 {
alertAnnotations[k] += fmt.Sprintf(" %s ", bOp)
}
}
currentAlert, bOp, alertIfStr = splitCompoundOp(alertIfStr)
}

alert.AlertIf = alertIfFormattedBuffer.String()

if alert.AlertAnnotations == nil {
alert.AlertAnnotations = map[string]string{}
}

for k, v := range alertAnnotations {
if _, ok := immutableAnnotations[k]; ok {
continue
}
alert.AlertAnnotations[k] = v
}

}

// splitCompoundOp splits a string into three pieces when it includes _unless_,
// _and_, or _or_. For example, "hello_and_world_or_earth" returns ["hello", "and", "world_or_earth"].
func splitCompoundOp(s string) (string, string, string) {
binaryOps := []string{"unless", "and", "or"}

minIdx := len(s)
minOp := ""
for _, bOp := range binaryOps {
idx := strings.Index(s, fmt.Sprintf("_%s_", bOp))
if idx != -1 && idx < minIdx {
minIdx = idx
minOp = bOp
}
}

if len(minOp) > 0 {
return s[:minIdx], minOp, s[minIdx+len(minOp)+2:]
}
return s, "", ""

}
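
// Illustrative only, not part of this commit: a hypothetical helper that shows how
// splitCompoundOp can be consumed iteratively, the way formatCompoundAlert does above.
// The input string reuses the shortcuts exercised by the tests; the printed output is
// what the splitting logic above produces for it.
func exampleSplitCompoundOp() {
	s := "@service_mem_limit:0.8_and_@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99"
	for current, op, rest := splitCompoundOp(s); len(current) > 0; current, op, rest = splitCompoundOp(rest) {
		// Prints, in order:
		//   "@service_mem_limit:0.8" "and"
		//   "@resp_time_below:0.025,5m,0.75" "unless"
		//   "@resp_time_above:0.1,5m,0.99" ""
		fmt.Printf("%q %q\n", current, op)
	}
}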

func replaceTags(tag string, alert *prometheus.Alert, value string) string {
114 changes: 114 additions & 0 deletions server/server_test.go
@@ -7,6 +7,7 @@ import (
"net/http/httptest"
"net/url"
"os"
"strings"
"testing"
"time"

@@ -310,6 +311,119 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts() {
}
}

func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts_CompoundOps() {
testData := []struct {
expected string
shortcut string
annotations map[string]string
labels map[string]string
}{
{
`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
`@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
map[string]string{"summary": "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
map[string]string{},
},
{
`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
`@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
map[string]string{"summary": "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
map[string]string{"receiver": "system", "service": "my-service", "type": "service"},
},
{
`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99 and container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`,
`@resp_time_above:0.1,5m,0.99_and_@service_mem_limit:0.8`,
map[string]string{"summary": "Response time of the service my-service is above 0.1 and Memory of the service my-service is over 0.8"},
map[string]string{"receiver": "system", "service": "my-service"},
},
{
`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99 or container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`,
`@resp_time_above:0.1,5m,0.99_or_@service_mem_limit:0.8`,
map[string]string{"summary": "Response time of the service my-service is above 0.1 or Memory of the service my-service is over 0.8"},
map[string]string{"receiver": "system"},
},
{
`container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8 and sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
`@service_mem_limit:0.8_and_@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
map[string]string{"summary": "Memory of the service my-service is over 0.8 and Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
map[string]string{"receiver": "system"},
},
}

for _, data := range testData {
expected := prometheus.Alert{
AlertAnnotations: data.annotations,
AlertFor: "my-for",
AlertIf: data.expected,
AlertLabels: data.labels,
AlertName: "my-alert",
AlertNameFormatted: "myservice_myalert",
ServiceName: "my-service",
Replicas: 3,
}
rwMock := ResponseWriterMock{}
alertQueries := []string{}
for k, v := range data.labels {
alertQueries = append(alertQueries, fmt.Sprintf("%s=%s", k, v))
}
alertQueryStr := strings.Join(alertQueries, ",")
addr := fmt.Sprintf(
"/v1/docker-flow-monitor?serviceName=%s&alertName=%s&alertIf=%s&alertFor=%s&replicas=3",
expected.ServiceName,
expected.AlertName,
data.shortcut,
expected.AlertFor,
)
if len(alertQueries) > 0 {
addr += fmt.Sprintf("&alertLabels=%s", alertQueryStr)
}
req, _ := http.NewRequest("GET", addr, nil)

serve := New()
serve.ReconfigureHandler(rwMock, req)

s.Equal(expected, serve.alerts[expected.AlertNameFormatted])
}
}

func (s *ServerTestSuite) Test_ReconfigureHandler_DoesNotExpandAnnotations_WhenTheyAreAlreadySet_CompoundOps() {
testData := struct {
expected string
shortcut string
annotations map[string]string
labels map[string]string
}{
`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
`@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
map[string]string{"summary": "not-again"},
map[string]string{"receiver": "system", "service": "ugly-service"},
}
expected := prometheus.Alert{
AlertAnnotations: testData.annotations,
AlertFor: "my-for",
AlertIf: testData.expected,
AlertLabels: testData.labels,
AlertName: "my-alert",
AlertNameFormatted: "myservice_myalert",
ServiceName: "my-service",
Replicas: 3,
}
rwMock := ResponseWriterMock{}
addr := fmt.Sprintf(
"/v1/docker-flow-monitor?serviceName=%s&alertName=%s&alertIf=%s&alertFor=%s&replicas=3&alertAnnotations=summary=not-again&alertLabels=service=ugly-service,receiver=system",
expected.ServiceName,
expected.AlertName,
testData.shortcut,
expected.AlertFor,
)
req, _ := http.NewRequest("GET", addr, nil)

serve := New()
serve.ReconfigureHandler(rwMock, req)

s.Equal(expected, serve.alerts[expected.AlertNameFormatted])
}

func (s *ServerTestSuite) Test_ReconfigureHandler_DoesNotExpandAnnotationsAndLabels_WhenTheyAreAlreadySet() {
testData := struct {
expected string
