package alertmanagerserver

import (
	"context"
	"fmt"
	"log/slog"
	"reflect"
	"sort"
	"sync"
	"testing"
	"time"

	"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager"
	"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest"
	"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest"
	"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification"
	"github.com/SigNoz/signoz/pkg/factory"
	"github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest"
	"github.com/SigNoz/signoz/pkg/types"
	"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
	"github.com/SigNoz/signoz/pkg/valuer"

	"github.com/prometheus/alertmanager/config"
	"github.com/prometheus/alertmanager/dispatch"
	"github.com/prometheus/alertmanager/notify"
	"github.com/prometheus/alertmanager/provider/mem"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/model"
	"github.com/prometheus/common/promslog"

	"github.com/stretchr/testify/require"
)
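
// createTestProviderSettings returns provider settings backed by the no-op
// test instrumentation, so providers can be constructed in tests without any
// real telemetry backends.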
func createTestProviderSettings() factory.ProviderSettings {
	return instrumentationtest.New().ToProviderSettings()
}
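
// TestAggrGroup exercises a single aggregation group end to end: the initial
// flush after group_wait, periodic flushes on group_interval, immediate
// flushing for alerts that started in the past, and removal of resolved
// alerts once they have been notified. Note that the repeat interval seen by
// the notify callback must come from the rule's RenotifyInterval (2h here),
// not the route's RepeatInterval (1h).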
func TestAggrGroup(t *testing.T) {
	lset := model.LabelSet{
		"a": "v1",
		"b": "v2",
	}
	opts := &dispatch.RouteOpts{
		Receiver: "n1",
		GroupBy: map[model.LabelName]struct{}{
			"a": {},
			"b": {},
		},
		GroupWait:      1 * time.Second,
		GroupInterval:  300 * time.Millisecond,
		RepeatInterval: 1 * time.Hour,
	}
	route := &dispatch.Route{
		RouteOpts: *opts,
	}
	orgId := "test-org-id"
	ruleId := "test-rule-id"
	notificationConfig := alertmanagertypes.NotificationConfig{
		Renotify: alertmanagertypes.ReNotificationConfig{
			RenotifyInterval: 2 * time.Hour,
		},
		NotificationGroup: map[model.LabelName]struct{}{
			model.LabelName("a"): {},
			model.LabelName("b"): {},
		},
	}
	// Set up the notification manager using the nfmanagertest mock.
	nfManager := nfmanagertest.NewMock()
	nfManager.SetMockConfig(orgId, ruleId, &notificationConfig)

	var (
		a1 = &alertmanagertypes.Alert{
			Alert: model.Alert{
				Labels: model.LabelSet{
					"a":      "v1",
					"b":      "v2",
					"c":      "v3",
					"ruleId": "test-rule-1",
				},
				StartsAt: time.Now().Add(time.Minute),
				EndsAt:   time.Now().Add(time.Hour),
			},
			UpdatedAt: time.Now(),
		}
		a2 = &alertmanagertypes.Alert{
			Alert: model.Alert{
				Labels: model.LabelSet{
					"a":      "v1",
					"b":      "v2",
					"c":      "v4",
					"ruleId": "test-rule-1",
				},
				StartsAt: time.Now().Add(-time.Hour),
				EndsAt:   time.Now().Add(2 * time.Hour),
			},
			UpdatedAt: time.Now(),
		}
		a3 = &alertmanagertypes.Alert{
			Alert: model.Alert{
				Labels: model.LabelSet{
					"a":      "v1",
					"b":      "v2",
					"c":      "v5",
					"ruleId": "test-rule-1",
				},
				StartsAt: time.Now().Add(time.Minute),
				EndsAt:   time.Now().Add(5 * time.Minute),
			},
			UpdatedAt: time.Now(),
		}
	)

	var (
		last       = time.Now()
		current    = time.Now()
		lastCurMtx = &sync.Mutex{}
		alertsCh   = make(chan alertmanagertypes.AlertSlice)
	)

	ntfy := func(ctx context.Context, alerts ...*alertmanagertypes.Alert) bool {
		// Validate that the context is properly populated.
		if _, ok := notify.Now(ctx); !ok {
			t.Errorf("now missing")
		}
		if _, ok := notify.GroupKey(ctx); !ok {
			t.Errorf("group key missing")
		}
		if lbls, ok := notify.GroupLabels(ctx); !ok || !reflect.DeepEqual(lbls, lset) {
			t.Errorf("wrong group labels: %q", lbls)
		}
		if rcv, ok := notify.ReceiverName(ctx); !ok || rcv != opts.Receiver {
			t.Errorf("wrong receiver: %q", rcv)
		}
		if ri, ok := notify.RepeatInterval(ctx); !ok || ri != notificationConfig.Renotify.RenotifyInterval {
			t.Errorf("wrong repeat interval: %q", ri)
		}

		lastCurMtx.Lock()
		last = current
		// Subtract a millisecond to allow for races.
		current = time.Now().Add(-time.Millisecond)
		lastCurMtx.Unlock()

		alertsCh <- alertmanagertypes.AlertSlice(alerts)

		return true
	}
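
	// removeEndsAt clears EndsAt on each alert in a slice. Expected batches
	// are built with it because still-firing alerts are flushed with a zero
	// EndsAt; only genuinely resolved alerts (see the resolution cases below)
	// keep their end time.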
	removeEndsAt := func(as alertmanagertypes.AlertSlice) alertmanagertypes.AlertSlice {
		for i, a := range as {
			ac := *a
			ac.EndsAt = time.Time{}
			as[i] = &ac
		}
		return as
	}

	// Test regular situation where we wait for group_wait to send out alerts.
	ag := newAggrGroup(context.Background(), lset, route, nil, promslog.NewNopLogger(), notificationConfig.Renotify.RenotifyInterval)

	go ag.run(ntfy)

	ag.insert(a1)

	select {
	case <-time.After(2 * opts.GroupWait):
		t.Fatalf("expected initial batch after group_wait")

	case batch := <-alertsCh:
		lastCurMtx.Lock()
		s := time.Since(last)
		lastCurMtx.Unlock()
		if s < opts.GroupWait {
			t.Fatalf("received batch too early after %v", s)
		}
		exp := removeEndsAt(alertmanagertypes.AlertSlice{a1})
		sort.Sort(batch)

		if !reflect.DeepEqual(batch, exp) {
			t.Fatalf("expected alerts %v but got %v", exp, batch)
		}
	}

	for i := 0; i < 3; i++ {
		// A new alert should come in after the group interval.
		ag.insert(a3)

		select {
		case <-time.After(2 * opts.GroupInterval):
			t.Fatalf("expected new batch after group interval but received none")

		case batch := <-alertsCh:
			lastCurMtx.Lock()
			s := time.Since(last)
			lastCurMtx.Unlock()
			if s < opts.GroupInterval {
				t.Fatalf("received batch too early after %v", s)
			}
			exp := removeEndsAt(alertmanagertypes.AlertSlice{a1, a3})
			sort.Sort(batch)

			if !reflect.DeepEqual(batch, exp) {
				t.Fatalf("expected alerts %v but got %v", exp, batch)
			}
		}
	}

	ag.stop()

	// Add an alert that started more than group_interval in the past. We expect
	// immediate flushing.
	// Finally, set all alerts to be resolved. After a successful notify, the
	// aggregation group should empty itself.
	ag = newAggrGroup(context.Background(), lset, route, nil, promslog.NewNopLogger(), notificationConfig.Renotify.RenotifyInterval)
	go ag.run(ntfy)

	ag.insert(a1)
	ag.insert(a2)

	// a2 lies way in the past so the initial group_wait should be skipped.
	select {
	case <-time.After(opts.GroupWait / 2):
		t.Fatalf("expected immediate alert but received none")

	case batch := <-alertsCh:
		exp := removeEndsAt(alertmanagertypes.AlertSlice{a1, a2})
		sort.Sort(batch)

		if !reflect.DeepEqual(batch, exp) {
			t.Fatalf("expected alerts %v but got %v", exp, batch)
		}
	}

	for i := 0; i < 3; i++ {
		// A new alert should come in after the group interval.
		ag.insert(a3)

		select {
		case <-time.After(2 * opts.GroupInterval):
			t.Fatalf("expected new batch after group interval but received none")

		case batch := <-alertsCh:
			lastCurMtx.Lock()
			s := time.Since(last)
			lastCurMtx.Unlock()
			if s < opts.GroupInterval {
				t.Fatalf("received batch too early after %v", s)
			}
			exp := removeEndsAt(alertmanagertypes.AlertSlice{a1, a2, a3})
			sort.Sort(batch)

			if !reflect.DeepEqual(batch, exp) {
				t.Fatalf("expected alerts %v but got %v", exp, batch)
			}
		}
	}

	// Resolve an alert; it should be removed after the next batch is sent.
	a1r := *a1
	a1r.EndsAt = time.Now()
	ag.insert(&a1r)
	exp := append(alertmanagertypes.AlertSlice{&a1r}, removeEndsAt(alertmanagertypes.AlertSlice{a2, a3})...)

	select {
	case <-time.After(2 * opts.GroupInterval):
		t.Fatalf("expected new batch after group interval but received none")
	case batch := <-alertsCh:
		lastCurMtx.Lock()
		s := time.Since(last)
		lastCurMtx.Unlock()
		if s < opts.GroupInterval {
			t.Fatalf("received batch too early after %v", s)
		}
		sort.Sort(batch)

		if !reflect.DeepEqual(batch, exp) {
			t.Fatalf("expected alerts %v but got %v", exp, batch)
		}
	}

	// Resolve all remaining alerts; they should be removed after the next batch is sent.
	// Do not add a1r, as it should have been deleted following the previous batch.
	a2r, a3r := *a2, *a3
	resolved := alertmanagertypes.AlertSlice{&a2r, &a3r}
	for _, a := range resolved {
		a.EndsAt = time.Now()
		ag.insert(a)
	}

	select {
	case <-time.After(2 * opts.GroupInterval):
		t.Fatalf("expected new batch after group interval but received none")

	case batch := <-alertsCh:
		lastCurMtx.Lock()
		s := time.Since(last)
		lastCurMtx.Unlock()
		if s < opts.GroupInterval {
			t.Fatalf("received batch too early after %v", s)
		}
		sort.Sort(batch)

		if !reflect.DeepEqual(batch, resolved) {
			t.Fatalf("expected alerts %v but got %v", resolved, batch)
		}

		if !ag.empty() {
			t.Fatalf("expected aggregation group to be empty after resolving alerts: %v", ag)
		}
	}

	ag.stop()
}
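
// TestGroupLabels checks that getGroupLabels projects an alert's full label
// set onto the route's group_by keys, dropping everything else.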
func TestGroupLabels(t *testing.T) {
	a := &alertmanagertypes.Alert{
		Alert: model.Alert{
			Labels: model.LabelSet{
				"a": "v1",
				"b": "v2",
				"c": "v3",
			},
		},
	}

	route := &dispatch.Route{
		RouteOpts: dispatch.RouteOpts{
			GroupBy: map[model.LabelName]struct{}{
				"a": {},
				"b": {},
			},
			GroupByAll: false,
		},
	}

	expLs := model.LabelSet{
		"a": "v1",
		"b": "v2",
	}

	ls := getGroupLabels(a, route.RouteOpts.GroupBy, false)

	if !reflect.DeepEqual(ls, expLs) {
		t.Fatalf("expected labels are %v, but got %v", expLs, ls)
	}
}
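
// For reference, the semantics getGroupLabels is assumed to implement
// (mirroring the upstream Alertmanager dispatcher): keep only the labels
// named in the group-by set, unless groupByAll is true, in which case the
// alert's entire label set is kept. Illustrative calls (hypothetical, not
// assertions made by this file):
//
//	getGroupLabels(a, map[model.LabelName]struct{}{"a": {}}, false) // => {a="v1"}
//	getGroupLabels(a, nil, true)                                    // => {a="v1", b="v2", c="v3"}
//
// TestGroupLabels above exercises only the groupByAll=false path.

// TestAggrRouteMap wires a rule-based notification manager, backed by a mock
// SQL route store, into the dispatcher and verifies that policy expressions
// such as `ruleId == "..." && threshold.name == "critical"` fan alerts out to
// the matching receivers, with one aggregation group per notification-group
// label set.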
func TestAggrRouteMap(t *testing.T) {
	// Simplified config with just receivers and default route - no hardcoded routing rules.
	confData := `receivers:
- name: 'slack'
- name: 'email'
- name: 'pagerduty'

route:
  group_by: ['alertname']
  group_wait: 1m
  group_interval: 1m
  receiver: 'slack'`
	conf, err := config.Load(confData)
	if err != nil {
		t.Fatal(err)
	}
	providerSettings := createTestProviderSettings()
	logger := providerSettings.Logger
	route := dispatch.NewRoute(conf.Route, nil)
	marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
	alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer alerts.Close()

	timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
	recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)}
	metrics := NewDispatcherMetrics(false, prometheus.NewRegistry())
	store := nfroutingstoretest.NewMockSQLRouteStore()
	store.MatchExpectationsInOrder(false)
	nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store)
	if err != nil {
		t.Fatal(err)
	}
	orgId := "test-org"

	ctx := context.Background()
	routes := []*alertmanagertypes.RoutePolicy{
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-OtherAlert" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-OtherAlert",
			Description:    "Route for OtherAlert critical to Slack",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"slack"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-OtherAlert" && threshold.name == "warning"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-OtherAlert",
			Description:    "Route for OtherAlert warning to Email",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"email"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-HighLatency" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-HighLatency",
			Description:    "High priority route for HighLatency to PagerDuty",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"pagerduty"},
		},
	}
	// Set up SQL mock expectations for the CreateBatch call.
	store.ExpectCreateBatch(routes)
	err = nfManager.CreateRoutePolicies(ctx, orgId, routes)
	require.NoError(t, err)
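
	// The flow exercised here, as observed from the mock expectations below:
	// for each incoming alert the manager fetches the route policies stored
	// under the alert's ruleId label and evaluates each expression against
	// the alert's labels; the channels of matching policies become the
	// receivers. Alerts that satisfy no expression (e.g. those without a
	// threshold.name label) end up with no policy-derived receiver.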
	dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId)
	go dispatcher.Run()
	defer dispatcher.Stop()
	inputAlerts := []*alertmanagertypes.Alert{
		newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "dc", "service": "dd", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "aa", "service": "api", "kafka": "yes", "instance": "inst3"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "test-db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}),
	}
	// Set up expectations for getting routes during matching (one lookup per alert).
	for i := 0; i < len(inputAlerts); i++ {
		store.ExpectGetAllByName(orgId, string(inputAlerts[i].Labels["ruleId"]), routes)
	}
	notiConfigs := map[string]alertmanagertypes.NotificationConfig{
		"ruleId-OtherAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("cluster"): {},
				model.LabelName("service"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 10,
			},
			UsePolicy: false,
		},
		"ruleId-TestingAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("service"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 11,
			},
			UsePolicy: false,
		},
		"ruleId-HighErrorRate": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("cluster"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 12,
			},
			UsePolicy: false,
		},
		"ruleId-HighLatency": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("service"): {},
				model.LabelName("kafka"):   {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 13,
			},
			UsePolicy: false,
		},
	}

	for ruleID, config := range notiConfigs {
		err := nfManager.SetNotificationConfig(orgId, ruleID, &config)
		require.NoError(t, err)
	}
	err = alerts.Put(inputAlerts...)
	if err != nil {
		t.Fatal(err)
	}

	// Let alerts get processed.
	for i := 0; len(recorder.Alerts()) != 4; i++ {
		time.Sleep(400 * time.Millisecond)
	}
	require.Len(t, recorder.Alerts(), 4)

	alertGroups, receivers := dispatcher.Groups(
		func(*dispatch.Route) bool {
			return true
		}, func(*alertmanagertypes.Alert, time.Time) bool {
			return true
		},
	)

	dispatcher.mtx.RLock()
	aggrGroupsPerRoute := dispatcher.aggrGroupsPerRoute
	dispatcher.mtx.RUnlock()

	require.NotEmpty(t, aggrGroupsPerRoute, "Should have aggregation groups per route")

	routeIDsFound := make(map[string]bool)
	totalAggrGroups := 0

	// First, check that each route ID maps to the expected receiver.
	for route, groups := range aggrGroupsPerRoute {
		routeID := route.ID()
		routeIDsFound[routeID] = true
		expectedReceiver := ""
		switch routeID {
		case "{__receiver__=\"slack\"}":
			expectedReceiver = "slack"
		case "{__receiver__=\"email\"}":
			expectedReceiver = "email"
		case "{__receiver__=\"pagerduty\"}":
			expectedReceiver = "pagerduty"
		}
		if expectedReceiver != "" {
			require.Equal(t, expectedReceiver, route.RouteOpts.Receiver,
				"Route %s should have receiver %s", routeID, expectedReceiver)
		}
		totalAggrGroups += len(groups)
	}

	require.Equal(t, 4, totalAggrGroups, "Should have exactly 4 aggregation groups")

	// Verify specific route group counts.
	expectedGroupCounts := map[string]int{
		"{__receiver__=\"slack\"}":     2,
		"{__receiver__=\"pagerduty\"}": 2,
	}

	for route, groups := range aggrGroupsPerRoute {
		routeID := route.ID()
		if expectedCount, exists := expectedGroupCounts[routeID]; exists {
			require.Equal(t, expectedCount, len(groups),
				"Route %s should have %d groups, got %d", routeID, expectedCount, len(groups))
		}
	}
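
	// The expected groups illustrate the group-key format
	// "<routeID>:<group labels>", e.g.
	// {__receiver__="pagerduty"}:{kafka="yes", ruleId="ruleId-HighLatency", service="db"},
	// and the Renotify field carries the per-rule RenotifyInterval configured
	// above (13 for HighLatency, 10 for OtherAlert).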
	require.Equal(t, AlertGroups{
		&AlertGroup{
			Alerts: []*alertmanagertypes.Alert{inputAlerts[7]},
			Labels: model.LabelSet{
				"kafka":   "yes",
				"ruleId":  "ruleId-HighLatency",
				"service": "db",
			},
			Receiver: "pagerduty",
			GroupKey: "{__receiver__=\"pagerduty\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}",
			RouteID:  "{__receiver__=\"pagerduty\"}",
			Renotify: 13,
		},
		&AlertGroup{
			Alerts: []*alertmanagertypes.Alert{inputAlerts[8]},
			Labels: model.LabelSet{
				"kafka":   "yes",
				"ruleId":  "ruleId-HighLatency",
				"service": "test-db",
			},
			Receiver: "pagerduty",
			GroupKey: "{__receiver__=\"pagerduty\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"test-db\"}",
			RouteID:  "{__receiver__=\"pagerduty\"}",
			Renotify: 13,
		},
		&AlertGroup{
			Alerts: []*alertmanagertypes.Alert{inputAlerts[0]},
			Labels: model.LabelSet{
				"cluster": "cc",
				"ruleId":  "ruleId-OtherAlert",
				"service": "dd",
			},
			Renotify: 10,
			Receiver: "slack",
			GroupKey: "{__receiver__=\"slack\"}:{cluster=\"cc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}",
			RouteID:  "{__receiver__=\"slack\"}",
		},
		&AlertGroup{
			Alerts: []*alertmanagertypes.Alert{inputAlerts[1]},
			Labels: model.LabelSet{
				"cluster": "dc",
				"service": "dd",
				"ruleId":  "ruleId-OtherAlert",
			},
			Renotify: 10,
			Receiver: "slack",
			GroupKey: "{__receiver__=\"slack\"}:{cluster=\"dc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}",
			RouteID:  "{__receiver__=\"slack\"}",
		},
	}, alertGroups)
	require.Equal(t, map[model.Fingerprint][]string{
		inputAlerts[0].Fingerprint(): {"slack"},
		inputAlerts[1].Fingerprint(): {"slack"},
		inputAlerts[7].Fingerprint(): {"pagerduty"},
		inputAlerts[8].Fingerprint(): {"pagerduty"},
	}, receivers)
}
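
// TestGroupsWithNodata repeats the routing exercise with threshold.name set
// on every alert, plus one synthetic "no data" alert, and verifies the exact
// receiver fan-out, including a rule (HighLatency) whose policies target two
// different receivers depending on severity.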
func TestGroupsWithNodata(t *testing.T) {
	// Simplified config with just receivers and default route - no hardcoded routing rules.
	confData := `receivers:
- name: 'slack'
- name: 'email'
- name: 'pagerduty'

route:
  group_by: ['alertname']
  group_wait: 10ms
  group_interval: 10ms
  receiver: 'slack'`
	conf, err := config.Load(confData)
	if err != nil {
		t.Fatal(err)
	}
	providerSettings := createTestProviderSettings()
	logger := providerSettings.Logger
	route := dispatch.NewRoute(conf.Route, nil)
	marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
	alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer alerts.Close()

	timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
	recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)}
	metrics := NewDispatcherMetrics(false, prometheus.NewRegistry())
	store := nfroutingstoretest.NewMockSQLRouteStore()
	store.MatchExpectationsInOrder(false)
	nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store)
	if err != nil {
		t.Fatal(err)
	}
	orgId := "test-org"

	ctx := context.Background()
	routes := []*alertmanagertypes.RoutePolicy{
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-OtherAlert" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-OtherAlert",
			Description:    "Route for OtherAlert critical to Slack",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"slack"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-TestingAlert" && threshold.name == "warning"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-TestingAlert",
			Description:    "Route for TestingAlert warning to Slack",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"slack"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-HighErrorRate" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-HighErrorRate",
			Description:    "Route for HighErrorRate critical to Email",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"email"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-HighLatency" && threshold.name == "warning"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-HighLatency",
			Description:    "Route for HighLatency warning to Email",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"email"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `ruleId == "ruleId-HighLatency" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-HighLatency",
			Description:    "Route for HighLatency critical to PagerDuty",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"pagerduty"},
		},
	}
	// Set up SQL mock expectations for the CreateBatch call.
	store.ExpectCreateBatch(routes)
	err = nfManager.CreateRoutePolicies(ctx, orgId, routes)
	require.NoError(t, err)

	dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId)
	go dispatcher.Run()
	defer dispatcher.Stop()

	inputAlerts := []*alertmanagertypes.Alert{
		newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1", "threshold.name": "warning"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3", "threshold.name": "warning"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"ruleId": "ruleId-HighLatency", "nodata": "true"}),
	}
	// Set up expectations with route filtering for each alert.
	store.ExpectGetAllByName(orgId, "ruleId-OtherAlert", []*alertmanagertypes.RoutePolicy{routes[0]})
	store.ExpectGetAllByName(orgId, "ruleId-TestingAlert", []*alertmanagertypes.RoutePolicy{routes[1]})
	store.ExpectGetAllByName(orgId, "ruleId-HighErrorRate", []*alertmanagertypes.RoutePolicy{routes[2]})
	store.ExpectGetAllByName(orgId, "ruleId-HighErrorRate", []*alertmanagertypes.RoutePolicy{routes[2]})
	store.ExpectGetAllByName(orgId, "ruleId-HighErrorRate", []*alertmanagertypes.RoutePolicy{routes[2]})
	store.ExpectGetAllByName(orgId, "ruleId-HighLatency", []*alertmanagertypes.RoutePolicy{routes[3], routes[4]})
	store.ExpectGetAllByName(orgId, "ruleId-HighLatency", []*alertmanagertypes.RoutePolicy{routes[3], routes[4]})
	store.ExpectGetAllByName(orgId, "ruleId-HighLatency", []*alertmanagertypes.RoutePolicy{routes[3], routes[4]})
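
	// The last alert carries only ruleId and nodata="true". It still goes
	// through the HighLatency lookups above and, per expectedReceivers at the
	// end of this test, fans out to both email and pagerduty; its
	// renotification cadence is presumably governed by the NoDataInterval set
	// for ruleId-HighLatency below, rather than by the RenotifyInterval.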
	notiConfigs := map[string]alertmanagertypes.NotificationConfig{
		"ruleId-OtherAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("cluster"): {},
				model.LabelName("service"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 10,
			},
			UsePolicy: false,
		},
		"ruleId-TestingAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("service"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 11,
			},
			UsePolicy: false,
		},
		"ruleId-HighErrorRate": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("cluster"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 12,
			},
			UsePolicy: false,
		},
		"ruleId-HighLatency": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("service"): {},
				model.LabelName("kafka"):   {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 13,
				NoDataInterval:   14,
			},
			UsePolicy: false,
		},
	}

	for ruleID, config := range notiConfigs {
		err := nfManager.SetNotificationConfig(orgId, ruleID, &config)
		require.NoError(t, err)
	}
	err = alerts.Put(inputAlerts...)
	if err != nil {
		t.Fatal(err)
	}

	for i := 0; len(recorder.Alerts()) != 9; i++ {
		time.Sleep(400 * time.Millisecond)
	}
	require.Len(t, recorder.Alerts(), 9)

	alertGroups, receivers := dispatcher.Groups(
		func(*dispatch.Route) bool {
			return true
		}, func(*alertmanagertypes.Alert, time.Time) bool {
			return true
		},
	)

	dispatcher.mtx.RLock()
	aggrGroupsPerRoute := dispatcher.aggrGroupsPerRoute
	dispatcher.mtx.RUnlock()

	require.NotEmpty(t, aggrGroupsPerRoute, "Should have aggregation groups per route")

	routeIDsFound := make(map[string]bool)
	totalAggrGroups := 0

	for route, groups := range aggrGroupsPerRoute {
		routeID := route.ID()
		routeIDsFound[routeID] = true
		expectedReceiver := ""
		switch routeID {
		case "{__receiver__=\"slack\"}":
			expectedReceiver = "slack"
		case "{__receiver__=\"email\"}":
			expectedReceiver = "email"
		case "{__receiver__=\"pagerduty\"}":
			expectedReceiver = "pagerduty"
		}
		if expectedReceiver != "" {
			require.Equal(t, expectedReceiver, route.RouteOpts.Receiver,
				"Route %s should have receiver %s", routeID, expectedReceiver)
		}
		totalAggrGroups += len(groups)
	}

	require.Equal(t, 9, totalAggrGroups, "Should have exactly 9 aggregation groups")

	expectedGroupCounts := map[string]int{
		"{__receiver__=\"slack\"}":     2,
		"{__receiver__=\"email\"}":     5,
		"{__receiver__=\"pagerduty\"}": 2,
	}

	for route, groups := range aggrGroupsPerRoute {
		routeID := route.ID()
		if expectedCount, exists := expectedGroupCounts[routeID]; exists {
			require.Equal(t, expectedCount, len(groups),
				"Route %s should have %d groups, got %d", routeID, expectedCount, len(groups))
		}
	}

	// Verify alert groups contain expected alerts.
	require.Len(t, alertGroups, 9)

	// Verify receivers mapping - exact expectations based on actual routing behavior.
	expectedReceivers := map[model.Fingerprint][]string{
		inputAlerts[0].Fingerprint(): {"slack"}, // OtherAlert critical -> slack
		inputAlerts[1].Fingerprint(): {"slack"}, // TestingAlert warning -> slack
		inputAlerts[2].Fingerprint(): {"email"}, // HighErrorRate critical -> email
		inputAlerts[3].Fingerprint(): {"email"}, // HighErrorRate critical -> email
		inputAlerts[4].Fingerprint(): {"email"}, // HighErrorRate critical -> email
		inputAlerts[5].Fingerprint(): {"email"}, // HighLatency warning -> email
		inputAlerts[6].Fingerprint(): {"pagerduty"},
		inputAlerts[7].Fingerprint(): {"email", "pagerduty"},
	}
	require.Equal(t, expectedReceivers, receivers)
}
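
// TestGroupsWithNotificationPolicy switches every rule to UsePolicy: true, so
// routing is driven by label expressions (cluster, service, instance) fetched
// by expression kind rather than by rule name.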
func TestGroupsWithNotificationPolicy(t *testing.T) {
	// Simplified config with just receivers and default route - no hardcoded routing rules.
	confData := `receivers:
- name: 'slack'
- name: 'email'
- name: 'pagerduty'

route:
  group_by: ['alertname']
  group_wait: 10ms
  group_interval: 10ms
  receiver: 'slack'`
	conf, err := config.Load(confData)
	if err != nil {
		t.Fatal(err)
	}
	providerSettings := createTestProviderSettings()
	logger := providerSettings.Logger
	route := dispatch.NewRoute(conf.Route, nil)
	marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
	alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer alerts.Close()

	timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
	recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)}
	metrics := NewDispatcherMetrics(false, prometheus.NewRegistry())
	store := nfroutingstoretest.NewMockSQLRouteStore()
	store.MatchExpectationsInOrder(false)
	nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store)
	if err != nil {
		t.Fatal(err)
	}
	orgId := "test-org"

	ctx := context.Background()
	routes := []*alertmanagertypes.RoutePolicy{
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `cluster == "bb" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-OtherAlert",
			Description:    "Policy routing cluster bb criticals to Slack",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"slack"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `service == "db" && threshold.name == "critical"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-TestingAlert",
			Description:    "Policy routing service db criticals to Slack",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"slack"},
		},
		{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     `cluster == "bb" && instance == "inst1"`,
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           "ruleId-HighErrorRate",
			Description:    "Policy routing cluster bb, instance inst1 to Email",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"email"},
		},
	}
	// Set up SQL mock expectations for the CreateBatch call.
	store.ExpectCreateBatch(routes)
	err = nfManager.CreateRoutePolicies(ctx, orgId, routes)
	require.NoError(t, err)

	dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId)
	go dispatcher.Run()
	defer dispatcher.Stop()

	inputAlerts := []*alertmanagertypes.Alert{
		newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "db", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1", "threshold.name": "warning"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst1", "threshold.name": "warning"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}),
		newAlert(model.LabelSet{"ruleId": "ruleId-HighLatency", "nodata": "true"}),
	}
	// Set up expectations: every alert triggers a lookup of all policy-based routes for the org.
	for i := 0; i < len(inputAlerts); i++ {
		store.ExpectGetAllByKindAndOrgID(orgId, alertmanagertypes.PolicyBasedExpression, routes)
	}
	notiConfigs := map[string]alertmanagertypes.NotificationConfig{
		"ruleId-OtherAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("cluster"): {},
				model.LabelName("service"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 10,
			},
			UsePolicy: true,
		},
		"ruleId-TestingAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("service"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 11,
			},
			UsePolicy: true,
		},
		"ruleId-HighErrorRate": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("cluster"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 12,
			},
			UsePolicy: true,
		},
		"ruleId-HighLatency": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("service"): {},
				model.LabelName("kafka"):   {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 13,
				NoDataInterval:   14,
			},
			UsePolicy: true,
		},
	}

	for ruleID, config := range notiConfigs {
		err := nfManager.SetNotificationConfig(orgId, ruleID, &config)
		require.NoError(t, err)
	}
	err = alerts.Put(inputAlerts...)
	if err != nil {
		t.Fatal(err)
	}

	for i := 0; len(recorder.Alerts()) != 5 && i < 15; i++ {
		time.Sleep(400 * time.Millisecond)
	}
	require.Len(t, recorder.Alerts(), 5)

	alertGroups, receivers := dispatcher.Groups(
		func(*dispatch.Route) bool {
			return true
		}, func(*alertmanagertypes.Alert, time.Time) bool {
			return true
		},
	)

	dispatcher.mtx.RLock()
	aggrGroupsPerRoute := dispatcher.aggrGroupsPerRoute
	dispatcher.mtx.RUnlock()

	require.NotEmpty(t, aggrGroupsPerRoute, "Should have aggregation groups per route")

	routeIDsFound := make(map[string]bool)
	totalAggrGroups := 0

	for route, groups := range aggrGroupsPerRoute {
		routeID := route.ID()
		routeIDsFound[routeID] = true
		expectedReceiver := ""
		switch routeID {
		case "{__receiver__=\"slack\"}":
			expectedReceiver = "slack"
		case "{__receiver__=\"email\"}":
			expectedReceiver = "email"
		case "{__receiver__=\"pagerduty\"}":
			expectedReceiver = "pagerduty"
		}
		if expectedReceiver != "" {
			require.Equal(t, expectedReceiver, route.RouteOpts.Receiver,
				"Route %s should have receiver %s", routeID, expectedReceiver)
		}
		totalAggrGroups += len(groups)
	}

	require.Equal(t, 5, totalAggrGroups, "Should have exactly 5 aggregation groups")

	expectedGroupCounts := map[string]int{
		"{__receiver__=\"slack\"}": 3,
		"{__receiver__=\"email\"}": 2,
	}

	for route, groups := range aggrGroupsPerRoute {
		routeID := route.ID()
		if expectedCount, exists := expectedGroupCounts[routeID]; exists {
			require.Equal(t, expectedCount, len(groups),
				"Route %s should have %d groups, got %d", routeID, expectedCount, len(groups))
		}
	}

	// Verify alert groups contain expected alerts.
	require.Len(t, alertGroups, 5)

	// Verify receivers mapping - based on NotificationPolicy routing without ruleID.
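	// Only four fingerprints are expected: alerts that satisfy no policy
	// expression (TestingAlert, the cluster-aa HighErrorRate alerts, and the
	// bare nodata alert) acquire no receiver at all.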
	expectedReceivers := map[model.Fingerprint][]string{
		inputAlerts[0].Fingerprint(): {"slack"},
		inputAlerts[6].Fingerprint(): {"slack"},
		inputAlerts[4].Fingerprint(): {"email", "slack"},
		inputAlerts[5].Fingerprint(): {"email"},
	}
	require.Equal(t, expectedReceivers, receivers)
}
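
// recordStage is a notify stage that records every alert it sees, keyed by
// group key and fingerprint, so tests can assert on exactly what the
// dispatcher handed off for delivery.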
type recordStage struct {
	mtx    sync.RWMutex
	alerts map[string]map[model.Fingerprint]*alertmanagertypes.Alert
}

func (r *recordStage) Alerts() []*alertmanagertypes.Alert {
	r.mtx.RLock()
	defer r.mtx.RUnlock()
	alerts := make([]*alertmanagertypes.Alert, 0)
	for k := range r.alerts {
		for _, a := range r.alerts[k] {
			alerts = append(alerts, a)
		}
	}
	return alerts
}

func (r *recordStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*alertmanagertypes.Alert) (context.Context, []*alertmanagertypes.Alert, error) {
	r.mtx.Lock()
	defer r.mtx.Unlock()
	gk, ok := notify.GroupKey(ctx)
	if !ok {
		panic("GroupKey not present!")
	}
	if _, ok := r.alerts[gk]; !ok {
		r.alerts[gk] = make(map[model.Fingerprint]*alertmanagertypes.Alert)
	}
	for _, a := range alerts {
		r.alerts[gk][a.Fingerprint()] = a
	}
	return ctx, nil, nil
}

var (
	// Set the start time in the past to trigger a flush immediately.
	t0 = time.Now().Add(-time.Minute)
	// Set the end time in the future to avoid deleting the alert.
	t1 = t0.Add(2 * time.Minute)
)
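
// newAlert builds a firing alert fixture. Because t0 lies in the past and t1
// in the future, a fresh aggregation group flushes it immediately and the
// alert is not garbage-collected mid-test. Typical use (illustrative labels):
//
//	a := newAlert(model.LabelSet{"ruleId": "my-rule", "cluster": "aa"})
//	_ = alerts.Put(a)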
func newAlert(labels model.LabelSet) *alertmanagertypes.Alert {
	return &alertmanagertypes.Alert{
		Alert: model.Alert{
			Labels:       labels,
			Annotations:  model.LabelSet{"foo": "bar"},
			StartsAt:     t0,
			EndsAt:       t1,
			GeneratorURL: "http://example.com/prometheus",
		},
		UpdatedAt: t0,
		Timeout:   false,
	}
}
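
// TestDispatcherRace makes sure that starting and immediately stopping a
// dispatcher does not race; it only pays off when the suite runs with -race.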
func TestDispatcherRace(t *testing.T) {
	logger := promslog.NewNopLogger()
	marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
	alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer alerts.Close()

	timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
	metrics := NewDispatcherMetrics(false, prometheus.NewRegistry())
	nfManager := nfmanagertest.NewMock()
	// No mock config is set up; the notification manager should not be called in this race test.
	dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, nil, logger, metrics, nfManager, "test-org")
	go dispatcher.Run()
	dispatcher.Stop()
}
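
// TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero floods the
// dispatcher with numAlerts alerts, each under its own rule ID and route
// policy, and requires every one of them to reach the recording stage -
// guarding against a race in which the first alert of a freshly created
// aggregation group could be dropped.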
func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T) {
	const numAlerts = 5000
	confData := `receivers:
- name: 'slack'
- name: 'email'
- name: 'pagerduty'

route:
  group_by: ['alertname']
  group_wait: 1h
  group_interval: 1h
  receiver: 'slack'`
	conf, err := config.Load(confData)
	if err != nil {
		t.Fatal(err)
	}
	route := dispatch.NewRoute(conf.Route, nil)
	providerSettings := createTestProviderSettings()
	logger := providerSettings.Logger
	marker := alertmanagertypes.NewMarker(prometheus.NewRegistry())
	alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer alerts.Close()
	timeout := func(d time.Duration) time.Duration { return d }
	recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)}
	metrics := NewDispatcherMetrics(false, prometheus.NewRegistry())
	store := nfroutingstoretest.NewMockSQLRouteStore()
	store.MatchExpectationsInOrder(false)
	nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store)
	if err != nil {
		t.Fatal(err)
	}
	orgId := "test-org"

	for i := 0; i < numAlerts; i++ {
		ruleId := fmt.Sprintf("Alert_%d", i)

		notifConfig := alertmanagertypes.NotificationConfig{
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 1 * time.Hour,
			},
			UsePolicy: false,
		}
		route := &alertmanagertypes.RoutePolicy{
			Identifiable: types.Identifiable{
				ID: valuer.GenerateUUID(),
			},
			Expression:     fmt.Sprintf(`ruleId == "%s"`, ruleId),
			ExpressionKind: alertmanagertypes.PolicyBasedExpression,
			Name:           ruleId,
			Description:    "Route for the alert to Slack",
			Enabled:        true,
			OrgID:          orgId,
			Channels:       []string{"slack"},
		}

		store.ExpectGetAllByName(orgId, ruleId, []*alertmanagertypes.RoutePolicy{route})
		err := nfManager.SetNotificationConfig(orgId, ruleId, &notifConfig)
		require.NoError(t, err)
	}

	dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId)
	go dispatcher.Run()
	defer dispatcher.Stop()

	for i := 0; i < numAlerts; i++ {
		ruleId := fmt.Sprintf("Alert_%d", i)
		alert := newAlert(model.LabelSet{"ruleId": model.LabelValue(ruleId)})
		require.NoError(t, alerts.Put(alert))
	}

	for deadline := time.Now().Add(5 * time.Second); time.Now().Before(deadline); {
		if len(recorder.Alerts()) >= numAlerts {
			break
		}

		time.Sleep(10 * time.Millisecond)
	}

	require.Len(t, recorder.Alerts(), numAlerts)
}
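
// TestDispatcher_DoMaintenance verifies that doMaintenance garbage-collects
// empty aggregation groups and clears any mute markers recorded under their
// group keys.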
func TestDispatcher_DoMaintenance(t *testing.T) {
	r := prometheus.NewRegistry()
	marker := alertmanagertypes.NewMarker(r)

	alerts, err := mem.NewAlerts(context.Background(), marker, time.Minute, nil, promslog.NewNopLogger(), nil)
	if err != nil {
		t.Fatal(err)
	}

	route := &dispatch.Route{
		RouteOpts: dispatch.RouteOpts{
			GroupBy:       map[model.LabelName]struct{}{"alertname": {}},
			GroupWait:     0,
			GroupInterval: 5 * time.Minute, // Should never hit in this test.
		},
	}
	timeout := func(d time.Duration) time.Duration { return d }
	recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)}

	ctx := context.Background()
	metrics := NewDispatcherMetrics(false, r)
	nfManager := nfmanagertest.NewMock()
	// No per-rule config is set on the mock; defaults may be consulted during maintenance.
	dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, promslog.NewNopLogger(), metrics, nfManager, "test-org")
	aggrGroups := make(map[*dispatch.Route]map[model.Fingerprint]*aggrGroup)
	aggrGroups[route] = make(map[model.Fingerprint]*aggrGroup)

	// Insert an aggregation group with no alerts.
	labels := model.LabelSet{"alertname": "1"}
	aggrGroup1 := newAggrGroup(ctx, labels, route, timeout, promslog.NewNopLogger(), time.Hour)
	aggrGroups[route][aggrGroup1.fingerprint()] = aggrGroup1
	dispatcher.aggrGroupsPerRoute = aggrGroups
	// Must run, otherwise doMaintenance blocks on aggrGroup1.stop().
	go aggrGroup1.run(func(context.Context, ...*alertmanagertypes.Alert) bool { return true })

	// Insert a marker for the aggregation group's group key.
	marker.SetMuted(route.ID(), aggrGroup1.GroupKey(), []string{"weekends"})
	mutedBy, isMuted := marker.Muted(route.ID(), aggrGroup1.GroupKey())
	require.True(t, isMuted)
	require.Equal(t, []string{"weekends"}, mutedBy)

	// Run the maintenance; the marker should be removed.
	dispatcher.doMaintenance()
	mutedBy, isMuted = marker.Muted(route.ID(), aggrGroup1.GroupKey())
	require.False(t, isMuted)
	require.Empty(t, mutedBy)
}