From f9a70a3a69388cea693ac214e5a60e01b23bc355 Mon Sep 17 00:00:00 2001
From: aniketio-ctrl
Date: Fri, 3 Oct 2025 19:47:15 +0530
Subject: [PATCH] chore: notification routing | added notification routing via expression-based routes (#9195)

* chore: added custom dispatcher
* feat(notification-grouping): added notification grouping
* feat(notification-grouping): added integration test dependency
* feat(notification-grouping): linting and test cases
* feat(notification-grouping): linting and test cases
* feat(notification-grouping): linting and test cases
* feat(notification-grouping): added integration test dependency
* feat(notification-grouping): debug log lines
* feat(notification-grouping): debug log lines
* feat(notification-grouping): debug log lines
* feat(notification-grouping): added integration test dependency
* feat(notification-grouping): added integration test dependency
* feat(notification-grouping): added integration test dependency
* feat(notification-grouping): added structure changes
* feat(notification-grouping): added structure changes
* feat(notification-routing): added notification routing
* chore(notification-grouping): added notification grouping
* Update pkg/alertmanager/nfmanager/rulebasednotification/provider.go

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* chore(notification-grouping): added renotification interval
* fix(notification-grouping): added fix for renotification
* chore(notification-grouping): added no data renotify
* chore(notification-grouping): added no data renotify
* chore(notification-grouping): added no data renotify
* chore(notification-grouping): added no data renotify interval
* chore(notification-grouping): removed errors package from dispatcher
* chore(notification-grouping): removed errors package from dispatcher
* chore(notification-grouping): removed unwanted tests
* chore(notification-grouping): removed unwanted pkg name
* chore(notification-grouping): added delete notification setting
* chore(notification-grouping): added delete notification setting
* Update pkg/alertmanager/nfmanager/nfmanagertest/provider.go

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* chore(notification-grouping): removed nfmanager config | notification settings in postable rule
* chore(notification-grouping): removed nfmanager config | notification settings in postable rule
* chore(notification-grouping): added test for dispatcher
* chore(notification-grouping): added test for dispatcher
* chore(notification-grouping): go linting errors
* chore(notification-grouping): added test cases for aggGroupPerRoute
* chore(notification-grouping): added test cases for aggGroupPerRoute
* chore(notification-grouping): corrected get notification config logic
* Update pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* chore(notification-routing): added notification routing policies
* feat(notification-routing): added test cases for dispatcher
* chore(notification-routing): added notification routing policies
* chore(notification-routing): added notification routing policies
* Apply suggestions from code review

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* chore(notification-routing): added notification routing policies
* chore(notification-routing): added notification routing policies
* Update pkg/alertmanager/alertmanagerserver/distpatcher_test.go

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* chore(notification-routing): sorted imports
* chore(notification-routing): minor edit | pr resolve comments
* chore(notification-grouping): corrected dispatcher test cases
* chore(notification-routing): added notification routing policies
* chore(notification-routing): corrected race condition in test
* chore: resolved pr comments
* chore: passing threshold value to template
* chore: completed delete rule functionality
* chore: added grouping disabled functionality
* chore: added grouping disabled functionality
* chore(notification-routing): resolved pr comments
* chore(notification-routing): resolved pr comments
* chore(notification-routing): resolved pr comments
* chore(notification-routing): sorted imports
* chore(notification-routing): fix linting errors
* chore(notification-routing): removed enabled flags
* fix: test rule multiple threshold (#9224)
* chore: corrected linting errors
* chore: corrected linting errors
* chore: corrected linting errors
* chore: corrected linting errors
* chore: corrected migration errors
* chore: corrected migration errors
* chore: corrected migration errors
* chore: corrected migration errors
* Update pkg/sqlmigration/049_add_route_policy.go

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* chore: added org_id as foreign key
* chore: resolved pr comments
* chore: removed unused route store

---------

Co-authored-by: Srikanth Chekuri
Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
---
 ee/query-service/rules/anomaly.go | 20 +-
 ee/query-service/rules/manager.go | 1 -
 go.mod | 2 +-
 pkg/alertmanager/alertmanager.go | 17 +-
 .../alertmanagerserver/dispatcher.go | 63 +-
 .../alertmanagerserver/distpatcher_test.go | 915 ++++++++++++------
 pkg/alertmanager/alertmanagerserver/server.go | 112 ++-
 .../alertmanagerserver/server_e2e_test.go | 223 +++++
 .../alertmanagerserver/server_test.go | 187 ++++
 pkg/alertmanager/api.go | 126 +++
 .../nfmanager/nfmanagertest/provider.go | 257 ++++-
 .../nfroutingstoretest/route.go | 176 ++++
 .../nfroutingstore/sqlroutingstore/store.go | 93 ++
 .../nfmanager/notificationmanager.go | 17 +-
 .../rulebasednotification/provider.go | 157 ++-
 .../rulebasednotification/provider_test.go | 461 ++++++++-
 pkg/alertmanager/service.go | 8 +-
 .../signozalertmanager/provider.go | 344 ++++++-
 pkg/query-service/app/http_handler.go | 8 +-
 pkg/query-service/rules/base_rule.go | 248 -----
 pkg/query-service/rules/base_rule_test.go | 25 +-
 pkg/query-service/rules/manager.go | 170 ++--
 pkg/query-service/rules/manager_test.go | 238 ++++-
 pkg/query-service/rules/prom_rule.go | 18 +-
 pkg/query-service/rules/promrule_test.go | 2 +-
 pkg/query-service/rules/test_notification.go | 1 -
 pkg/query-service/rules/threshold_rule.go | 18 +-
 .../rules/threshold_rule_test.go | 76 +-
 pkg/signoz/provider.go | 6 +-
 pkg/signoz/signoz.go | 5 +-
 pkg/sqlmigration/049_add_route_policy.go | 260 +++++
 pkg/types/alertmanagertypes/alert.go | 14 +
 pkg/types/alertmanagertypes/config.go | 46 +-
 .../alertmanagertypes/expressionroute.go | 139 +++
 pkg/types/alertmanagertypes/receiver.go | 9 +-
 pkg/types/ruletypes/api_params.go | 114 ++-
 pkg/types/ruletypes/api_params_test.go | 22 +-
 pkg/types/ruletypes/constants.go | 1 +
 pkg/types/ruletypes/result_types.go | 4 +
 pkg/types/ruletypes/threshold.go | 133 +--
 40 files changed, 3842 insertions(+), 894 deletions(-)
 create mode
100644 pkg/alertmanager/alertmanagerserver/server_e2e_test.go create mode 100644 pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest/route.go create mode 100644 pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore/store.go create mode 100644 pkg/sqlmigration/049_add_route_policy.go create mode 100644 pkg/types/alertmanagertypes/expressionroute.go diff --git a/ee/query-service/rules/anomaly.go b/ee/query-service/rules/anomaly.go index 53f205e8d004..af988b5d6777 100644 --- a/ee/query-service/rules/anomaly.go +++ b/ee/query-service/rules/anomaly.go @@ -251,7 +251,7 @@ func (r *AnomalyRule) buildAndRunQuery(ctx context.Context, orgID valuer.UUID, t continue } } - results, err := r.Threshold.ShouldAlert(*series) + results, err := r.Threshold.ShouldAlert(*series, r.Unit()) if err != nil { return nil, err } @@ -301,7 +301,7 @@ func (r *AnomalyRule) buildAndRunQueryV5(ctx context.Context, orgID valuer.UUID, continue } } - results, err := r.Threshold.ShouldAlert(*series) + results, err := r.Threshold.ShouldAlert(*series, r.Unit()) if err != nil { return nil, err } @@ -336,14 +336,19 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro resultFPs := map[uint64]struct{}{} var alerts = make(map[uint64]*ruletypes.Alert, len(res)) + ruleReceivers := r.Threshold.GetRuleReceivers() + ruleReceiverMap := make(map[string][]string) + for _, value := range ruleReceivers { + ruleReceiverMap[value.Name] = value.Channels + } + for _, smpl := range res { l := make(map[string]string, len(smpl.Metric)) for _, lbl := range smpl.Metric { l[lbl.Name] = lbl.Value } - value := valueFormatter.Format(smpl.V, r.Unit()) - threshold := valueFormatter.Format(r.TargetVal(), r.Unit()) + threshold := valueFormatter.Format(smpl.Target, smpl.TargetUnit) r.logger.DebugContext(ctx, "Alert template data for rule", "rule_name", r.Name(), "formatter", valueFormatter.Name(), "value", value, "threshold", threshold) tmplData := ruletypes.AlertTemplateData(l, value, threshold) @@ -408,13 +413,12 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro State: model.StatePending, Value: smpl.V, GeneratorURL: r.GeneratorURL(), - Receivers: r.PreferredChannels(), + Receivers: ruleReceiverMap[lbs.Map()[ruletypes.LabelThresholdName]], Missing: smpl.IsMissing, } } r.logger.InfoContext(ctx, "number of alerts found", "rule_name", r.Name(), "alerts_count", len(alerts)) - // alerts[h] is ready, add or update active list now for h, a := range alerts { // Check whether we already have alerting state for the identifying label set. 
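The Eval hunks above replace the rule-wide PreferredChannels() with a per-threshold lookup: Threshold.GetRuleReceivers() yields one entry per threshold (a name plus its channels), and each alert resolves its receivers from the threshold-name label it carries. A minimal, self-contained sketch of that lookup follows; the RuleReceiver type, labelThresholdName constant, and receiversFor helper are illustrative stand-ins, not the actual SigNoz definitions.

```go
package main

import "fmt"

// RuleReceiver mirrors the shape returned by GetRuleReceivers in the hunk
// above: a threshold name and the channels configured for it. The name is
// an illustrative stand-in, not the real SigNoz type.
type RuleReceiver struct {
	Name     string
	Channels []string
}

// labelThresholdName stands in for ruletypes.LabelThresholdName; the tests
// in this patch use the label key "threshold.name".
const labelThresholdName = "threshold.name"

// receiversFor builds the name->channels map once and resolves an alert's
// receivers from its threshold-name label; alerts without that label get
// no per-threshold receivers.
func receiversFor(receivers []RuleReceiver, alertLabels map[string]string) []string {
	byName := make(map[string][]string, len(receivers))
	for _, r := range receivers {
		byName[r.Name] = r.Channels
	}
	if name, ok := alertLabels[labelThresholdName]; ok {
		return byName[name]
	}
	return nil
}

func main() {
	receivers := []RuleReceiver{
		{Name: "critical", Channels: []string{"pagerduty"}},
		{Name: "warning", Channels: []string{"email"}},
	}
	labels := map[string]string{
		"ruleId":           "ruleId-HighLatency",
		labelThresholdName: "critical",
	}
	fmt.Println(receiversFor(receivers, labels)) // [pagerduty]
}
```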
@@ -423,7 +427,9 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro alert.Value = a.Value alert.Annotations = a.Annotations - alert.Receivers = r.PreferredChannels() + if v, ok := alert.Labels.Map()[ruletypes.LabelThresholdName]; ok { + alert.Receivers = ruleReceiverMap[v] + } continue } diff --git a/ee/query-service/rules/manager.go b/ee/query-service/rules/manager.go index 3212031f9f3f..31009b3c3091 100644 --- a/ee/query-service/rules/manager.go +++ b/ee/query-service/rules/manager.go @@ -126,7 +126,6 @@ func TestNotification(opts baserules.PrepareTestRuleOptions) (int, *basemodel.Ap if parsedRule.RuleType == ruletypes.RuleTypeThreshold { // add special labels for test alerts - parsedRule.Annotations[labels.AlertSummaryLabel] = fmt.Sprintf("The rule threshold is set to %.4f, and the observed metric value is {{$value}}.", *parsedRule.RuleCondition.Target) parsedRule.Labels[labels.RuleSourceLabel] = "" parsedRule.Labels[labels.AlertRuleIdLabel] = "" diff --git a/go.mod b/go.mod index 32eee2547a71..fcd2e0124a9b 100644 --- a/go.mod +++ b/go.mod @@ -127,7 +127,7 @@ require ( github.com/elastic/lunes v0.1.0 // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect - github.com/expr-lang/expr v1.17.5 // indirect + github.com/expr-lang/expr v1.17.5 github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index e38ddbe633e9..4c3ecce299f4 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -3,6 +3,8 @@ package alertmanager import ( "context" + amConfig "github.com/prometheus/alertmanager/config" + "github.com/SigNoz/signoz/pkg/errors" "github.com/SigNoz/signoz/pkg/factory" "github.com/SigNoz/signoz/pkg/statsreporter" @@ -26,7 +28,7 @@ type Alertmanager interface { TestReceiver(context.Context, string, alertmanagertypes.Receiver) error // TestAlert sends an alert to a list of receivers. - TestAlert(ctx context.Context, orgID string, alert *alertmanagertypes.PostableAlert, receivers []string) error + TestAlert(ctx context.Context, orgID string, ruleID string, receiversMap map[*alertmanagertypes.PostableAlert][]string) error // ListChannels lists all channels for the organization. 
ListChannels(context.Context, string) ([]*alertmanagertypes.Channel, error) @@ -59,6 +61,19 @@ type Alertmanager interface { DeleteNotificationConfig(ctx context.Context, orgID valuer.UUID, ruleId string) error + // Notification Policy CRUD + CreateRoutePolicy(ctx context.Context, route *alertmanagertypes.PostableRoutePolicy) (*alertmanagertypes.GettableRoutePolicy, error) + CreateRoutePolicies(ctx context.Context, routeRequests []*alertmanagertypes.PostableRoutePolicy) ([]*alertmanagertypes.GettableRoutePolicy, error) + GetRoutePolicyByID(ctx context.Context, routeID string) (*alertmanagertypes.GettableRoutePolicy, error) + GetAllRoutePolicies(ctx context.Context) ([]*alertmanagertypes.GettableRoutePolicy, error) + UpdateRoutePolicyByID(ctx context.Context, routeID string, route *alertmanagertypes.PostableRoutePolicy) (*alertmanagertypes.GettableRoutePolicy, error) + DeleteRoutePolicyByID(ctx context.Context, routeID string) error + DeleteAllRoutePoliciesByRuleId(ctx context.Context, ruleId string) error + UpdateAllRoutePoliciesByRuleId(ctx context.Context, ruleId string, routes []*alertmanagertypes.PostableRoutePolicy) error + + CreateInhibitRules(ctx context.Context, orgID valuer.UUID, rules []amConfig.InhibitRule) error + DeleteAllInhibitRulesByRuleId(ctx context.Context, orgID valuer.UUID, ruleId string) error + // Collects stats for the organization. statsreporter.StatsCollector } diff --git a/pkg/alertmanager/alertmanagerserver/dispatcher.go b/pkg/alertmanager/alertmanagerserver/dispatcher.go index f5063177534d..7f040b8e5b06 100644 --- a/pkg/alertmanager/alertmanagerserver/dispatcher.go +++ b/pkg/alertmanager/alertmanagerserver/dispatcher.go @@ -10,19 +10,17 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/errors" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" "github.com/prometheus/alertmanager/dispatch" "github.com/prometheus/alertmanager/notify" + "github.com/prometheus/alertmanager/pkg/labels" "github.com/prometheus/alertmanager/provider" "github.com/prometheus/alertmanager/store" "github.com/prometheus/alertmanager/types" "github.com/prometheus/common/model" ) -const ( - noDataLabel = model.LabelName("nodata") -) - // Dispatcher sorts incoming alerts into aggregation groups and // assigns the correct notifiers to each. 
type Dispatcher struct { @@ -46,6 +44,7 @@ type Dispatcher struct { logger *slog.Logger notificationManager nfmanager.NotificationManager orgID string + receiverRoutes map[string]*dispatch.Route } // We use the upstream Limits interface from Prometheus @@ -90,6 +89,7 @@ func (d *Dispatcher) Run() { d.mtx.Lock() d.aggrGroupsPerRoute = map[*dispatch.Route]map[model.Fingerprint]*aggrGroup{} + d.receiverRoutes = map[string]*dispatch.Route{} d.aggrGroupsNum = 0 d.metrics.aggrGroups.Set(0) d.ctx, d.cancel = context.WithCancel(context.Background()) @@ -125,8 +125,14 @@ func (d *Dispatcher) run(it provider.AlertIterator) { } now := time.Now() - for _, r := range d.route.Match(alert.Labels) { - d.processAlert(alert, r) + channels, err := d.notificationManager.Match(d.ctx, d.orgID, getRuleIDFromAlert(alert), alert.Labels) + if err != nil { + d.logger.ErrorContext(d.ctx, "Error on alert match", "err", err) + continue + } + for _, channel := range channels { + route := d.getOrCreateRoute(channel) + d.processAlert(alert, route) } d.metrics.processingDuration.Observe(time.Since(now).Seconds()) @@ -266,6 +272,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool // processAlert determines in which aggregation group the alert falls // and inserts it. +// no data alert will only have ruleId and no data label func (d *Dispatcher) processAlert(alert *types.Alert, route *dispatch.Route) { ruleId := getRuleIDFromAlert(alert) config, err := d.notificationManager.GetNotificationConfig(d.orgID, ruleId) @@ -273,8 +280,14 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *dispatch.Route) { d.logger.ErrorContext(d.ctx, "error getting alert notification config", "rule_id", ruleId, "error", err) return } + renotifyInterval := config.Renotify.RenotifyInterval - groupLabels := getGroupLabels(alert, config.NotificationGroup) + groupLabels := getGroupLabels(alert, config.NotificationGroup, config.GroupByAll) + + if alertmanagertypes.NoDataAlert(alert) { + renotifyInterval = config.Renotify.NoDataInterval + groupLabels[alertmanagertypes.NoDataLabel] = alert.Labels[alertmanagertypes.NoDataLabel] //to create new group key for no data alerts + } fp := groupLabels.Fingerprint() @@ -299,12 +312,6 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *dispatch.Route) { d.logger.ErrorContext(d.ctx, "Too many aggregation groups, cannot create new group for alert", "groups", d.aggrGroupsNum, "limit", limit, "alert", alert.Name()) return } - renotifyInterval := config.Renotify.RenotifyInterval - - if noDataAlert(alert) { - renotifyInterval = config.Renotify.NoDataInterval - groupLabels[noDataLabel] = alert.Labels[noDataLabel] - } ag = newAggrGroup(d.ctx, groupLabels, route, d.timeout, d.logger, renotifyInterval) @@ -543,21 +550,35 @@ func deepCopyRouteOpts(opts dispatch.RouteOpts, renotify time.Duration) dispatch return newOpts } -func getGroupLabels(alert *types.Alert, groups map[model.LabelName]struct{}) model.LabelSet { +func getGroupLabels(alert *types.Alert, groups map[model.LabelName]struct{}, groupByAll bool) model.LabelSet { groupLabels := model.LabelSet{} for ln, lv := range alert.Labels { - if _, ok := groups[ln]; ok { + if _, ok := groups[ln]; ok || groupByAll { groupLabels[ln] = lv } } - return groupLabels } -func noDataAlert(alert *types.Alert) bool { - if _, ok := alert.Labels[noDataLabel]; ok { - return true - } else { - return false +func (d *Dispatcher) getOrCreateRoute(receiver string) *dispatch.Route { + d.mtx.Lock() + defer d.mtx.Unlock() + if route, exists := 
d.receiverRoutes[receiver]; exists { + return route } + route := &dispatch.Route{ + RouteOpts: dispatch.RouteOpts{ + Receiver: receiver, + GroupWait: 30 * time.Second, + GroupInterval: 5 * time.Minute, + GroupByAll: false, + }, + Matchers: labels.Matchers{{ + Name: "__receiver__", + Value: receiver, + Type: labels.MatchEqual, + }}, + } + d.receiverRoutes[receiver] = route + return route } diff --git a/pkg/alertmanager/alertmanagerserver/distpatcher_test.go b/pkg/alertmanager/alertmanagerserver/distpatcher_test.go index 36369a35049d..5f084ec21470 100644 --- a/pkg/alertmanager/alertmanagerserver/distpatcher_test.go +++ b/pkg/alertmanager/alertmanagerserver/distpatcher_test.go @@ -10,21 +10,31 @@ import ( "testing" "time" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification" + "github.com/SigNoz/signoz/pkg/factory" + "github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest" + "github.com/SigNoz/signoz/pkg/types" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + "github.com/SigNoz/signoz/pkg/valuer" + "github.com/prometheus/alertmanager/config" "github.com/prometheus/alertmanager/dispatch" + "github.com/prometheus/alertmanager/notify" + "github.com/prometheus/alertmanager/provider/mem" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/prometheus/common/promslog" - "github.com/stretchr/testify/require" - "github.com/prometheus/alertmanager/config" - "github.com/prometheus/alertmanager/notify" - "github.com/prometheus/alertmanager/provider/mem" - "github.com/prometheus/alertmanager/types" + "github.com/stretchr/testify/require" ) +func createTestProviderSettings() factory.ProviderSettings { + return instrumentationtest.New().ToProviderSettings() +} + func TestAggrGroup(t *testing.T) { lset := model.LabelSet{ "a": "v1", @@ -59,7 +69,7 @@ func TestAggrGroup(t *testing.T) { nfManager.SetMockConfig(orgId, ruleId, ¬ificationConfig) var ( - a1 = &types.Alert{ + a1 = &alertmanagertypes.Alert{ Alert: model.Alert{ Labels: model.LabelSet{ "a": "v1", @@ -72,7 +82,7 @@ func TestAggrGroup(t *testing.T) { }, UpdatedAt: time.Now(), } - a2 = &types.Alert{ + a2 = &alertmanagertypes.Alert{ Alert: model.Alert{ Labels: model.LabelSet{ "a": "v1", @@ -85,7 +95,7 @@ func TestAggrGroup(t *testing.T) { }, UpdatedAt: time.Now(), } - a3 = &types.Alert{ + a3 = &alertmanagertypes.Alert{ Alert: model.Alert{ Labels: model.LabelSet{ "a": "v1", @@ -104,10 +114,10 @@ func TestAggrGroup(t *testing.T) { last = time.Now() current = time.Now() lastCurMtx = &sync.Mutex{} - alertsCh = make(chan types.AlertSlice) + alertsCh = make(chan alertmanagertypes.AlertSlice) ) - ntfy := func(ctx context.Context, alerts ...*types.Alert) bool { + ntfy := func(ctx context.Context, alerts ...*alertmanagertypes.Alert) bool { // Validate that the context is properly populated. 
if _, ok := notify.Now(ctx); !ok { t.Errorf("now missing") @@ -131,12 +141,12 @@ func TestAggrGroup(t *testing.T) { current = time.Now().Add(-time.Millisecond) lastCurMtx.Unlock() - alertsCh <- types.AlertSlice(alerts) + alertsCh <- alertmanagertypes.AlertSlice(alerts) return true } - removeEndsAt := func(as types.AlertSlice) types.AlertSlice { + removeEndsAt := func(as alertmanagertypes.AlertSlice) alertmanagertypes.AlertSlice { for i, a := range as { ac := *a ac.EndsAt = time.Time{} @@ -163,7 +173,7 @@ func TestAggrGroup(t *testing.T) { if s < opts.GroupWait { t.Fatalf("received batch too early after %v", s) } - exp := removeEndsAt(types.AlertSlice{a1}) + exp := removeEndsAt(alertmanagertypes.AlertSlice{a1}) sort.Sort(batch) if !reflect.DeepEqual(batch, exp) { @@ -186,7 +196,7 @@ func TestAggrGroup(t *testing.T) { if s < opts.GroupInterval { t.Fatalf("received batch too early after %v", s) } - exp := removeEndsAt(types.AlertSlice{a1, a3}) + exp := removeEndsAt(alertmanagertypes.AlertSlice{a1, a3}) sort.Sort(batch) if !reflect.DeepEqual(batch, exp) { @@ -213,7 +223,7 @@ func TestAggrGroup(t *testing.T) { t.Fatalf("expected immediate alert but received none") case batch := <-alertsCh: - exp := removeEndsAt(types.AlertSlice{a1, a2}) + exp := removeEndsAt(alertmanagertypes.AlertSlice{a1, a2}) sort.Sort(batch) if !reflect.DeepEqual(batch, exp) { @@ -236,7 +246,7 @@ func TestAggrGroup(t *testing.T) { if s < opts.GroupInterval { t.Fatalf("received batch too early after %v", s) } - exp := removeEndsAt(types.AlertSlice{a1, a2, a3}) + exp := removeEndsAt(alertmanagertypes.AlertSlice{a1, a2, a3}) sort.Sort(batch) if !reflect.DeepEqual(batch, exp) { @@ -249,7 +259,7 @@ func TestAggrGroup(t *testing.T) { a1r := *a1 a1r.EndsAt = time.Now() ag.insert(&a1r) - exp := append(types.AlertSlice{&a1r}, removeEndsAt(types.AlertSlice{a2, a3})...) + exp := append(alertmanagertypes.AlertSlice{&a1r}, removeEndsAt(alertmanagertypes.AlertSlice{a2, a3})...) select { case <-time.After(2 * opts.GroupInterval): @@ -271,7 +281,7 @@ func TestAggrGroup(t *testing.T) { // Resolve all remaining alerts, they should be removed after the next batch was sent. // Do not add a1r as it should have been deleted following the previous batch. 
a2r, a3r := *a2, *a3 - resolved := types.AlertSlice{&a2r, &a3r} + resolved := alertmanagertypes.AlertSlice{&a2r, &a3r} for _, a := range resolved { a.EndsAt = time.Now() ag.insert(a) @@ -303,7 +313,7 @@ func TestAggrGroup(t *testing.T) { } func TestGroupLabels(t *testing.T) { - a := &types.Alert{ + a := &alertmanagertypes.Alert{ Alert: model.Alert{ Labels: model.LabelSet{ "a": "v1", @@ -328,7 +338,7 @@ func TestGroupLabels(t *testing.T) { "b": "v2", } - ls := getGroupLabels(a, route.RouteOpts.GroupBy) + ls := getGroupLabels(a, route.RouteOpts.GroupBy, false) if !reflect.DeepEqual(ls, expLs) { t.Fatalf("expected labels are %v, but got %v", expLs, ls) @@ -336,35 +346,25 @@ func TestGroupLabels(t *testing.T) { } func TestAggrRouteMap(t *testing.T) { + // Simplified config with just receivers and default route - no hardcoded routing rules confData := `receivers: - name: 'slack' -- name: 'email' +- name: 'email' - name: 'pagerduty' route: group_by: ['alertname'] - group_wait: 10ms - group_interval: 10ms - receiver: 'slack' - routes: - - matchers: - - 'ruleId=~"ruleId-OtherAlert|ruleId-TestingAlert"' - receiver: 'slack' - - matchers: - - 'ruleId=~"ruleId-HighLatency|ruleId-HighErrorRate"' - receiver: 'email' - continue: true - - matchers: - - 'ruleId="ruleId-HighLatency"' - receiver: 'pagerduty'` + group_wait: 1m + group_interval: 1m + receiver: 'slack'` conf, err := config.Load(confData) if err != nil { t.Fatal(err) } - - logger := promslog.NewNopLogger() + providerSettings := createTestProviderSettings() + logger := providerSettings.Logger route := dispatch.NewRoute(conf.Route, nil) - marker := types.NewMarker(prometheus.NewRegistry()) + marker := alertmanagertypes.NewMarker(prometheus.NewRegistry()) alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) @@ -372,21 +372,78 @@ route: defer alerts.Close() timeout := func(d time.Duration) time.Duration { return time.Duration(0) } - recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)} metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) - nfManager := nfmanagertest.NewMock() + store := nfroutingstoretest.NewMockSQLRouteStore() + store.MatchExpectationsInOrder(false) + nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store) + if err != nil { + t.Fatal(err) + } orgId := "test-org" + + ctx := context.Background() + routes := []*alertmanagertypes.RoutePolicy{ + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-OtherAlert" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-OtherAlert", + Description: "Route for OtherAlert to Slack", + Enabled: true, + OrgID: orgId, + Channels: []string{"slack"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-OtherAlert" && threshold.name == "warning"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-OtherAlert", + Description: "Route for cluster aa and service api to Email", + Enabled: true, + OrgID: orgId, + Channels: []string{"email"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-HighLatency" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: 
"ruleId-HighLatency", + Description: "High priority route for HighLatency to PagerDuty", + Enabled: true, + OrgID: orgId, + Channels: []string{"pagerduty"}, + }, + } + // Set up SQL mock expectations for the CreateBatch call + store.ExpectCreateBatch(routes) + err = nfManager.CreateRoutePolicies(ctx, orgId, routes) + require.NoError(t, err) + + // Set up expectations for getting routes during matching (multiple calls expected) + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId) go dispatcher.Run() defer dispatcher.Stop() - inputAlerts := []*types.Alert{ - newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd"}), + inputAlerts := []*alertmanagertypes.Alert{ + newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd", "threshold.name": "critical"}), + newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "dc", "service": "dd", "threshold.name": "critical"}), newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1"}), newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}), newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}), newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}), - newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}), - newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "aa", "service": "api", "kafka": "yes", "instance": "inst3"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "test-db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}), + } + for i := 0; i < 9; i++ { + store.ExpectGetAllByName(orgId, string(inputAlerts[i].Labels["ruleId"]), routes) } notiConfigs := map[string]alertmanagertypes.NotificationConfig{ "ruleId-OtherAlert": { @@ -398,6 +455,7 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 10, }, + UsePolicy: false, }, "ruleId-TestingAlert": { NotificationGroup: map[model.LabelName]struct{}{ @@ -408,6 +466,7 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 11, }, + UsePolicy: false, }, "ruleId-HighErrorRate": { NotificationGroup: map[model.LabelName]struct{}{ @@ -418,6 +477,7 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 12, }, + UsePolicy: false, }, "ruleId-HighLatency": { NotificationGroup: map[model.LabelName]struct{}{ @@ -428,11 +488,13 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 13, }, + UsePolicy: false, }, } for ruleID, config := range notiConfigs { - nfManager.SetMockConfig(orgId, ruleID, &config) + err := nfManager.SetNotificationConfig(orgId, ruleID, &config) + require.NoError(t, err) } err = alerts.Put(inputAlerts...) if err != nil { @@ -440,15 +502,15 @@ route: } // Let alerts get processed. 
- for i := 0; len(recorder.Alerts()) != 9 && i < 10; i++ { - time.Sleep(200 * time.Millisecond) + for i := 0; len(recorder.Alerts()) != 4; i++ { + time.Sleep(400 * time.Millisecond) } - require.Len(t, recorder.Alerts(), 9) + require.Len(t, recorder.Alerts(), 4) alertGroups, receivers := dispatcher.Groups( func(*dispatch.Route) bool { return true - }, func(*types.Alert, time.Time) bool { + }, func(*alertmanagertypes.Alert, time.Time) bool { return true }, ) @@ -468,11 +530,11 @@ route: routeIDsFound[routeID] = true expectedReceiver := "" switch routeID { - case "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0": + case "{__receiver__=\"slack\"}": expectedReceiver = "slack" - case "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1": + case "{__receiver__=\"email\"}": expectedReceiver = "email" - case "{}/{ruleId=\"ruleId-HighLatency\"}/2": + case "{__receiver__=\"pagerduty\"}": expectedReceiver = "pagerduty" } if expectedReceiver != "" { @@ -482,13 +544,12 @@ route: totalAggrGroups += len(groups) } - require.Equal(t, 7, totalAggrGroups, "Should have exactly 7 aggregation groups") + require.Equal(t, 4, totalAggrGroups, "Should have exactly 4 aggregation groups") // Verify specific route group counts expectedGroupCounts := map[string]int{ - "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0": 2, // OtherAlert + TestingAlert - "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1": 4, // 3 HighErrorRate + 1 HighLatency - "{}/{ruleId=\"ruleId-HighLatency\"}/2": 1, // 1 HighLatency group + "{__receiver__=\"slack\"}": 2, + "{__receiver__=\"pagerduty\"}": 2, } for route, groups := range aggrGroupsPerRoute { @@ -501,79 +562,31 @@ route: require.Equal(t, AlertGroups{ &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, - Labels: model.LabelSet{ - "kafka": "yes", - "ruleId": "ruleId-HighLatency", - "service": "db", - }, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - Renotify: 13, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, + Alerts: []*alertmanagertypes.Alert{inputAlerts[7]}, Labels: model.LabelSet{ "kafka": "yes", "ruleId": "ruleId-HighLatency", "service": "db", }, Receiver: "pagerduty", - GroupKey: "{}/{ruleId=\"ruleId-HighLatency\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", - RouteID: "{}/{ruleId=\"ruleId-HighLatency\"}/2", + GroupKey: "{__receiver__=\"pagerduty\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", + RouteID: "{__receiver__=\"pagerduty\"}", Renotify: 13, }, &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[1]}, + Alerts: []*alertmanagertypes.Alert{inputAlerts[8]}, Labels: model.LabelSet{ - "instance": "inst1", - "ruleId": "ruleId-TestingAlert", - "service": "api", + "kafka": "yes", + "ruleId": "ruleId-HighLatency", + "service": "test-db", }, - Renotify: 11, - Receiver: "slack", - GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{instance=\"inst1\", ruleId=\"ruleId-TestingAlert\", service=\"api\"}", - RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", + Receiver: "pagerduty", + GroupKey: "{__receiver__=\"pagerduty\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"test-db\"}", + RouteID: "{__receiver__=\"pagerduty\"}", + Renotify: 13, }, &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[2]}, - Labels: model.LabelSet{ - "cluster": "aa", - 
"instance": "inst1", - "ruleId": "ruleId-HighErrorRate", - }, - Renotify: 12, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[3]}, - Labels: model.LabelSet{ - "cluster": "aa", - "instance": "inst2", - "ruleId": "ruleId-HighErrorRate", - }, - Renotify: 12, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst2\", ruleId=\"ruleId-HighErrorRate\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[4]}, - Labels: model.LabelSet{ - "cluster": "bb", - "instance": "inst1", - "ruleId": "ruleId-HighErrorRate", - }, - Renotify: 12, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"bb\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[0]}, + Alerts: []*alertmanagertypes.Alert{inputAlerts[0]}, Labels: model.LabelSet{ "cluster": "cc", "ruleId": "ruleId-OtherAlert", @@ -581,51 +594,50 @@ route: }, Renotify: 10, Receiver: "slack", - GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{cluster=\"cc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}", - RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", + GroupKey: "{__receiver__=\"slack\"}:{cluster=\"cc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}", + RouteID: "{__receiver__=\"slack\"}", + }, + &AlertGroup{ + Alerts: []*alertmanagertypes.Alert{inputAlerts[1]}, + Labels: model.LabelSet{ + "cluster": "dc", + "service": "dd", + "ruleId": "ruleId-OtherAlert", + }, + Renotify: 10, + Receiver: "slack", + GroupKey: "{__receiver__=\"slack\"}:{cluster=\"dc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}", + RouteID: "{__receiver__=\"slack\"}", }, }, alertGroups) require.Equal(t, map[model.Fingerprint][]string{ inputAlerts[0].Fingerprint(): {"slack"}, inputAlerts[1].Fingerprint(): {"slack"}, - inputAlerts[2].Fingerprint(): {"email"}, - inputAlerts[3].Fingerprint(): {"email"}, - inputAlerts[4].Fingerprint(): {"email"}, - inputAlerts[5].Fingerprint(): {"email", "pagerduty"}, - inputAlerts[6].Fingerprint(): {"email", "pagerduty"}, + inputAlerts[7].Fingerprint(): {"pagerduty"}, + inputAlerts[8].Fingerprint(): {"pagerduty"}, }, receivers) } func TestGroupsWithNodata(t *testing.T) { + // Simplified config with just receivers and default route - no hardcoded routing rules confData := `receivers: - name: 'slack' -- name: 'email' +- name: 'email' - name: 'pagerduty' route: group_by: ['alertname'] group_wait: 10ms group_interval: 10ms - receiver: 'slack' - routes: - - matchers: - - 'ruleId=~"ruleId-OtherAlert|ruleId-TestingAlert"' - receiver: 'slack' - - matchers: - - 'ruleId=~"ruleId-HighLatency|ruleId-HighErrorRate"' - receiver: 'email' - continue: true - - matchers: - - 'ruleId="ruleId-HighLatency"' - receiver: 'pagerduty'` + receiver: 'slack'` conf, err := config.Load(confData) if err != nil { t.Fatal(err) } - - logger := promslog.NewNopLogger() + providerSettings := createTestProviderSettings() + logger := providerSettings.Logger route := dispatch.NewRoute(conf.Route, nil) - marker := types.NewMarker(prometheus.NewRegistry()) + marker := 
alertmanagertypes.NewMarker(prometheus.NewRegistry()) alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) @@ -633,30 +645,107 @@ route: defer alerts.Close() timeout := func(d time.Duration) time.Duration { return time.Duration(0) } - recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)} metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) - nfManager := nfmanagertest.NewMock() + store := nfroutingstoretest.NewMockSQLRouteStore() + store.MatchExpectationsInOrder(false) + nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store) + if err != nil { + t.Fatal(err) + } orgId := "test-org" + + ctx := context.Background() + routes := []*alertmanagertypes.RoutePolicy{ + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-OtherAlert" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-OtherAlert", + Description: "Route for OtherAlert critical to Slack", + Enabled: true, + OrgID: orgId, + Channels: []string{"slack"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-TestingAlert" && threshold.name == "warning"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-TestingAlert", + Description: "Route for TestingAlert warning to Slack", + Enabled: true, + OrgID: orgId, + Channels: []string{"slack"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-HighErrorRate" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-HighErrorRate", + Description: "Route for HighErrorRate critical to Email", + Enabled: true, + OrgID: orgId, + Channels: []string{"email"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-HighLatency" && threshold.name == "warning"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-HighLatency", + Description: "Route for HighLatency warning to Email", + Enabled: true, + OrgID: orgId, + Channels: []string{"email"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "ruleId-HighLatency" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-HighLatency", + Description: "Route for HighLatency critical to PagerDuty", + Enabled: true, + OrgID: orgId, + Channels: []string{"pagerduty"}, + }, + } + // Set up SQL mock expectations for the CreateBatch call + store.ExpectCreateBatch(routes) + err = nfManager.CreateRoutePolicies(ctx, orgId, routes) + require.NoError(t, err) + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId) go dispatcher.Run() defer dispatcher.Stop() - // Create alerts. the dispatcher will automatically create the groups. - inputAlerts := []*types.Alert{ - // Matches the parent route. - newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd"}), - // Matches the first sub-route. - newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1"}), - // Matches the second sub-route. 
- newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}), - newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}), - // Matches the second sub-route. - newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}), - // Matches the second and third sub-route. - newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}), - newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4"}), + inputAlerts := []*alertmanagertypes.Alert{ + newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1", "threshold.name": "warning"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3", "threshold.name": "warning"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}), newAlert(model.LabelSet{"ruleId": "ruleId-HighLatency", "nodata": "true"}), } + // Set up expectations with route filtering for each alert + store.ExpectGetAllByName(orgId, "ruleId-OtherAlert", []*alertmanagertypes.RoutePolicy{routes[0]}) + store.ExpectGetAllByName(orgId, "ruleId-TestingAlert", []*alertmanagertypes.RoutePolicy{routes[1]}) + store.ExpectGetAllByName(orgId, "ruleId-HighErrorRate", []*alertmanagertypes.RoutePolicy{routes[2]}) + store.ExpectGetAllByName(orgId, "ruleId-HighErrorRate", []*alertmanagertypes.RoutePolicy{routes[2]}) + store.ExpectGetAllByName(orgId, "ruleId-HighErrorRate", []*alertmanagertypes.RoutePolicy{routes[2]}) + store.ExpectGetAllByName(orgId, "ruleId-HighLatency", []*alertmanagertypes.RoutePolicy{routes[3], routes[4]}) + store.ExpectGetAllByName(orgId, "ruleId-HighLatency", []*alertmanagertypes.RoutePolicy{routes[3], routes[4]}) + store.ExpectGetAllByName(orgId, "ruleId-HighLatency", []*alertmanagertypes.RoutePolicy{routes[3], routes[4]}) notiConfigs := map[string]alertmanagertypes.NotificationConfig{ "ruleId-OtherAlert": { NotificationGroup: map[model.LabelName]struct{}{ @@ -667,6 +756,7 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 10, }, + UsePolicy: false, }, "ruleId-TestingAlert": { NotificationGroup: map[model.LabelName]struct{}{ @@ -677,6 +767,7 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 11, }, + UsePolicy: false, }, "ruleId-HighErrorRate": { NotificationGroup: map[model.LabelName]struct{}{ @@ -687,6 +778,7 @@ route: Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 12, }, + 
UsePolicy: false, }, "ruleId-HighLatency": { NotificationGroup: map[model.LabelName]struct{}{ @@ -698,160 +790,327 @@ route: RenotifyInterval: 13, NoDataInterval: 14, }, + UsePolicy: false, }, } for ruleID, config := range notiConfigs { - nfManager.SetMockConfig(orgId, ruleID, &config) + err := nfManager.SetNotificationConfig(orgId, ruleID, &config) + require.NoError(t, err) } err = alerts.Put(inputAlerts...) if err != nil { t.Fatal(err) } - // Let alerts get processed. - for i := 0; len(recorder.Alerts()) != 11 && i < 15; i++ { - time.Sleep(200 * time.Millisecond) + for i := 0; len(recorder.Alerts()) != 9; i++ { + time.Sleep(400 * time.Millisecond) } - require.Len(t, recorder.Alerts(), 11) + require.Len(t, recorder.Alerts(), 9) alertGroups, receivers := dispatcher.Groups( func(*dispatch.Route) bool { return true - }, func(*types.Alert, time.Time) bool { + }, func(*alertmanagertypes.Alert, time.Time) bool { return true }, ) - require.Equal(t, AlertGroups{ - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[7]}, - Labels: model.LabelSet{ - "ruleId": "ruleId-HighLatency", - "nodata": "true", - }, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{nodata=\"true\", ruleId=\"ruleId-HighLatency\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - Renotify: 14, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[7]}, - Labels: model.LabelSet{ - "ruleId": "ruleId-HighLatency", - "nodata": "true", - }, - Receiver: "pagerduty", - GroupKey: "{}/{ruleId=\"ruleId-HighLatency\"}:{nodata=\"true\", ruleId=\"ruleId-HighLatency\"}", - RouteID: "{}/{ruleId=\"ruleId-HighLatency\"}/2", - Renotify: 14, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, - Labels: model.LabelSet{ - "kafka": "yes", - "ruleId": "ruleId-HighLatency", - "service": "db", - }, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - Renotify: 13, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, - Labels: model.LabelSet{ - "kafka": "yes", - "ruleId": "ruleId-HighLatency", - "service": "db", - }, - Receiver: "pagerduty", - GroupKey: "{}/{ruleId=\"ruleId-HighLatency\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", - RouteID: "{}/{ruleId=\"ruleId-HighLatency\"}/2", - Renotify: 13, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[1]}, - Labels: model.LabelSet{ - "instance": "inst1", - "ruleId": "ruleId-TestingAlert", - "service": "api", - }, - Receiver: "slack", - GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{instance=\"inst1\", ruleId=\"ruleId-TestingAlert\", service=\"api\"}", - RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", - Renotify: 11, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[2]}, - Labels: model.LabelSet{ - "cluster": "aa", - "instance": "inst1", - "ruleId": "ruleId-HighErrorRate", - }, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - Renotify: 12, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[3]}, - Labels: model.LabelSet{ - "cluster": "aa", - "instance": "inst2", - "ruleId": "ruleId-HighErrorRate", - }, - Receiver: "email", - GroupKey: 
"{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst2\", ruleId=\"ruleId-HighErrorRate\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - Renotify: 12, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[4]}, - Labels: model.LabelSet{ - "cluster": "bb", - "instance": "inst1", - "ruleId": "ruleId-HighErrorRate", - }, - Receiver: "email", - GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"bb\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", - RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", - Renotify: 12, - }, - &AlertGroup{ - Alerts: []*types.Alert{inputAlerts[0]}, - Labels: model.LabelSet{ - "cluster": "cc", - "ruleId": "ruleId-OtherAlert", - "service": "dd", - }, - Receiver: "slack", - GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{cluster=\"cc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}", - RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", - Renotify: 10, - }, - }, alertGroups) - require.Equal(t, map[model.Fingerprint][]string{ - inputAlerts[0].Fingerprint(): {"slack"}, - inputAlerts[1].Fingerprint(): {"slack"}, - inputAlerts[2].Fingerprint(): {"email"}, - inputAlerts[3].Fingerprint(): {"email"}, - inputAlerts[4].Fingerprint(): {"email"}, - inputAlerts[5].Fingerprint(): {"email", "pagerduty"}, - inputAlerts[6].Fingerprint(): {"email", "pagerduty"}, + dispatcher.mtx.RLock() + aggrGroupsPerRoute := dispatcher.aggrGroupsPerRoute + dispatcher.mtx.RUnlock() + + require.NotEmpty(t, aggrGroupsPerRoute, "Should have aggregation groups per route") + + routeIDsFound := make(map[string]bool) + totalAggrGroups := 0 + + for route, groups := range aggrGroupsPerRoute { + routeID := route.ID() + routeIDsFound[routeID] = true + expectedReceiver := "" + switch routeID { + case "{__receiver__=\"slack\"}": + expectedReceiver = "slack" + case "{__receiver__=\"email\"}": + expectedReceiver = "email" + case "{__receiver__=\"pagerduty\"}": + expectedReceiver = "pagerduty" + } + if expectedReceiver != "" { + require.Equal(t, expectedReceiver, route.RouteOpts.Receiver, + "Route %s should have receiver %s", routeID, expectedReceiver) + } + totalAggrGroups += len(groups) + } + + require.Equal(t, 9, totalAggrGroups, "Should have exactly 9 aggregation groups") + + expectedGroupCounts := map[string]int{ + "{__receiver__=\"slack\"}": 2, + "{__receiver__=\"email\"}": 5, + "{__receiver__=\"pagerduty\"}": 2, + } + + for route, groups := range aggrGroupsPerRoute { + routeID := route.ID() + if expectedCount, exists := expectedGroupCounts[routeID]; exists { + require.Equal(t, expectedCount, len(groups), + "Route %s should have %d groups, got %d", routeID, expectedCount, len(groups)) + } + } + + // Verify alert groups contain expected alerts + require.Len(t, alertGroups, 9) + + // Verify receivers mapping - exact expectations based on actual routing behavior + expectedReceivers := map[model.Fingerprint][]string{ + inputAlerts[0].Fingerprint(): {"slack"}, // OtherAlert critical -> slack + inputAlerts[1].Fingerprint(): {"slack"}, // TestingAlert warning -> slack + inputAlerts[2].Fingerprint(): {"email"}, // HighErrorRate critical -> email + inputAlerts[3].Fingerprint(): {"email"}, // HighErrorRate critical -> email + inputAlerts[4].Fingerprint(): {"email"}, // HighErrorRate critical -> email + inputAlerts[5].Fingerprint(): {"email"}, // HighLatency warning -> email + inputAlerts[6].Fingerprint(): {"pagerduty"}, inputAlerts[7].Fingerprint(): {"email", 
"pagerduty"}, - }, receivers) + } + require.Equal(t, expectedReceivers, receivers) +} + +func TestGroupsWithNotificationPolicy(t *testing.T) { + // Simplified config with just receivers and default route - no hardcoded routing rules + confData := `receivers: +- name: 'slack' +- name: 'email' +- name: 'pagerduty' + +route: + group_by: ['alertname'] + group_wait: 10ms + group_interval: 10ms + receiver: 'slack'` + conf, err := config.Load(confData) + if err != nil { + t.Fatal(err) + } + providerSettings := createTestProviderSettings() + logger := providerSettings.Logger + route := dispatch.NewRoute(conf.Route, nil) + marker := alertmanagertypes.NewMarker(prometheus.NewRegistry()) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) + if err != nil { + t.Fatal(err) + } + defer alerts.Close() + + timeout := func(d time.Duration) time.Duration { return time.Duration(0) } + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)} + metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) + store := nfroutingstoretest.NewMockSQLRouteStore() + store.MatchExpectationsInOrder(false) + nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store) + if err != nil { + t.Fatal(err) + } + orgId := "test-org" + + ctx := context.Background() + routes := []*alertmanagertypes.RoutePolicy{ + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `cluster == "bb" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-OtherAlert", + Description: "Route for OtherAlert critical to Slack", + Enabled: true, + OrgID: orgId, + Channels: []string{"slack"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `service == "db" && threshold.name == "critical"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-TestingAlert", + Description: "Route for TestingAlert warning to Slack", + Enabled: true, + OrgID: orgId, + Channels: []string{"slack"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `cluster == "bb" && instance == "inst1"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "ruleId-HighErrorRate", + Description: "Route for HighErrorRate critical to Email", + Enabled: true, + OrgID: orgId, + Channels: []string{"email"}, + }, + } + // Set up SQL mock expectations for the CreateBatch call + store.ExpectCreateBatch(routes) + err = nfManager.CreateRoutePolicies(ctx, orgId, routes) + require.NoError(t, err) + + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId) + go dispatcher.Run() + defer dispatcher.Stop() + + inputAlerts := []*alertmanagertypes.Alert{ + newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "db", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1", "threshold.name": "warning"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", 
"cluster": "bb", "service": "api", "instance": "inst1", "threshold.name": "critical"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst1", "threshold.name": "warning"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4", "threshold.name": "critical"}), + newAlert(model.LabelSet{"ruleId": "ruleId-HighLatency", "nodata": "true"}), + } + // Set up expectations with route filtering for each alert + for i := 0; i < len(inputAlerts); i++ { + store.ExpectGetAllByKindAndOrgID(orgId, alertmanagertypes.PolicyBasedExpression, routes) + } + notiConfigs := map[string]alertmanagertypes.NotificationConfig{ + "ruleId-OtherAlert": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("cluster"): {}, + model.LabelName("service"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 10, + }, + UsePolicy: true, + }, + "ruleId-TestingAlert": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("service"): {}, + model.LabelName("instance"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 11, + }, + UsePolicy: true, + }, + "ruleId-HighErrorRate": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("cluster"): {}, + model.LabelName("instance"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 12, + }, + UsePolicy: true, + }, + "ruleId-HighLatency": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("service"): {}, + model.LabelName("kafka"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 13, + NoDataInterval: 14, + }, + UsePolicy: true, + }, + } + + for ruleID, config := range notiConfigs { + err := nfManager.SetNotificationConfig(orgId, ruleID, &config) + require.NoError(t, err) + } + err = alerts.Put(inputAlerts...) 
+ if err != nil { + t.Fatal(err) + } + + for i := 0; len(recorder.Alerts()) != 3 && i < 15; i++ { + time.Sleep(400 * time.Millisecond) + } + require.Len(t, recorder.Alerts(), 5) + + alertGroups, receivers := dispatcher.Groups( + func(*dispatch.Route) bool { + return true + }, func(*alertmanagertypes.Alert, time.Time) bool { + return true + }, + ) + + dispatcher.mtx.RLock() + aggrGroupsPerRoute := dispatcher.aggrGroupsPerRoute + dispatcher.mtx.RUnlock() + + require.NotEmpty(t, aggrGroupsPerRoute, "Should have aggregation groups per route") + + routeIDsFound := make(map[string]bool) + totalAggrGroups := 0 + + for route, groups := range aggrGroupsPerRoute { + routeID := route.ID() + routeIDsFound[routeID] = true + expectedReceiver := "" + switch routeID { + case "{__receiver__=\"slack\"}": + expectedReceiver = "slack" + case "{__receiver__=\"email\"}": + expectedReceiver = "email" + case "{__receiver__=\"pagerduty\"}": + expectedReceiver = "pagerduty" + } + if expectedReceiver != "" { + require.Equal(t, expectedReceiver, route.RouteOpts.Receiver, + "Route %s should have receiver %s", routeID, expectedReceiver) + } + totalAggrGroups += len(groups) + } + + require.Equal(t, 5, totalAggrGroups, "Should have exactly 5 aggregation groups") + + expectedGroupCounts := map[string]int{ + "{__receiver__=\"slack\"}": 3, + "{__receiver__=\"email\"}": 2, + } + + for route, groups := range aggrGroupsPerRoute { + routeID := route.ID() + if expectedCount, exists := expectedGroupCounts[routeID]; exists { + require.Equal(t, expectedCount, len(groups), + "Route %s should have %d groups, got %d", routeID, expectedCount, len(groups)) + } + } + + // Verify alert groups contain expected alerts + require.Len(t, alertGroups, 5) + + // Verify receivers mapping - based on NotificationPolicy routing without ruleID + expectedReceivers := map[model.Fingerprint][]string{ + inputAlerts[0].Fingerprint(): {"slack"}, + inputAlerts[6].Fingerprint(): {"slack"}, + inputAlerts[4].Fingerprint(): {"email", "slack"}, + inputAlerts[5].Fingerprint(): {"email"}, + } + require.Equal(t, expectedReceivers, receivers) } type recordStage struct { mtx sync.RWMutex - alerts map[string]map[model.Fingerprint]*types.Alert + alerts map[string]map[model.Fingerprint]*alertmanagertypes.Alert } -func (r *recordStage) Alerts() []*types.Alert { +func (r *recordStage) Alerts() []*alertmanagertypes.Alert { r.mtx.RLock() defer r.mtx.RUnlock() - alerts := make([]*types.Alert, 0) + alerts := make([]*alertmanagertypes.Alert, 0) for k := range r.alerts { for _, a := range r.alerts[k] { alerts = append(alerts, a) @@ -860,7 +1119,7 @@ func (r *recordStage) Alerts() []*types.Alert { return alerts } -func (r *recordStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { +func (r *recordStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*alertmanagertypes.Alert) (context.Context, []*alertmanagertypes.Alert, error) { r.mtx.Lock() defer r.mtx.Unlock() gk, ok := notify.GroupKey(ctx) @@ -868,7 +1127,7 @@ func (r *recordStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*types panic("GroupKey not present!") } if _, ok := r.alerts[gk]; !ok { - r.alerts[gk] = make(map[model.Fingerprint]*types.Alert) + r.alerts[gk] = make(map[model.Fingerprint]*alertmanagertypes.Alert) } for _, a := range alerts { r.alerts[gk][a.Fingerprint()] = a @@ -883,8 +1142,8 @@ var ( t1 = t0.Add(2 * time.Minute) ) -func newAlert(labels model.LabelSet) *types.Alert { - return &types.Alert{ +func newAlert(labels model.LabelSet) 
*alertmanagertypes.Alert { + return &alertmanagertypes.Alert{ Alert: model.Alert{ Labels: labels, Annotations: model.LabelSet{"foo": "bar"}, @@ -899,7 +1158,7 @@ func newAlert(labels model.LabelSet) *types.Alert { func TestDispatcherRace(t *testing.T) { logger := promslog.NewNopLogger() - marker := types.NewMarker(prometheus.NewRegistry()) + marker := alertmanagertypes.NewMarker(prometheus.NewRegistry()) alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) @@ -917,56 +1176,94 @@ func TestDispatcherRace(t *testing.T) { func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T) { const numAlerts = 5000 + confData := `receivers: +- name: 'slack' +- name: 'email' +- name: 'pagerduty' - logger := promslog.NewNopLogger() - marker := types.NewMarker(prometheus.NewRegistry()) +route: + group_by: ['alertname'] + group_wait: 1h + group_interval: 1h + receiver: 'slack'` + conf, err := config.Load(confData) + if err != nil { + t.Fatal(err) + } + route := dispatch.NewRoute(conf.Route, nil) + providerSettings := createTestProviderSettings() + logger := providerSettings.Logger + marker := alertmanagertypes.NewMarker(prometheus.NewRegistry()) alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) if err != nil { t.Fatal(err) } defer alerts.Close() + timeout := func(d time.Duration) time.Duration { return d } + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)} + metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) + store := nfroutingstoretest.NewMockSQLRouteStore() + store.MatchExpectationsInOrder(false) + nfManager, err := rulebasednotification.New(context.Background(), providerSettings, nfmanager.Config{}, store) + if err != nil { + t.Fatal(err) + } + orgId := "test-org" - route := &dispatch.Route{ - RouteOpts: dispatch.RouteOpts{ - Receiver: "default", - GroupBy: map[model.LabelName]struct{}{"ruleId": {}}, - GroupWait: 0, - GroupInterval: 1 * time.Hour, // Should never hit in this test. - RepeatInterval: 1 * time.Hour, // Should never hit in this test. 
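+ // Give each alert its own rule-scoped route policy and notification config (grouped by ruleId),
+ // so every alert forms its own aggregation group.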
- }, + for i := 0; i < numAlerts; i++ { + ruleId := fmt.Sprintf("Alert_%d", i) + + notifConfig := alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 1 * time.Hour, + }, + UsePolicy: false, + } + route := &alertmanagertypes.RoutePolicy{ + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: fmt.Sprintf(`ruleId == "%s"`, ruleId), + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: ruleId, + Description: "Route for OtherAlert critical to Slack", + Enabled: true, + OrgID: orgId, + Channels: []string{"slack"}, + } + + store.ExpectGetAllByName(orgId, ruleId, []*alertmanagertypes.RoutePolicy{route}) + err := nfManager.SetNotificationConfig(orgId, ruleId, ¬ifConfig) + require.NoError(t, err) } - timeout := func(d time.Duration) time.Duration { return d } - recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} - metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) - nfManager := nfmanagertest.NewMock() - dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, "test-org") + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId) go dispatcher.Run() defer dispatcher.Stop() - // Push all alerts. for i := 0; i < numAlerts; i++ { - alert := newAlert(model.LabelSet{"ruleId": model.LabelValue(fmt.Sprintf("Alert_%d", i))}) + ruleId := fmt.Sprintf("Alert_%d", i) + alert := newAlert(model.LabelSet{"ruleId": model.LabelValue(ruleId)}) require.NoError(t, alerts.Put(alert)) } - // Wait until the alerts have been notified or the waiting timeout expires. for deadline := time.Now().Add(5 * time.Second); time.Now().Before(deadline); { if len(recorder.Alerts()) >= numAlerts { break } - // Throttle. time.Sleep(10 * time.Millisecond) } - // We expect all alerts to be notified immediately, since they all belong to different groups. require.Len(t, recorder.Alerts(), numAlerts) } func TestDispatcher_DoMaintenance(t *testing.T) { r := prometheus.NewRegistry() - marker := types.NewMarker(r) + marker := alertmanagertypes.NewMarker(r) alerts, err := mem.NewAlerts(context.Background(), marker, time.Minute, nil, promslog.NewNopLogger(), nil) if err != nil { @@ -981,7 +1278,7 @@ func TestDispatcher_DoMaintenance(t *testing.T) { }, } timeout := func(d time.Duration) time.Duration { return d } - recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*alertmanagertypes.Alert)} ctx := context.Background() metrics := NewDispatcherMetrics(false, r) @@ -997,7 +1294,7 @@ func TestDispatcher_DoMaintenance(t *testing.T) { aggrGroups[route][aggrGroup1.fingerprint()] = aggrGroup1 dispatcher.aggrGroupsPerRoute = aggrGroups // Must run otherwise doMaintenance blocks on aggrGroup1.stop(). - go aggrGroup1.run(func(context.Context, ...*types.Alert) bool { return true }) + go aggrGroup1.run(func(context.Context, ...*alertmanagertypes.Alert) bool { return true }) // Insert a marker for the aggregation group's group key. 
marker.SetMuted(route.ID(), aggrGroup1.GroupKey(), []string{"weekends"}) diff --git a/pkg/alertmanager/alertmanagerserver/server.go b/pkg/alertmanager/alertmanagerserver/server.go index d4c0ddad7215..208a5ee8880a 100644 --- a/pkg/alertmanager/alertmanagerserver/server.go +++ b/pkg/alertmanager/alertmanagerserver/server.go @@ -2,6 +2,9 @@ package alertmanagerserver import ( "context" + "fmt" + "github.com/prometheus/alertmanager/types" + "golang.org/x/sync/errgroup" "log/slog" "strings" "sync" @@ -321,39 +324,104 @@ func (server *Server) SetConfig(ctx context.Context, alertmanagerConfig *alertma } func (server *Server) TestReceiver(ctx context.Context, receiver alertmanagertypes.Receiver) error { - return alertmanagertypes.TestReceiver(ctx, receiver, alertmanagernotify.NewReceiverIntegrations, server.alertmanagerConfig, server.tmpl, server.logger, alertmanagertypes.NewTestAlert(receiver, time.Now(), time.Now())) + testAlert := alertmanagertypes.NewTestAlert(receiver, time.Now(), time.Now()) + return alertmanagertypes.TestReceiver(ctx, receiver, alertmanagernotify.NewReceiverIntegrations, server.alertmanagerConfig, server.tmpl, server.logger, testAlert.Labels, testAlert) } -func (server *Server) TestAlert(ctx context.Context, postableAlert *alertmanagertypes.PostableAlert, receivers []string) error { - alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts(alertmanagertypes.PostableAlerts{postableAlert}, time.Duration(server.srvConfig.Global.ResolveTimeout), time.Now()) +func (server *Server) TestAlert(ctx context.Context, receiversMap map[*alertmanagertypes.PostableAlert][]string, config *alertmanagertypes.NotificationConfig) error { + if len(receiversMap) == 0 { + return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, + "expected at least 1 alert, got 0") + } + + postableAlerts := make(alertmanagertypes.PostableAlerts, 0, len(receiversMap)) + for alert := range receiversMap { + postableAlerts = append(postableAlerts, alert) + } + + alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts( + postableAlerts, + time.Duration(server.srvConfig.Global.ResolveTimeout), + time.Now(), + ) if err != nil { - return errors.Join(err...) 
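+ // Alert construction failures are surfaced as a single invalid-input error rather than a joined error list.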
+ return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, + "failed to construct alerts from postable alerts: %v", err) } - if len(alerts) != 1 { - return errors.Newf(errors.TypeInvalidInput, errors.CodeInvalidInput, "expected 1 alert, got %d", len(alerts)) + type alertGroup struct { + groupLabels model.LabelSet + alerts []*types.Alert + receivers map[string]struct{} } - ch := make(chan error, len(receivers)) - for _, receiverName := range receivers { - go func(receiverName string) { - receiver, err := server.alertmanagerConfig.GetReceiver(receiverName) - if err != nil { - ch <- err - return + groupMap := make(map[model.Fingerprint]*alertGroup) + + for i, alert := range alerts { + labels := getGroupLabels(alert, config.NotificationGroup, config.GroupByAll) + fp := labels.Fingerprint() + + postableAlert := postableAlerts[i] + alertReceivers := receiversMap[postableAlert] + + if group, exists := groupMap[fp]; exists { + group.alerts = append(group.alerts, alert) + for _, r := range alertReceivers { + group.receivers[r] = struct{}{} + } + } else { + receiverSet := make(map[string]struct{}) + for _, r := range alertReceivers { + receiverSet[r] = struct{}{} + } + groupMap[fp] = &alertGroup{ + groupLabels: labels, + alerts: []*types.Alert{alert}, + receivers: receiverSet, } - ch <- alertmanagertypes.TestReceiver(ctx, receiver, alertmanagernotify.NewReceiverIntegrations, server.alertmanagerConfig, server.tmpl, server.logger, alerts[0]) - }(receiverName) - } - - var errs []error - for i := 0; i < len(receivers); i++ { - if err := <-ch; err != nil { - errs = append(errs, err) } } - if errs != nil { + var mu sync.Mutex + var errs []error + + g, gCtx := errgroup.WithContext(ctx) + for _, group := range groupMap { + for receiverName := range group.receivers { + group := group + receiverName := receiverName + + g.Go(func() error { + receiver, err := server.alertmanagerConfig.GetReceiver(receiverName) + if err != nil { + mu.Lock() + errs = append(errs, fmt.Errorf("failed to get receiver %q: %w", receiverName, err)) + mu.Unlock() + return nil // Return nil to continue processing other goroutines + } + + err = alertmanagertypes.TestReceiver( + gCtx, + receiver, + alertmanagernotify.NewReceiverIntegrations, + server.alertmanagerConfig, + server.tmpl, + server.logger, + group.groupLabels, + group.alerts..., + ) + if err != nil { + mu.Lock() + errs = append(errs, fmt.Errorf("receiver %q test failed: %w", receiverName, err)) + mu.Unlock() + } + return nil // Return nil to continue processing other goroutines + }) + } + } + _ = g.Wait() + + if len(errs) > 0 { return errors.Join(errs...) 
} diff --git a/pkg/alertmanager/alertmanagerserver/server_e2e_test.go b/pkg/alertmanager/alertmanagerserver/server_e2e_test.go new file mode 100644 index 000000000000..14cbe02e126c --- /dev/null +++ b/pkg/alertmanager/alertmanagerserver/server_e2e_test.go @@ -0,0 +1,223 @@ +package alertmanagerserver + +import ( + "context" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes/alertmanagertypestest" + "github.com/prometheus/alertmanager/dispatch" + "io" + "log/slog" + "net/http" + "testing" + "time" + + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification" + "github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest" + "github.com/SigNoz/signoz/pkg/types" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + "github.com/SigNoz/signoz/pkg/valuer" + + "github.com/go-openapi/strfmt" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" + + "github.com/stretchr/testify/require" +) + +func TestEndToEndAlertManagerFlow(t *testing.T) { + ctx := context.Background() + providerSettings := instrumentationtest.New().ToProviderSettings() + + store := nfroutingstoretest.NewMockSQLRouteStore() + store.MatchExpectationsInOrder(false) + notificationManager, err := rulebasednotification.New(ctx, providerSettings, nfmanager.Config{}, store) + require.NoError(t, err) + orgID := "test-org" + + routes := []*alertmanagertypes.RoutePolicy{ + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "high-cpu-usage" && severity == "critical"`, + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Name: "high-cpu-usage", + Description: "High CPU critical alerts to webhook", + Enabled: true, + OrgID: orgID, + Channels: []string{"webhook"}, + }, + { + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + Expression: `ruleId == "high-cpu-usage" && severity == "warning"`, + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Name: "high-cpu-usage", + Description: "High CPU warning alerts to webhook", + Enabled: true, + OrgID: orgID, + Channels: []string{"webhook"}, + }, + } + + store.ExpectCreateBatch(routes) + err = notificationManager.CreateRoutePolicies(ctx, orgID, routes) + require.NoError(t, err) + + for range routes { + ruleID := "high-cpu-usage" + store.ExpectGetAllByName(orgID, ruleID, routes) + store.ExpectGetAllByName(orgID, ruleID, routes) + } + + notifConfig := alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("cluster"): {}, + model.LabelName("instance"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 5 * time.Minute, + }, + UsePolicy: false, + } + + err = notificationManager.SetNotificationConfig(orgID, "high-cpu-usage", ¬ifConfig) + require.NoError(t, err) + + srvCfg := NewConfig() + stateStore := alertmanagertypestest.NewStateStore() + registry := prometheus.NewRegistry() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + server, err := New(context.Background(), logger, registry, srvCfg, orgID, stateStore, notificationManager) + require.NoError(t, err) + amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, orgID) + require.NoError(t, err) + err = server.SetConfig(ctx, amConfig) + require.NoError(t, err) + + // Create test alerts + now := time.Now() + testAlerts := 
[]*alertmanagertypes.PostableAlert{ + { + Alert: alertmanagertypes.AlertModel{ + Labels: map[string]string{ + "ruleId": "high-cpu-usage", + "severity": "critical", + "cluster": "prod-cluster", + "instance": "server-01", + "alertname": "HighCPUUsage", + }, + }, + Annotations: map[string]string{ + "summary": "High CPU usage detected", + "description": "CPU usage is above 90% for 5 minutes", + }, + StartsAt: strfmt.DateTime(now.Add(-5 * time.Minute)), + EndsAt: strfmt.DateTime(time.Time{}), // Active alert + }, + { + Alert: alertmanagertypes.AlertModel{ + Labels: map[string]string{ + "ruleId": "high-cpu-usage", + "severity": "warning", + "cluster": "prod-cluster", + "instance": "server-02", + "alertname": "HighCPUUsage", + }, + }, + Annotations: map[string]string{ + "summary": "Moderate CPU usage detected", + "description": "CPU usage is above 70% for 10 minutes", + }, + StartsAt: strfmt.DateTime(now.Add(-10 * time.Minute)), + EndsAt: strfmt.DateTime(time.Time{}), // Active alert + }, + { + Alert: alertmanagertypes.AlertModel{ + Labels: map[string]string{ + "ruleId": "high-cpu-usage", + "severity": "critical", + "cluster": "prod-cluster", + "instance": "server-03", + "alertname": "HighCPUUsage", + }, + }, + Annotations: map[string]string{ + "summary": "High CPU usage detected on server-03", + "description": "CPU usage is above 95% for 3 minutes", + }, + StartsAt: strfmt.DateTime(now.Add(-3 * time.Minute)), + EndsAt: strfmt.DateTime(time.Time{}), // Active alert + }, + } + + err = server.PutAlerts(ctx, testAlerts) + require.NoError(t, err) + + time.Sleep(2 * time.Second) + + t.Run("verify_alerts_processed", func(t *testing.T) { + dummyRequest, err := http.NewRequest(http.MethodGet, "/alerts", nil) + require.NoError(t, err) + + params, err := alertmanagertypes.NewGettableAlertsParams(dummyRequest) + require.NoError(t, err) + alerts, err := server.GetAlerts(context.Background(), params) + require.NoError(t, err) + require.Len(t, alerts, 3, "Expected 3 active alerts") + + for _, alert := range alerts { + require.Equal(t, "high-cpu-usage", alert.Alert.Labels["ruleId"]) + require.NotEmpty(t, alert.Alert.Labels["severity"]) + require.Contains(t, []string{"critical", "warning"}, alert.Alert.Labels["severity"]) + require.Equal(t, "prod-cluster", alert.Alert.Labels["cluster"]) + require.NotEmpty(t, alert.Alert.Labels["instance"]) + } + + criticalAlerts := 0 + warningAlerts := 0 + for _, alert := range alerts { + if alert.Alert.Labels["severity"] == "critical" { + criticalAlerts++ + } else if alert.Alert.Labels["severity"] == "warning" { + warningAlerts++ + } + } + require.Equal(t, 2, criticalAlerts, "Expected 2 critical alerts") + require.Equal(t, 1, warningAlerts, "Expected 1 warning alert") + }) + + t.Run("verify_notification_routing", func(t *testing.T) { + + notifConfig, err := notificationManager.GetNotificationConfig(orgID, "high-cpu-usage") + require.NoError(t, err) + require.NotNil(t, notifConfig) + require.Equal(t, 5*time.Minute, notifConfig.Renotify.RenotifyInterval) + require.Contains(t, notifConfig.NotificationGroup, model.LabelName("ruleId")) + require.Contains(t, notifConfig.NotificationGroup, model.LabelName("cluster")) + require.Contains(t, notifConfig.NotificationGroup, model.LabelName("instance")) + }) + + t.Run("verify_alert_groups_and_stages", func(t *testing.T) { + time.Sleep(2 * time.Second) + + alertGroups, _ := server.dispatcher.Groups( + func(route *dispatch.Route) bool { return true }, // Accept all routes + func(alert *alertmanagertypes.Alert, now time.Time) bool { return 
true }, // Accept all alerts + ) + require.Len(t, alertGroups, 3) + + require.NotEmpty(t, alertGroups, "Should have alert groups created by dispatcher") + + totalAlerts := 0 + for _, group := range alertGroups { + totalAlerts += len(group.Alerts) + } + require.Equal(t, 3, totalAlerts, "Should have 3 alerts total across all groups") + require.Equal(t, "{__receiver__=\"webhook\"}:{cluster=\"prod-cluster\", instance=\"server-01\", ruleId=\"high-cpu-usage\"}", alertGroups[0].GroupKey) + require.Equal(t, "{__receiver__=\"webhook\"}:{cluster=\"prod-cluster\", instance=\"server-02\", ruleId=\"high-cpu-usage\"}", alertGroups[1].GroupKey) + require.Equal(t, "{__receiver__=\"webhook\"}:{cluster=\"prod-cluster\", instance=\"server-03\", ruleId=\"high-cpu-usage\"}", alertGroups[2].GroupKey) + }) +} diff --git a/pkg/alertmanager/alertmanagerserver/server_test.go b/pkg/alertmanager/alertmanagerserver/server_test.go index 8aad88b2ff4a..e222e319e883 100644 --- a/pkg/alertmanager/alertmanagerserver/server_test.go +++ b/pkg/alertmanager/alertmanagerserver/server_test.go @@ -19,6 +19,7 @@ import ( "github.com/prometheus/alertmanager/config" "github.com/prometheus/client_golang/prometheus" commoncfg "github.com/prometheus/common/config" + "github.com/prometheus/common/model" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -127,3 +128,189 @@ func TestServerPutAlerts(t *testing.T) { assert.Equal(t, gettableAlerts[0].Alert.Labels["alertname"], "test-alert") assert.NoError(t, server.Stop(context.Background())) } + +func TestServerTestAlert(t *testing.T) { + stateStore := alertmanagertypestest.NewStateStore() + srvCfg := NewConfig() + srvCfg.Route.GroupInterval = 1 * time.Second + notificationManager := nfmanagertest.NewMock() + server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager) + require.NoError(t, err) + + amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1") + require.NoError(t, err) + + webhook1Listener, err := net.Listen("tcp", "localhost:0") + require.NoError(t, err) + webhook2Listener, err := net.Listen("tcp", "localhost:0") + require.NoError(t, err) + + requestCount1 := 0 + requestCount2 := 0 + webhook1Server := &http.Server{ + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCount1++ + w.WriteHeader(http.StatusOK) + }), + } + webhook2Server := &http.Server{ + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCount2++ + w.WriteHeader(http.StatusOK) + }), + } + + go func() { + _ = webhook1Server.Serve(webhook1Listener) + }() + go func() { + _ = webhook2Server.Serve(webhook2Listener) + }() + + webhook1URL, err := url.Parse("http://" + webhook1Listener.Addr().String() + "/webhook") + require.NoError(t, err) + webhook2URL, err := url.Parse("http://" + webhook2Listener.Addr().String() + "/webhook") + require.NoError(t, err) + + require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{ + Name: "receiver-1", + WebhookConfigs: []*config.WebhookConfig{ + { + HTTPConfig: &commoncfg.HTTPClientConfig{}, + URL: &config.SecretURL{URL: webhook1URL}, + }, + }, + })) + + require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{ + Name: "receiver-2", + WebhookConfigs: []*config.WebhookConfig{ + { + HTTPConfig: &commoncfg.HTTPClientConfig{}, + URL: &config.SecretURL{URL: webhook2URL}, + }, + }, + })) + + require.NoError(t, server.SetConfig(context.Background(), 
amConfig)) + defer func() { + _ = server.Stop(context.Background()) + _ = webhook1Server.Close() + _ = webhook2Server.Close() + }() + + // Test with multiple alerts going to different receivers + alert1 := &alertmanagertypes.PostableAlert{ + Annotations: models.LabelSet{"alertname": "test-alert-1"}, + StartsAt: strfmt.DateTime(time.Now()), + Alert: models.Alert{ + Labels: models.LabelSet{"alertname": "test-alert-1", "severity": "critical"}, + }, + } + alert2 := &alertmanagertypes.PostableAlert{ + Annotations: models.LabelSet{"alertname": "test-alert-2"}, + StartsAt: strfmt.DateTime(time.Now()), + Alert: models.Alert{ + Labels: models.LabelSet{"alertname": "test-alert-2", "severity": "warning"}, + }, + } + + receiversMap := map[*alertmanagertypes.PostableAlert][]string{ + alert1: {"receiver-1", "receiver-2"}, + alert2: {"receiver-2"}, + } + + config := &alertmanagertypes.NotificationConfig{ + NotificationGroup: make(map[model.LabelName]struct{}), + GroupByAll: false, + } + + err = server.TestAlert(context.Background(), receiversMap, config) + require.NoError(t, err) + + time.Sleep(100 * time.Millisecond) + + assert.Greater(t, requestCount1, 0, "receiver-1 should have received at least one request") + assert.Greater(t, requestCount2, 0, "receiver-2 should have received at least one request") +} + +func TestServerTestAlertContinuesOnFailure(t *testing.T) { + stateStore := alertmanagertypestest.NewStateStore() + srvCfg := NewConfig() + srvCfg.Route.GroupInterval = 1 * time.Second + notificationManager := nfmanagertest.NewMock() + server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager) + require.NoError(t, err) + + amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1") + require.NoError(t, err) + + // Create one working webhook and one failing receiver (non-existent) + webhookListener, err := net.Listen("tcp", "localhost:0") + require.NoError(t, err) + + requestCount := 0 + webhookServer := &http.Server{ + Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCount++ + w.WriteHeader(http.StatusOK) + }), + } + + go func() { + _ = webhookServer.Serve(webhookListener) + }() + + webhookURL, err := url.Parse("http://" + webhookListener.Addr().String() + "/webhook") + require.NoError(t, err) + + require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{ + Name: "working-receiver", + WebhookConfigs: []*config.WebhookConfig{ + { + HTTPConfig: &commoncfg.HTTPClientConfig{}, + URL: &config.SecretURL{URL: webhookURL}, + }, + }, + })) + + require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{ + Name: "failing-receiver", + WebhookConfigs: []*config.WebhookConfig{ + { + HTTPConfig: &commoncfg.HTTPClientConfig{}, + URL: &config.SecretURL{URL: &url.URL{Scheme: "http", Host: "localhost:1", Path: "/webhook"}}, + }, + }, + })) + + require.NoError(t, server.SetConfig(context.Background(), amConfig)) + defer func() { + _ = server.Stop(context.Background()) + _ = webhookServer.Close() + }() + + alert := &alertmanagertypes.PostableAlert{ + Annotations: models.LabelSet{"alertname": "test-alert"}, + StartsAt: strfmt.DateTime(time.Now()), + Alert: models.Alert{ + Labels: models.LabelSet{"alertname": "test-alert"}, + }, + } + + receiversMap := map[*alertmanagertypes.PostableAlert][]string{ + alert: {"working-receiver", "failing-receiver"}, + } + + config := &alertmanagertypes.NotificationConfig{ + NotificationGroup: 
make(map[model.LabelName]struct{}), + GroupByAll: false, + } + + err = server.TestAlert(context.Background(), receiversMap, config) + assert.Error(t, err) + + time.Sleep(100 * time.Millisecond) + + assert.Greater(t, requestCount, 0, "working-receiver should have received at least one request even though failing-receiver failed") +} diff --git a/pkg/alertmanager/api.go b/pkg/alertmanager/api.go index ece7dcfa371b..c6be90b4970f 100644 --- a/pkg/alertmanager/api.go +++ b/pkg/alertmanager/api.go @@ -2,6 +2,7 @@ package alertmanager import ( "context" + "encoding/json" "io" "net/http" "time" @@ -273,3 +274,128 @@ func (api *API) CreateChannel(rw http.ResponseWriter, req *http.Request) { render.Success(rw, http.StatusNoContent, nil) } + +func (api *API) CreateRoutePolicy(rw http.ResponseWriter, req *http.Request) { + ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second) + defer cancel() + + body, err := io.ReadAll(req.Body) + if err != nil { + render.Error(rw, err) + return + } + defer req.Body.Close() + var policy alertmanagertypes.PostableRoutePolicy + err = json.Unmarshal(body, &policy) + if err != nil { + render.Error(rw, err) + return + } + + policy.ExpressionKind = alertmanagertypes.PolicyBasedExpression + + // Validate the postable route + if err := policy.Validate(); err != nil { + render.Error(rw, err) + return + } + + result, err := api.alertmanager.CreateRoutePolicy(ctx, &policy) + if err != nil { + render.Error(rw, err) + return + } + + render.Success(rw, http.StatusCreated, result) +} + +func (api *API) GetAllRoutePolicies(rw http.ResponseWriter, req *http.Request) { + ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second) + defer cancel() + + policies, err := api.alertmanager.GetAllRoutePolicies(ctx) + if err != nil { + render.Error(rw, err) + return + } + + render.Success(rw, http.StatusOK, policies) +} + +func (api *API) GetRoutePolicyByID(rw http.ResponseWriter, req *http.Request) { + ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second) + defer cancel() + + vars := mux.Vars(req) + policyID := vars["id"] + if policyID == "" { + render.Error(rw, errors.NewInvalidInputf(errors.CodeInvalidInput, "policy ID is required")) + return + } + + policy, err := api.alertmanager.GetRoutePolicyByID(ctx, policyID) + if err != nil { + render.Error(rw, err) + return + } + + render.Success(rw, http.StatusOK, policy) +} + +func (api *API) DeleteRoutePolicyByID(rw http.ResponseWriter, req *http.Request) { + ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second) + defer cancel() + + vars := mux.Vars(req) + policyID := vars["id"] + if policyID == "" { + render.Error(rw, errors.NewInvalidInputf(errors.CodeInvalidInput, "policy ID is required")) + return + } + + err := api.alertmanager.DeleteRoutePolicyByID(ctx, policyID) + if err != nil { + render.Error(rw, err) + return + } + + render.Success(rw, http.StatusNoContent, nil) +} + +func (api *API) UpdateRoutePolicy(rw http.ResponseWriter, req *http.Request) { + ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second) + defer cancel() + + vars := mux.Vars(req) + policyID := vars["id"] + if policyID == "" { + render.Error(rw, errors.NewInvalidInputf(errors.CodeInvalidInput, "policy ID is required")) + return + } + body, err := io.ReadAll(req.Body) + if err != nil { + render.Error(rw, err) + return + } + defer req.Body.Close() + var policy alertmanagertypes.PostableRoutePolicy + err = json.Unmarshal(body, &policy) + if err != nil { + render.Error(rw, err) + return + } + policy.ExpressionKind = 
alertmanagertypes.PolicyBasedExpression + + // Validate the postable route + if err := policy.Validate(); err != nil { + render.Error(rw, err) + return + } + + result, err := api.alertmanager.UpdateRoutePolicyByID(ctx, policyID, &policy) + if err != nil { + render.Error(rw, err) + return + } + render.Success(rw, http.StatusOK, result) +} diff --git a/pkg/alertmanager/nfmanager/nfmanagertest/provider.go b/pkg/alertmanager/nfmanager/nfmanagertest/provider.go index 2a321ce80712..dfb930495686 100644 --- a/pkg/alertmanager/nfmanager/nfmanagertest/provider.go +++ b/pkg/alertmanager/nfmanager/nfmanagertest/provider.go @@ -1,20 +1,29 @@ package nfmanagertest import ( + "context" + "fmt" + "strings" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + "github.com/prometheus/common/model" ) // MockNotificationManager is a simple mock implementation of NotificationManager type MockNotificationManager struct { - configs map[string]*alertmanagertypes.NotificationConfig - errors map[string]error + configs map[string]*alertmanagertypes.NotificationConfig + routes map[string]*alertmanagertypes.RoutePolicy + routesByName map[string][]*alertmanagertypes.RoutePolicy + errors map[string]error } // NewMock creates a new mock notification manager func NewMock() *MockNotificationManager { return &MockNotificationManager{ - configs: make(map[string]*alertmanagertypes.NotificationConfig), - errors: make(map[string]error), + configs: make(map[string]*alertmanagertypes.NotificationConfig), + routes: make(map[string]*alertmanagertypes.RoutePolicy), + routesByName: make(map[string][]*alertmanagertypes.RoutePolicy), + errors: make(map[string]error), } } @@ -65,6 +74,8 @@ func (m *MockNotificationManager) SetMockError(orgID, ruleID string, err error) func (m *MockNotificationManager) ClearMockData() { m.configs = make(map[string]*alertmanagertypes.NotificationConfig) + m.routes = make(map[string]*alertmanagertypes.RoutePolicy) + m.routesByName = make(map[string][]*alertmanagertypes.RoutePolicy) m.errors = make(map[string]error) } @@ -73,3 +84,241 @@ func (m *MockNotificationManager) HasConfig(orgID, ruleID string) bool { _, exists := m.configs[key] return exists } + +// Route Policy CRUD + +func (m *MockNotificationManager) CreateRoutePolicy(ctx context.Context, orgID string, route *alertmanagertypes.RoutePolicy) error { + key := getKey(orgID, "create_route") + if err := m.errors[key]; err != nil { + return err + } + + if route == nil { + return fmt.Errorf("route cannot be nil") + } + + if err := route.Validate(); err != nil { + return err + } + + routeKey := getKey(orgID, route.ID.StringValue()) + m.routes[routeKey] = route + nameKey := getKey(orgID, route.Name) + m.routesByName[nameKey] = append(m.routesByName[nameKey], route) + + return nil +} + +func (m *MockNotificationManager) CreateRoutePolicies(ctx context.Context, orgID string, routes []*alertmanagertypes.RoutePolicy) error { + key := getKey(orgID, "create_routes") + if err := m.errors[key]; err != nil { + return err + } + + if len(routes) == 0 { + return fmt.Errorf("routes cannot be empty") + } + for i, route := range routes { + if route == nil { + return fmt.Errorf("route at index %d cannot be nil", i) + } + if err := route.Validate(); err != nil { + return fmt.Errorf("route at index %d: %s", i, err.Error()) + } + } + for _, route := range routes { + if err := m.CreateRoutePolicy(ctx, orgID, route); err != nil { + return err + } + } + + return nil +} + +func (m *MockNotificationManager) GetRoutePolicyByID(ctx context.Context, orgID string, routeID 
string) (*alertmanagertypes.RoutePolicy, error) { + key := getKey(orgID, "get_route") + if err := m.errors[key]; err != nil { + return nil, err + } + + if routeID == "" { + return nil, fmt.Errorf("routeID cannot be empty") + } + + routeKey := getKey(orgID, routeID) + route, exists := m.routes[routeKey] + if !exists { + return nil, fmt.Errorf("route with ID %s not found", routeID) + } + + return route, nil +} + +func (m *MockNotificationManager) GetAllRoutePolicies(ctx context.Context, orgID string) ([]*alertmanagertypes.RoutePolicy, error) { + key := getKey(orgID, "get_all_routes") + if err := m.errors[key]; err != nil { + return nil, err + } + + if orgID == "" { + return nil, fmt.Errorf("orgID cannot be empty") + } + + var routes []*alertmanagertypes.RoutePolicy + for routeKey, route := range m.routes { + if route.OrgID == orgID { + routes = append(routes, route) + } + _ = routeKey + } + + return routes, nil +} + +func (m *MockNotificationManager) DeleteRoutePolicy(ctx context.Context, orgID string, routeID string) error { + key := getKey(orgID, "delete_route") + if err := m.errors[key]; err != nil { + return err + } + + if routeID == "" { + return fmt.Errorf("routeID cannot be empty") + } + + routeKey := getKey(orgID, routeID) + route, exists := m.routes[routeKey] + if !exists { + return fmt.Errorf("route with ID %s not found", routeID) + } + delete(m.routes, routeKey) + + nameKey := getKey(orgID, route.Name) + if nameRoutes, exists := m.routesByName[nameKey]; exists { + var filtered []*alertmanagertypes.RoutePolicy + for _, r := range nameRoutes { + if r.ID.StringValue() != routeID { + filtered = append(filtered, r) + } + } + if len(filtered) == 0 { + delete(m.routesByName, nameKey) + } else { + m.routesByName[nameKey] = filtered + } + } + + return nil +} + +func (m *MockNotificationManager) DeleteAllRoutePoliciesByName(ctx context.Context, orgID string, name string) error { + key := getKey(orgID, "delete_routes_by_name") + if err := m.errors[key]; err != nil { + return err + } + + if orgID == "" { + return fmt.Errorf("orgID cannot be empty") + } + + if name == "" { + return fmt.Errorf("name cannot be empty") + } + + nameKey := getKey(orgID, name) + routes, exists := m.routesByName[nameKey] + if !exists { + return nil // No routes to delete + } + + for _, route := range routes { + routeKey := getKey(orgID, route.ID.StringValue()) + delete(m.routes, routeKey) + } + + delete(m.routesByName, nameKey) + + return nil +} + +func (m *MockNotificationManager) Match(ctx context.Context, orgID string, ruleID string, set model.LabelSet) ([]string, error) { + key := getKey(orgID, ruleID) + if err := m.errors[key]; err != nil { + return nil, err + } + + config, err := m.GetNotificationConfig(orgID, ruleID) + if err != nil { + return nil, err + } + + var expressionRoutes []*alertmanagertypes.RoutePolicy + if config.UsePolicy { + for _, route := range m.routes { + if route.OrgID == orgID && route.ExpressionKind == alertmanagertypes.PolicyBasedExpression { + expressionRoutes = append(expressionRoutes, route) + } + } + } else { + nameKey := getKey(orgID, ruleID) + if routes, exists := m.routesByName[nameKey]; exists { + expressionRoutes = routes + } + } + + var matchedChannels []string + for _, route := range expressionRoutes { + if m.evaluateExpr(route.Expression, set) { + matchedChannels = append(matchedChannels, route.Channels...) 
+ } + } + + return matchedChannels, nil +} + +func (m *MockNotificationManager) evaluateExpr(expression string, labelSet model.LabelSet) bool { + ruleID, ok := labelSet["ruleId"] + if !ok { + return false + } + if strings.Contains(expression, `ruleId in ["ruleId-OtherAlert", "ruleId-TestingAlert"]`) { + return ruleID == "ruleId-OtherAlert" || ruleID == "ruleId-TestingAlert" + } + if strings.Contains(expression, `ruleId in ["ruleId-HighLatency", "ruleId-HighErrorRate"]`) { + return ruleID == "ruleId-HighLatency" || ruleID == "ruleId-HighErrorRate" + } + if strings.Contains(expression, `ruleId == "ruleId-HighLatency"`) { + return ruleID == "ruleId-HighLatency" + } + + return false +} + +// Helper methods for testing + +func (m *MockNotificationManager) SetMockRoute(orgID string, route *alertmanagertypes.RoutePolicy) { + routeKey := getKey(orgID, route.ID.StringValue()) + m.routes[routeKey] = route + + nameKey := getKey(orgID, route.Name) + m.routesByName[nameKey] = append(m.routesByName[nameKey], route) +} + +func (m *MockNotificationManager) SetMockRouteError(orgID, operation string, err error) { + key := getKey(orgID, operation) + m.errors[key] = err +} + +func (m *MockNotificationManager) ClearMockRoutes() { + m.routes = make(map[string]*alertmanagertypes.RoutePolicy) + m.routesByName = make(map[string][]*alertmanagertypes.RoutePolicy) +} + +func (m *MockNotificationManager) GetRouteCount() int { + return len(m.routes) +} + +func (m *MockNotificationManager) HasRoute(orgID, routeID string) bool { + routeKey := getKey(orgID, routeID) + _, exists := m.routes[routeKey] + return exists +} diff --git a/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest/route.go b/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest/route.go new file mode 100644 index 000000000000..f0fb06689ec5 --- /dev/null +++ b/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest/route.go @@ -0,0 +1,176 @@ +package nfroutingstoretest + +import ( + "context" + "regexp" + "strings" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore" + "github.com/SigNoz/signoz/pkg/sqlstore" + "github.com/SigNoz/signoz/pkg/sqlstore/sqlstoretest" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" +) + +type MockSQLRouteStore struct { + routeStore alertmanagertypes.RouteStore + mock sqlmock.Sqlmock +} + +func NewMockSQLRouteStore() *MockSQLRouteStore { + sqlStore := sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherRegexp) + routeStore := sqlroutingstore.NewStore(sqlStore) + + return &MockSQLRouteStore{ + routeStore: routeStore, + mock: sqlStore.Mock(), + } +} + +func (m *MockSQLRouteStore) Mock() sqlmock.Sqlmock { + return m.mock +} + +func (m *MockSQLRouteStore) GetByID(ctx context.Context, orgId string, id string) (*alertmanagertypes.RoutePolicy, error) { + return m.routeStore.GetByID(ctx, orgId, id) +} + +func (m *MockSQLRouteStore) Create(ctx context.Context, route *alertmanagertypes.RoutePolicy) error { + return m.routeStore.Create(ctx, route) +} + +func (m *MockSQLRouteStore) CreateBatch(ctx context.Context, routes []*alertmanagertypes.RoutePolicy) error { + return m.routeStore.CreateBatch(ctx, routes) +} + +func (m *MockSQLRouteStore) Delete(ctx context.Context, orgId string, id string) error { + return m.routeStore.Delete(ctx, orgId, id) +} + +func (m *MockSQLRouteStore) GetAllByKind(ctx context.Context, orgID string, kind alertmanagertypes.ExpressionKind) ([]*alertmanagertypes.RoutePolicy, error) { + return 
m.routeStore.GetAllByKind(ctx, orgID, kind) +} + +func (m *MockSQLRouteStore) GetAllByName(ctx context.Context, orgID string, name string) ([]*alertmanagertypes.RoutePolicy, error) { + return m.routeStore.GetAllByName(ctx, orgID, name) +} + +func (m *MockSQLRouteStore) DeleteRouteByName(ctx context.Context, orgID string, name string) error { + return m.routeStore.DeleteRouteByName(ctx, orgID, name) +} + +func (m *MockSQLRouteStore) ExpectGetByID(orgID, id string, route *alertmanagertypes.RoutePolicy) { + rows := sqlmock.NewRows([]string{"id", "org_id", "name", "expression", "kind", "description", "enabled", "tags", "channels", "created_at", "updated_at", "created_by", "updated_by"}) + + if route != nil { + rows.AddRow( + route.ID.StringValue(), + route.OrgID, + route.Name, + route.Expression, + route.ExpressionKind.StringValue(), + route.Description, + route.Enabled, + "[]", // tags as JSON + `["`+strings.Join(route.Channels, `","`)+`"]`, // channels as JSON + "0001-01-01T00:00:00Z", // created_at + "0001-01-01T00:00:00Z", // updated_at + "", // created_by + "", // updated_by + ) + } + + m.mock.ExpectQuery(`SELECT (.+) FROM "route_policy" WHERE \(id = \$1\) AND \(org_id = \$2\)`). + WithArgs(id, orgID). + WillReturnRows(rows) +} + +func (m *MockSQLRouteStore) ExpectCreate(route *alertmanagertypes.RoutePolicy) { + expectedPattern := `INSERT INTO "route_policy" \(.+\) VALUES .+` + m.mock.ExpectExec(expectedPattern). + WillReturnResult(sqlmock.NewResult(1, 1)) +} + +func (m *MockSQLRouteStore) ExpectCreateBatch(routes []*alertmanagertypes.RoutePolicy) { + if len(routes) == 0 { + return + } + + // Simplified pattern that should match any INSERT into route_policy + expectedPattern := `INSERT INTO "route_policy" \(.+\) VALUES .+` + + m.mock.ExpectExec(expectedPattern). + WillReturnResult(sqlmock.NewResult(1, int64(len(routes)))) +} + +func (m *MockSQLRouteStore) ExpectDelete(orgID, id string) { + m.mock.ExpectExec(`DELETE FROM "route_policy" AS "route_policy" WHERE \(org_id = '` + regexp.QuoteMeta(orgID) + `'\) AND \(id = '` + regexp.QuoteMeta(id) + `'\)`). + WillReturnResult(sqlmock.NewResult(0, 1)) +} + +func (m *MockSQLRouteStore) ExpectGetAllByKindAndOrgID(orgID string, kind alertmanagertypes.ExpressionKind, routes []*alertmanagertypes.RoutePolicy) { + rows := sqlmock.NewRows([]string{"id", "org_id", "name", "expression", "kind", "description", "enabled", "tags", "channels", "created_at", "updated_at", "created_by", "updated_by"}) + + for _, route := range routes { + if route.OrgID == orgID && route.ExpressionKind == kind { + rows.AddRow( + route.ID.StringValue(), + route.OrgID, + route.Name, + route.Expression, + route.ExpressionKind.StringValue(), + route.Description, + route.Enabled, + "[]", // tags as JSON + `["`+strings.Join(route.Channels, `","`)+`"]`, // channels as JSON + "0001-01-01T00:00:00Z", // created_at + "0001-01-01T00:00:00Z", // updated_at + "", // created_by + "", // updated_by + ) + } + } + + m.mock.ExpectQuery(`SELECT (.+) FROM "route_policy" WHERE \(org_id = '` + regexp.QuoteMeta(orgID) + `'\) AND \(kind = '` + regexp.QuoteMeta(kind.StringValue()) + `'\)`). 
+ WillReturnRows(rows) +} + +func (m *MockSQLRouteStore) ExpectGetAllByName(orgID, name string, routes []*alertmanagertypes.RoutePolicy) { + rows := sqlmock.NewRows([]string{"id", "org_id", "name", "expression", "kind", "description", "enabled", "tags", "channels", "created_at", "updated_at", "created_by", "updated_by"}) + + for _, route := range routes { + if route.OrgID == orgID && route.Name == name { + rows.AddRow( + route.ID.StringValue(), + route.OrgID, + route.Name, + route.Expression, + route.ExpressionKind.StringValue(), + route.Description, + route.Enabled, + "[]", // tags as JSON + `["`+strings.Join(route.Channels, `","`)+`"]`, // channels as JSON + "0001-01-01T00:00:00Z", // created_at + "0001-01-01T00:00:00Z", // updated_at + "", // created_by + "", // updated_by + ) + } + } + + m.mock.ExpectQuery(`SELECT (.+) FROM "route_policy" WHERE \(org_id = '` + regexp.QuoteMeta(orgID) + `'\) AND \(name = '` + regexp.QuoteMeta(name) + `'\)`). + WillReturnRows(rows) +} + +func (m *MockSQLRouteStore) ExpectDeleteRouteByName(orgID, name string) { + m.mock.ExpectExec(`DELETE FROM "route_policy" AS "route_policy" WHERE \(org_id = '` + regexp.QuoteMeta(orgID) + `'\) AND \(name = '` + regexp.QuoteMeta(name) + `'\)`). + WillReturnResult(sqlmock.NewResult(0, 1)) +} + +func (m *MockSQLRouteStore) ExpectationsWereMet() error { + return m.mock.ExpectationsWereMet() +} + +func (m *MockSQLRouteStore) MatchExpectationsInOrder(match bool) { + m.mock.MatchExpectationsInOrder(match) +} diff --git a/pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore/store.go b/pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore/store.go new file mode 100644 index 000000000000..78504834c554 --- /dev/null +++ b/pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore/store.go @@ -0,0 +1,93 @@ +package sqlroutingstore + +import ( + "context" + "database/sql" + + "github.com/SigNoz/signoz/pkg/errors" + "github.com/SigNoz/signoz/pkg/sqlstore" + routeTypes "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" +) + +type store struct { + sqlstore sqlstore.SQLStore +} + +func NewStore(sqlstore sqlstore.SQLStore) routeTypes.RouteStore { + return &store{ + sqlstore: sqlstore, + } +} + +func (store *store) GetByID(ctx context.Context, orgId string, id string) (*routeTypes.RoutePolicy, error) { + route := new(routeTypes.RoutePolicy) + err := store.sqlstore.BunDBCtx(ctx).NewSelect().Model(route).Where("id = ?", id).Where("org_id = ?", orgId).Scan(ctx) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, store.sqlstore.WrapNotFoundErrf(err, errors.CodeNotFound, "routing policy with ID: %s does not exist", id) + } + return nil, errors.Wrapf(err, errors.TypeInternal, errors.CodeInternal, "unable to fetch routing policy with ID: %s", id) + } + + return route, nil +} + +func (store *store) Create(ctx context.Context, route *routeTypes.RoutePolicy) error { + _, err := store.sqlstore.BunDBCtx(ctx).NewInsert().Model(route).Exec(ctx) + if err != nil { + return errors.NewInternalf(errors.CodeInternal, "error creating routing policy with ID: %s", route.ID) + } + + return nil +} + +func (store *store) CreateBatch(ctx context.Context, route []*routeTypes.RoutePolicy) error { + _, err := store.sqlstore.BunDBCtx(ctx).NewInsert().Model(&route).Exec(ctx) + if err != nil { + return errors.NewInternalf(errors.CodeInternal, "error creating routing policies: %v", err) + } + + return nil +} + +func (store *store) Delete(ctx context.Context, orgId string, id string) error { + _, err := 
store.sqlstore.BunDBCtx(ctx).NewDelete().Model((*routeTypes.RoutePolicy)(nil)).Where("org_id = ?", orgId).Where("id = ?", id).Exec(ctx) + if err != nil { + return errors.Wrapf(err, errors.TypeInternal, errors.CodeInternal, "unable to delete routing policy with ID: %s", id) + } + + return nil +} + +func (store *store) GetAllByKind(ctx context.Context, orgID string, kind routeTypes.ExpressionKind) ([]*routeTypes.RoutePolicy, error) { + var routes []*routeTypes.RoutePolicy + err := store.sqlstore.BunDBCtx(ctx).NewSelect().Model(&routes).Where("org_id = ?", orgID).Where("kind = ?", kind).Scan(ctx) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil, errors.NewNotFoundf(errors.CodeNotFound, "no routing policies found for orgID: %s", orgID) + } + return nil, errors.Wrapf(err, errors.TypeInternal, errors.CodeInternal, "unable to fetch routing policies for orgID: %s", orgID) + } + return routes, nil +} + +func (store *store) GetAllByName(ctx context.Context, orgID string, name string) ([]*routeTypes.RoutePolicy, error) { + var routes []*routeTypes.RoutePolicy + err := store.sqlstore.BunDBCtx(ctx).NewSelect().Model(&routes).Where("org_id = ?", orgID).Where("name = ?", name).Scan(ctx) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return routes, errors.NewNotFoundf(errors.CodeNotFound, "no routing policies found for orgID: %s and name: %s", orgID, name) + } + return nil, errors.Wrapf(err, errors.TypeInternal, errors.CodeInternal, "unable to fetch routing policies for orgID: %s and name: %s", orgID, name) + } + return routes, nil +} + +func (store *store) DeleteRouteByName(ctx context.Context, orgID string, name string) error { + _, err := store.sqlstore.BunDBCtx(ctx).NewDelete().Model((*routeTypes.RoutePolicy)(nil)).Where("org_id = ?", orgID).Where("name = ?", name).Exec(ctx) + if err != nil { + return errors.Wrapf(err, errors.TypeInternal, errors.CodeInternal, "unable to delete routing policies with name: %s", name) + } + + return nil +} diff --git a/pkg/alertmanager/nfmanager/notificationmanager.go b/pkg/alertmanager/nfmanager/notificationmanager.go index 531c2baae725..5f44e385bee7 100644 --- a/pkg/alertmanager/nfmanager/notificationmanager.go +++ b/pkg/alertmanager/nfmanager/notificationmanager.go @@ -2,12 +2,27 @@ package nfmanager import ( + "context" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + "github.com/prometheus/common/model" ) -// NotificationManager defines how alerts should be grouped and configured for notification with multi-tenancy support. +// NotificationManager defines how alerts should be grouped and configured for notification. 
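+// It owns per-rule notification configs, expression-based route policies, and matching of a
+// rule's label set to the channels that should be notified.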
type NotificationManager interface { + // Notification Config CRUD GetNotificationConfig(orgID string, ruleID string) (*alertmanagertypes.NotificationConfig, error) SetNotificationConfig(orgID string, ruleID string, config *alertmanagertypes.NotificationConfig) error DeleteNotificationConfig(orgID string, ruleID string) error + + // Route Policy CRUD + CreateRoutePolicy(ctx context.Context, orgID string, route *alertmanagertypes.RoutePolicy) error + CreateRoutePolicies(ctx context.Context, orgID string, routes []*alertmanagertypes.RoutePolicy) error + GetRoutePolicyByID(ctx context.Context, orgID string, routeID string) (*alertmanagertypes.RoutePolicy, error) + GetAllRoutePolicies(ctx context.Context, orgID string) ([]*alertmanagertypes.RoutePolicy, error) + DeleteRoutePolicy(ctx context.Context, orgID string, routeID string) error + DeleteAllRoutePoliciesByName(ctx context.Context, orgID string, name string) error + + // Route matching + Match(ctx context.Context, orgID string, ruleID string, set model.LabelSet) ([]string, error) } diff --git a/pkg/alertmanager/nfmanager/rulebasednotification/provider.go b/pkg/alertmanager/nfmanager/rulebasednotification/provider.go index 0ce4141ad1d9..13a33184edfc 100644 --- a/pkg/alertmanager/nfmanager/rulebasednotification/provider.go +++ b/pkg/alertmanager/nfmanager/rulebasednotification/provider.go @@ -2,11 +2,14 @@ package rulebasednotification import ( "context" + "strings" "sync" "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/errors" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + "github.com/expr-lang/expr" + "github.com/prometheus/common/model" "github.com/SigNoz/signoz/pkg/factory" ) @@ -14,26 +17,28 @@ import ( type provider struct { settings factory.ScopedProviderSettings orgToFingerprintToNotificationConfig map[string]map[string]alertmanagertypes.NotificationConfig + routeStore alertmanagertypes.RouteStore mutex sync.RWMutex } // NewFactory creates a new factory for the rule-based grouping strategy. -func NewFactory() factory.ProviderFactory[nfmanager.NotificationManager, nfmanager.Config] { +func NewFactory(routeStore alertmanagertypes.RouteStore) factory.ProviderFactory[nfmanager.NotificationManager, nfmanager.Config] { return factory.NewProviderFactory( factory.MustNewName("rulebased"), func(ctx context.Context, settings factory.ProviderSettings, config nfmanager.Config) (nfmanager.NotificationManager, error) { - return New(ctx, settings, config) + return New(ctx, settings, config, routeStore) }, ) } // New creates a new rule-based grouping strategy provider. 
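+// The supplied routeStore backs route-policy CRUD and channel matching.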
-func New(ctx context.Context, providerSettings factory.ProviderSettings, config nfmanager.Config) (nfmanager.NotificationManager, error) { +func New(ctx context.Context, providerSettings factory.ProviderSettings, config nfmanager.Config, routeStore alertmanagertypes.RouteStore) (nfmanager.NotificationManager, error) { settings := factory.NewScopedProviderSettings(providerSettings, "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification") return &provider{ settings: settings, orgToFingerprintToNotificationConfig: make(map[string]map[string]alertmanagertypes.NotificationConfig), + routeStore: routeStore, }, nil } @@ -58,6 +63,8 @@ func (r *provider) GetNotificationConfig(orgID string, ruleID string) (*alertman for k, v := range config.NotificationGroup { notificationConfig.NotificationGroup[k] = v } + notificationConfig.UsePolicy = config.UsePolicy + notificationConfig.GroupByAll = config.GroupByAll } } @@ -101,3 +108,147 @@ func (r *provider) DeleteNotificationConfig(orgID string, ruleID string) error { return nil } + +func (r *provider) CreateRoutePolicy(ctx context.Context, orgID string, route *alertmanagertypes.RoutePolicy) error { + if route == nil { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "route policy cannot be nil") + } + + err := route.Validate() + if err != nil { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "invalid route policy: %v", err) + } + + return r.routeStore.Create(ctx, route) +} + +func (r *provider) CreateRoutePolicies(ctx context.Context, orgID string, routes []*alertmanagertypes.RoutePolicy) error { + if len(routes) == 0 { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "route policies cannot be empty") + } + + for _, route := range routes { + if route == nil { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "route policy cannot be nil") + } + if err := route.Validate(); err != nil { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "route policy with name %s: %s", route.Name, err.Error()) + } + } + return r.routeStore.CreateBatch(ctx, routes) +} + +func (r *provider) GetRoutePolicyByID(ctx context.Context, orgID string, routeID string) (*alertmanagertypes.RoutePolicy, error) { + if routeID == "" { + return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "routeID cannot be empty") + } + + return r.routeStore.GetByID(ctx, orgID, routeID) +} + +func (r *provider) GetAllRoutePolicies(ctx context.Context, orgID string) ([]*alertmanagertypes.RoutePolicy, error) { + if orgID == "" { + return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "orgID cannot be empty") + } + + return r.routeStore.GetAllByKind(ctx, orgID, alertmanagertypes.PolicyBasedExpression) +} + +func (r *provider) DeleteRoutePolicy(ctx context.Context, orgID string, routeID string) error { + if routeID == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "routeID cannot be empty") + } + + return r.routeStore.Delete(ctx, orgID, routeID) +} + +func (r *provider) DeleteAllRoutePoliciesByName(ctx context.Context, orgID string, name string) error { + if orgID == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "orgID cannot be empty") + } + if name == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "name cannot be empty") + } + return r.routeStore.DeleteRouteByName(ctx, orgID, name) +} + +func (r *provider) Match(ctx context.Context, orgID string, ruleID string, set model.LabelSet) ([]string, error) { + config, err := r.GetNotificationConfig(orgID, ruleID) + if err != nil { + return 
nil, errors.NewInternalf(errors.CodeInternal, "error getting notification configuration: %v", err) + } + var expressionRoutes []*alertmanagertypes.RoutePolicy + if config.UsePolicy { + expressionRoutes, err = r.routeStore.GetAllByKind(ctx, orgID, alertmanagertypes.PolicyBasedExpression) + if err != nil { + return []string{}, errors.NewInternalf(errors.CodeInternal, "error getting route policies: %v", err) + } + } else { + expressionRoutes, err = r.routeStore.GetAllByName(ctx, orgID, ruleID) + if err != nil { + return []string{}, errors.NewInternalf(errors.CodeInternal, "error getting route policies: %v", err) + } + } + var matchedChannels []string + if _, ok := set[alertmanagertypes.NoDataLabel]; ok && !config.UsePolicy { + for _, expressionRoute := range expressionRoutes { + matchedChannels = append(matchedChannels, expressionRoute.Channels...) + } + return matchedChannels, nil + } + + for _, route := range expressionRoutes { + evaluateExpr, err := r.evaluateExpr(route.Expression, set) + if err != nil { + continue + } + if evaluateExpr { + matchedChannels = append(matchedChannels, route.Channels...) + } + } + + return matchedChannels, nil +} + +func (r *provider) evaluateExpr(expression string, labelSet model.LabelSet) (bool, error) { + env := make(map[string]interface{}) + + for k, v := range labelSet { + key := string(k) + value := string(v) + + if strings.Contains(key, ".") { + parts := strings.Split(key, ".") + current := env + + for i, part := range parts { + if i == len(parts)-1 { + current[part] = value + } else { + if current[part] == nil { + current[part] = make(map[string]interface{}) + } + current = current[part].(map[string]interface{}) + } + } + } else { + env[key] = value + } + } + + program, err := expr.Compile(expression, expr.Env(env)) + if err != nil { + return false, errors.NewInternalf(errors.CodeInternal, "error compiling route policy %s: %v", expression, err) + } + + output, err := expr.Run(program, env) + if err != nil { + return false, errors.NewInternalf(errors.CodeInternal, "error running route policy %s: %v", expression, err) + } + + if boolVal, ok := output.(bool); ok { + return boolVal, nil + } + + return false, errors.NewInternalf(errors.CodeInternal, "error in evaluating route policy %s: %v", expression, err) +} diff --git a/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go b/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go index b380cc1cee89..37ef095ebfb5 100644 --- a/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go +++ b/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go @@ -2,18 +2,22 @@ package rulebasednotification import ( "context" - "github.com/prometheus/common/model" "sync" "testing" "time" "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest" "github.com/SigNoz/signoz/pkg/factory" "github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest" + "github.com/SigNoz/signoz/pkg/types" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" - "github.com/prometheus/alertmanager/types" + "github.com/SigNoz/signoz/pkg/valuer" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + + "github.com/prometheus/common/model" ) func createTestProviderSettings() factory.ProviderSettings { @@ -21,7 +25,8 @@ func createTestProviderSettings() factory.ProviderSettings { } func TestNewFactory(t *testing.T) { - providerFactory := NewFactory() + routeStore := 
nfroutingstoretest.NewMockSQLRouteStore() + providerFactory := NewFactory(routeStore) assert.NotNil(t, providerFactory) assert.Equal(t, "rulebased", providerFactory.Name().String()) } @@ -31,7 +36,8 @@ func TestNew(t *testing.T) { providerSettings := createTestProviderSettings() config := nfmanager.Config{} - provider, err := New(ctx, providerSettings, config) + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) require.NoError(t, err) assert.NotNil(t, provider) @@ -44,7 +50,8 @@ func TestProvider_SetNotificationConfig(t *testing.T) { providerSettings := createTestProviderSettings() config := nfmanager.Config{} - provider, err := New(ctx, providerSettings, config) + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) require.NoError(t, err) tests := []struct { @@ -124,11 +131,12 @@ func TestProvider_GetNotificationConfig(t *testing.T) { providerSettings := createTestProviderSettings() config := nfmanager.Config{} - provider, err := New(ctx, providerSettings, config) + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) require.NoError(t, err) orgID := "test-org" - ruleID := "rule1" + ruleID := "ruleId" customConfig := &alertmanagertypes.NotificationConfig{ Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 30 * time.Minute, @@ -144,7 +152,6 @@ func TestProvider_GetNotificationConfig(t *testing.T) { }, } - // Set config for alert1 err = provider.SetNotificationConfig(orgID, ruleID, customConfig) require.NoError(t, err) @@ -155,7 +162,7 @@ func TestProvider_GetNotificationConfig(t *testing.T) { name string orgID string ruleID string - alert *types.Alert + alert *alertmanagertypes.Alert expectedConfig *alertmanagertypes.NotificationConfig shouldFallback bool }{ @@ -165,7 +172,7 @@ func TestProvider_GetNotificationConfig(t *testing.T) { ruleID: ruleID, expectedConfig: &alertmanagertypes.NotificationConfig{ NotificationGroup: map[model.LabelName]struct{}{ - model.LabelName("ruleId"): {}, + model.LabelName(ruleID): {}, }, Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 30 * time.Minute, @@ -182,13 +189,13 @@ func TestProvider_GetNotificationConfig(t *testing.T) { NotificationGroup: map[model.LabelName]struct{}{ model.LabelName("group1"): {}, model.LabelName("group2"): {}, - model.LabelName("ruleId"): {}, + model.LabelName(ruleID): {}, }, Renotify: alertmanagertypes.ReNotificationConfig{ RenotifyInterval: 4 * time.Hour, NoDataInterval: 4 * time.Hour, }, - }, // Will get fallback from standardnotification + }, shouldFallback: false, }, { @@ -231,7 +238,8 @@ func TestProvider_ConcurrentAccess(t *testing.T) { providerSettings := createTestProviderSettings() config := nfmanager.Config{} - provider, err := New(ctx, providerSettings, config) + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) require.NoError(t, err) orgID := "test-org" @@ -268,3 +276,430 @@ func TestProvider_ConcurrentAccess(t *testing.T) { // Wait for both goroutines to complete wg.Wait() } + +func TestProvider_EvaluateExpression(t *testing.T) { + provider := &provider{} + + tests := []struct { + name string + expression string + labelSet model.LabelSet + expected bool + }{ + { + name: "simple equality check - match", + expression: `threshold.name == 'auth' && ruleId == 'rule1'`, + labelSet: model.LabelSet{ + 
"threshold.name": "auth", + "ruleId": "rule1", + }, + expected: true, + }, + { + name: "simple equality check - no match", + expression: `service == "payment"`, + labelSet: model.LabelSet{ + "service": "auth", + "env": "production", + }, + expected: false, + }, + { + name: "multiple conditions with AND - both match", + expression: `service == "auth" && env == "production"`, + labelSet: model.LabelSet{ + "service": "auth", + "env": "production", + }, + expected: true, + }, + { + name: "multiple conditions with AND - one doesn't match", + expression: `service == "auth" && env == "staging"`, + labelSet: model.LabelSet{ + "service": "auth", + "env": "production", + }, + expected: false, + }, + { + name: "multiple conditions with OR - one matches", + expression: `service == "payment" || env == "production"`, + labelSet: model.LabelSet{ + "service": "auth", + "env": "production", + }, + expected: true, + }, + { + name: "multiple conditions with OR - none match", + expression: `service == "payment" || env == "staging"`, + labelSet: model.LabelSet{ + "service": "auth", + "env": "production", + }, + expected: false, + }, + { + name: "in operator - value in list", + expression: `service in ["auth", "payment", "notification"]`, + labelSet: model.LabelSet{ + "service": "auth", + }, + expected: true, + }, + { + name: "in operator - value not in list", + expression: `service in ["payment", "notification"]`, + labelSet: model.LabelSet{ + "service": "auth", + }, + expected: false, + }, + { + name: "contains operator - substring match", + expression: `host contains "prod"`, + labelSet: model.LabelSet{ + "host": "prod-server-01", + }, + expected: true, + }, + { + name: "contains operator - no substring match", + expression: `host contains "staging"`, + labelSet: model.LabelSet{ + "host": "prod-server-01", + }, + expected: false, + }, + { + name: "complex expression with parentheses", + expression: `(service == "auth" && env == "production") || critical == "true"`, + labelSet: model.LabelSet{ + "service": "payment", + "env": "staging", + "critical": "true", + }, + expected: true, + }, + { + name: "missing label key", + expression: `"missing_key" == "value"`, + labelSet: model.LabelSet{ + "service": "auth", + }, + expected: false, + }, + { + name: "rule-based expression with threshold name and ruleId", + expression: `'threshold.name' == "high-cpu" && ruleId == "rule-123"`, + labelSet: model.LabelSet{ + "threshold.name": "high-cpu", + "ruleId": "rule-123", + "service": "auth", + }, + expected: false, //no commas + }, + { + name: "alertname and ruleId combination", + expression: `alertname == "HighCPUUsage" && ruleId == "cpu-alert-001"`, + labelSet: model.LabelSet{ + "alertname": "HighCPUUsage", + "ruleId": "cpu-alert-001", + "severity": "critical", + }, + expected: true, + }, + { + name: "kubernetes namespace filtering", + expression: `k8s.namespace.name == "auth" && service in ["auth", "payment"]`, + labelSet: model.LabelSet{ + "k8s.namespace.name": "auth", + "service": "auth", + "host": "k8s-node-1", + }, + expected: true, + }, + { + name: "migration expression format from SQL migration", + expression: `threshold.name == "HighCPUUsage" && ruleId == "rule-uuid-123"`, + labelSet: model.LabelSet{ + "threshold.name": "HighCPUUsage", + "ruleId": "rule-uuid-123", + "severity": "warning", + }, + expected: true, + }, + { + name: "case sensitive matching", + expression: `service == "Auth"`, // capital A + labelSet: model.LabelSet{ + "service": "auth", // lowercase a + }, + expected: false, + }, + { + name: "numeric 
comparison as strings", + expression: `port == "8080"`, + labelSet: model.LabelSet{ + "port": "8080", + }, + expected: true, + }, + { + name: "quoted string with special characters", + expression: `service == "auth-service-v2"`, + labelSet: model.LabelSet{ + "service": "auth-service-v2", + }, + expected: true, + }, + { + name: "boolean operators precedence", + expression: `service == "auth" && env == "prod" || critical == "true"`, + labelSet: model.LabelSet{ + "service": "payment", + "env": "staging", + "critical": "true", + }, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := provider.evaluateExpr(tt.expression, tt.labelSet) + assert.NoError(t, err) + assert.Equal(t, tt.expected, result, "Expression: %s", tt.expression) + }) + } +} + +func TestProvider_DeleteRoute(t *testing.T) { + ctx := context.Background() + providerSettings := createTestProviderSettings() + config := nfmanager.Config{} + + tests := []struct { + name string + orgID string + routeID string + wantErr bool + }{ + { + name: "valid parameters", + orgID: "test-org-123", + routeID: "route-uuid-456", + wantErr: false, + }, + { + name: "empty routeID", + orgID: "test-org-123", + routeID: "", + wantErr: true, + }, + { + name: "valid orgID with valid routeID", + orgID: "another-org", + routeID: "another-route-id", + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) + require.NoError(t, err) + + if !tt.wantErr { + routeStore.ExpectDelete(tt.orgID, tt.routeID) + } + + err = provider.DeleteRoutePolicy(ctx, tt.orgID, tt.routeID) + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.NoError(t, routeStore.ExpectationsWereMet()) + } + }) + } +} + +func TestProvider_CreateRoute(t *testing.T) { + ctx := context.Background() + providerSettings := createTestProviderSettings() + config := nfmanager.Config{} + + tests := []struct { + name string + orgID string + route *alertmanagertypes.RoutePolicy + wantErr bool + }{ + { + name: "valid route", + orgID: "test-org-123", + route: &alertmanagertypes.RoutePolicy{ + Identifiable: types.Identifiable{ID: valuer.GenerateUUID()}, + Expression: `service == "auth"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "auth-service-route", + Description: "Route for auth service alerts", + Enabled: true, + OrgID: "test-org-123", + Channels: []string{"slack-channel"}, + }, + wantErr: false, + }, + { + name: "nil route", + orgID: "test-org-123", + route: nil, + wantErr: true, + }, + { + name: "invalid route - missing expression", + orgID: "test-org-123", + route: &alertmanagertypes.RoutePolicy{ + Expression: "", // empty expression + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "invalid-route", + OrgID: "test-org-123", + }, + wantErr: true, + }, + { + name: "invalid route - missing name", + orgID: "test-org-123", + route: &alertmanagertypes.RoutePolicy{ + Expression: `service == "auth"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "", // empty name + OrgID: "test-org-123", + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) + require.NoError(t, err) + + if !tt.wantErr && tt.route != nil { + 
routeStore.ExpectCreate(tt.route) + } + + err = provider.CreateRoutePolicy(ctx, tt.orgID, tt.route) + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.NoError(t, routeStore.ExpectationsWereMet()) + } + }) + } +} + +func TestProvider_CreateRoutes(t *testing.T) { + ctx := context.Background() + providerSettings := createTestProviderSettings() + config := nfmanager.Config{} + + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + provider, err := New(ctx, providerSettings, config, routeStore) + require.NoError(t, err) + + validRoute1 := &alertmanagertypes.RoutePolicy{ + Expression: `service == "auth"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "auth-route", + Description: "Auth service route", + Enabled: true, + OrgID: "test-org", + Channels: []string{"slack-auth"}, + } + + validRoute2 := &alertmanagertypes.RoutePolicy{ + Expression: `service == "payment"`, + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "payment-route", + Description: "Payment service route", + Enabled: true, + OrgID: "test-org", + Channels: []string{"slack-payment"}, + } + + invalidRoute := &alertmanagertypes.RoutePolicy{ + Expression: "", // empty expression - invalid + ExpressionKind: alertmanagertypes.PolicyBasedExpression, + Name: "invalid-route", + OrgID: "test-org", + } + + tests := []struct { + name string + orgID string + routes []*alertmanagertypes.RoutePolicy + wantErr bool + }{ + { + name: "valid routes", + orgID: "test-org", + routes: []*alertmanagertypes.RoutePolicy{validRoute1, validRoute2}, + wantErr: false, + }, + { + name: "empty routes list", + orgID: "test-org", + routes: []*alertmanagertypes.RoutePolicy{}, + wantErr: true, + }, + { + name: "nil routes list", + orgID: "test-org", + routes: nil, + wantErr: true, + }, + { + name: "routes with nil route", + orgID: "test-org", + routes: []*alertmanagertypes.RoutePolicy{validRoute1, nil}, + wantErr: true, + }, + { + name: "routes with invalid route", + orgID: "test-org", + routes: []*alertmanagertypes.RoutePolicy{validRoute1, invalidRoute}, + wantErr: true, + }, + { + name: "single valid route", + orgID: "test-org", + routes: []*alertmanagertypes.RoutePolicy{validRoute1}, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if !tt.wantErr && len(tt.routes) > 0 { + routeStore.ExpectCreateBatch(tt.routes) + } + + err := provider.CreateRoutePolicies(ctx, tt.orgID, tt.routes) + + if tt.wantErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.NoError(t, routeStore.ExpectationsWereMet()) + } + }) + } +} diff --git a/pkg/alertmanager/service.go b/pkg/alertmanager/service.go index 163c673b7622..606dc72d9ddf 100644 --- a/pkg/alertmanager/service.go +++ b/pkg/alertmanager/service.go @@ -4,6 +4,9 @@ import ( "context" "sync" + "github.com/prometheus/alertmanager/featurecontrol" + "github.com/prometheus/alertmanager/matcher/compat" + "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/errors" @@ -61,6 +64,7 @@ func New( } func (service *Service) SyncServers(ctx context.Context) error { + compat.InitFromFlags(service.settings.Logger(), featurecontrol.NoopFlags{}) orgs, err := service.orgGetter.ListByOwnedKeyRange(ctx) if err != nil { return err @@ -142,7 +146,7 @@ func (service *Service) TestReceiver(ctx context.Context, orgID string, receiver return server.TestReceiver(ctx, receiver) } -func (service *Service) TestAlert(ctx 
context.Context, orgID string, alert *alertmanagertypes.PostableAlert, receivers []string) error { +func (service *Service) TestAlert(ctx context.Context, orgID string, receiversMap map[*alertmanagertypes.PostableAlert][]string, config *alertmanagertypes.NotificationConfig) error { service.serversMtx.RLock() defer service.serversMtx.RUnlock() @@ -151,7 +155,7 @@ func (service *Service) TestAlert(ctx context.Context, orgID string, alert *aler return err } - return server.TestAlert(ctx, alert, receivers) + return server.TestAlert(ctx, receiversMap, config) } func (service *Service) Stop(ctx context.Context) error { diff --git a/pkg/alertmanager/signozalertmanager/provider.go b/pkg/alertmanager/signozalertmanager/provider.go index a92c5ab4e89f..9eab5e4896a9 100644 --- a/pkg/alertmanager/signozalertmanager/provider.go +++ b/pkg/alertmanager/signozalertmanager/provider.go @@ -2,8 +2,12 @@ package signozalertmanager import ( "context" + "github.com/SigNoz/signoz/pkg/query-service/utils/labels" + "github.com/prometheus/common/model" "time" + amConfig "github.com/prometheus/alertmanager/config" + "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore" "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" @@ -11,7 +15,9 @@ import ( "github.com/SigNoz/signoz/pkg/factory" "github.com/SigNoz/signoz/pkg/modules/organization" "github.com/SigNoz/signoz/pkg/sqlstore" + "github.com/SigNoz/signoz/pkg/types" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + "github.com/SigNoz/signoz/pkg/types/authtypes" "github.com/SigNoz/signoz/pkg/valuer" ) @@ -94,8 +100,29 @@ func (provider *provider) TestReceiver(ctx context.Context, orgID string, receiv return provider.service.TestReceiver(ctx, orgID, receiver) } -func (provider *provider) TestAlert(ctx context.Context, orgID string, alert *alertmanagertypes.PostableAlert, receivers []string) error { - return provider.service.TestAlert(ctx, orgID, alert, receivers) +func (provider *provider) TestAlert(ctx context.Context, orgID string, ruleID string, receiversMap map[*alertmanagertypes.PostableAlert][]string) error { + config, err := provider.notificationManager.GetNotificationConfig(orgID, ruleID) + if err != nil { + return err + } + if config.UsePolicy { + for alert := range receiversMap { + set := make(model.LabelSet) + for k, v := range alert.Labels { + set[model.LabelName(k)] = model.LabelValue(v) + } + match, err := provider.notificationManager.Match(ctx, orgID, alert.Labels[labels.AlertRuleIdLabel], set) + if err != nil { + return err + } + if len(match) == 0 { + delete(receiversMap, alert) + } else { + receiversMap[alert] = match + } + } + } + return provider.service.TestAlert(ctx, orgID, receiversMap, config) } func (provider *provider) ListChannels(ctx context.Context, orgID string) ([]*alertmanagertypes.Channel, error) { @@ -211,3 +238,316 @@ func (provider *provider) DeleteNotificationConfig(ctx context.Context, orgID va } return nil } + +func (provider *provider) CreateRoutePolicy(ctx context.Context, routeRequest *alertmanagertypes.PostableRoutePolicy) (*alertmanagertypes.GettableRoutePolicy, error) { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return nil, err + } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return nil, err + } + + if err := routeRequest.Validate(); err != nil { + return nil, err + } + + route := alertmanagertypes.RoutePolicy{ + Expression: routeRequest.Expression, + ExpressionKind: routeRequest.ExpressionKind, + 
Name: routeRequest.Name, + Description: routeRequest.Description, + Enabled: true, + Tags: routeRequest.Tags, + Channels: routeRequest.Channels, + OrgID: claims.OrgID, + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + UserAuditable: types.UserAuditable{ + CreatedBy: claims.Email, + UpdatedBy: claims.Email, + }, + TimeAuditable: types.TimeAuditable{ + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + }, + } + + err = provider.notificationManager.CreateRoutePolicy(ctx, orgID.String(), &route) + if err != nil { + return nil, err + } + + return &alertmanagertypes.GettableRoutePolicy{ + PostableRoutePolicy: *routeRequest, + ID: route.ID.StringValue(), + CreatedAt: &route.CreatedAt, + UpdatedAt: &route.UpdatedAt, + CreatedBy: &route.CreatedBy, + UpdatedBy: &route.UpdatedBy, + }, nil +} + +func (provider *provider) CreateRoutePolicies(ctx context.Context, routeRequests []*alertmanagertypes.PostableRoutePolicy) ([]*alertmanagertypes.GettableRoutePolicy, error) { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return nil, err + } + + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return nil, err + } + + if len(routeRequests) == 0 { + return []*alertmanagertypes.GettableRoutePolicy{}, nil + } + + routes := make([]*alertmanagertypes.RoutePolicy, 0, len(routeRequests)) + results := make([]*alertmanagertypes.GettableRoutePolicy, 0, len(routeRequests)) + + for _, routeRequest := range routeRequests { + if err := routeRequest.Validate(); err != nil { + return nil, err + } + + route := &alertmanagertypes.RoutePolicy{ + Expression: routeRequest.Expression, + ExpressionKind: routeRequest.ExpressionKind, + Name: routeRequest.Name, + Description: routeRequest.Description, + Enabled: true, + Tags: routeRequest.Tags, + Channels: routeRequest.Channels, + OrgID: claims.OrgID, + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + UserAuditable: types.UserAuditable{ + CreatedBy: claims.Email, + UpdatedBy: claims.Email, + }, + TimeAuditable: types.TimeAuditable{ + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + }, + } + + routes = append(routes, route) + results = append(results, &alertmanagertypes.GettableRoutePolicy{ + PostableRoutePolicy: *routeRequest, + ID: route.ID.StringValue(), + CreatedAt: &route.CreatedAt, + UpdatedAt: &route.UpdatedAt, + CreatedBy: &route.CreatedBy, + UpdatedBy: &route.UpdatedBy, + }) + } + + err = provider.notificationManager.CreateRoutePolicies(ctx, orgID.String(), routes) + if err != nil { + return nil, err + } + + return results, nil +} + +func (provider *provider) GetRoutePolicyByID(ctx context.Context, routeID string) (*alertmanagertypes.GettableRoutePolicy, error) { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return nil, err + } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return nil, err + } + + route, err := provider.notificationManager.GetRoutePolicyByID(ctx, orgID.String(), routeID) + if err != nil { + return nil, err + } + + return &alertmanagertypes.GettableRoutePolicy{ + PostableRoutePolicy: alertmanagertypes.PostableRoutePolicy{ + Expression: route.Expression, + ExpressionKind: route.ExpressionKind, + Channels: route.Channels, + Name: route.Name, + Description: route.Description, + Tags: route.Tags, + }, + ID: route.ID.StringValue(), + CreatedAt: &route.CreatedAt, + UpdatedAt: &route.UpdatedAt, + CreatedBy: &route.CreatedBy, + UpdatedBy: &route.UpdatedBy, + }, nil +} + +func (provider *provider) GetAllRoutePolicies(ctx context.Context) 
([]*alertmanagertypes.GettableRoutePolicy, error) { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return nil, err + } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return nil, err + } + + routes, err := provider.notificationManager.GetAllRoutePolicies(ctx, orgID.String()) + if err != nil { + return nil, err + } + + results := make([]*alertmanagertypes.GettableRoutePolicy, 0, len(routes)) + for _, route := range routes { + results = append(results, &alertmanagertypes.GettableRoutePolicy{ + PostableRoutePolicy: alertmanagertypes.PostableRoutePolicy{ + Expression: route.Expression, + ExpressionKind: route.ExpressionKind, + Channels: route.Channels, + Name: route.Name, + Description: route.Description, + Tags: route.Tags, + }, + ID: route.ID.StringValue(), + CreatedAt: &route.CreatedAt, + UpdatedAt: &route.UpdatedAt, + CreatedBy: &route.CreatedBy, + UpdatedBy: &route.UpdatedBy, + }) + } + + return results, nil +} + +func (provider *provider) UpdateRoutePolicyByID(ctx context.Context, routeID string, route *alertmanagertypes.PostableRoutePolicy) (*alertmanagertypes.GettableRoutePolicy, error) { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return nil, errors.NewInvalidInputf(errors.CodeUnauthenticated, "invalid claims: %v", err) + } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return nil, err + } + + if routeID == "" { + return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "routeID cannot be empty") + } + + if route == nil { + return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "route cannot be nil") + } + + if err := route.Validate(); err != nil { + return nil, errors.NewInvalidInputf(errors.CodeInvalidInput, "invalid route: %v", err) + } + + existingRoute, err := provider.notificationManager.GetRoutePolicyByID(ctx, claims.OrgID, routeID) + if err != nil { + return nil, errors.NewInvalidInputf(errors.CodeNotFound, "route not found: %v", err) + } + + updatedRoute := &alertmanagertypes.RoutePolicy{ + Expression: route.Expression, + ExpressionKind: route.ExpressionKind, + Name: route.Name, + Description: route.Description, + Tags: route.Tags, + Channels: route.Channels, + OrgID: claims.OrgID, + Identifiable: existingRoute.Identifiable, + UserAuditable: types.UserAuditable{ + CreatedBy: existingRoute.CreatedBy, + UpdatedBy: claims.Email, + }, + TimeAuditable: types.TimeAuditable{ + CreatedAt: existingRoute.CreatedAt, + UpdatedAt: time.Now(), + }, + } + + err = provider.notificationManager.DeleteRoutePolicy(ctx, orgID.String(), routeID) + if err != nil { + return nil, errors.NewInvalidInputf(errors.CodeInternal, "error deleting existing route: %v", err) + } + + err = provider.notificationManager.CreateRoutePolicy(ctx, orgID.String(), updatedRoute) + if err != nil { + return nil, err + } + + return &alertmanagertypes.GettableRoutePolicy{ + PostableRoutePolicy: *route, + ID: updatedRoute.ID.StringValue(), + CreatedAt: &updatedRoute.CreatedAt, + UpdatedAt: &updatedRoute.UpdatedAt, + CreatedBy: &updatedRoute.CreatedBy, + UpdatedBy: &updatedRoute.UpdatedBy, + }, nil +} + +func (provider *provider) DeleteRoutePolicyByID(ctx context.Context, routeID string) error { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return errors.NewInvalidInputf(errors.CodeUnauthenticated, "invalid claims: %v", err) + } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return err + } + if routeID == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "routeID cannot be 
empty") + } + + return provider.notificationManager.DeleteRoutePolicy(ctx, orgID.String(), routeID) +} + +func (provider *provider) CreateInhibitRules(ctx context.Context, orgID valuer.UUID, rules []amConfig.InhibitRule) error { + config, err := provider.configStore.Get(ctx, orgID.String()) + if err != nil { + return err + } + + if err := config.AddInhibitRules(rules); err != nil { + return err + } + + return provider.configStore.Set(ctx, config) +} + +func (provider *provider) DeleteAllRoutePoliciesByRuleId(ctx context.Context, names string) error { + claims, err := authtypes.ClaimsFromContext(ctx) + if err != nil { + return errors.NewInvalidInputf(errors.CodeUnauthenticated, "invalid claims: %v", err) + } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return err + } + return provider.notificationManager.DeleteAllRoutePoliciesByName(ctx, orgID.String(), names) +} + +func (provider *provider) UpdateAllRoutePoliciesByRuleId(ctx context.Context, names string, routes []*alertmanagertypes.PostableRoutePolicy) error { + err := provider.DeleteAllRoutePoliciesByRuleId(ctx, names) + if err != nil { + return errors.NewInvalidInputf(errors.CodeInternal, "error deleting the routes: %v", err) + } + _, err = provider.CreateRoutePolicies(ctx, routes) + return err +} + +func (provider *provider) DeleteAllInhibitRulesByRuleId(ctx context.Context, orgID valuer.UUID, ruleId string) error { + config, err := provider.configStore.Get(ctx, orgID.String()) + if err != nil { + return err + } + + if err := config.DeleteRuleIDInhibitor(ruleId); err != nil { + return err + } + + return provider.configStore.Set(ctx, config) +} diff --git a/pkg/query-service/app/http_handler.go b/pkg/query-service/app/http_handler.go index a9016eef0e91..03785f7473c3 100644 --- a/pkg/query-service/app/http_handler.go +++ b/pkg/query-service/app/http_handler.go @@ -10,7 +10,6 @@ import ( "fmt" "github.com/SigNoz/signoz/pkg/modules/thirdpartyapi" - //qbtypes "github.com/SigNoz/signoz/pkg/types/querybuildertypes/querybuildertypesv5" "io" "math" "net/http" @@ -492,6 +491,12 @@ func (aH *APIHandler) RegisterRoutes(router *mux.Router, am *middleware.AuthZ) { router.HandleFunc("/api/v1/channels", am.EditAccess(aH.AlertmanagerAPI.CreateChannel)).Methods(http.MethodPost) router.HandleFunc("/api/v1/testChannel", am.EditAccess(aH.AlertmanagerAPI.TestReceiver)).Methods(http.MethodPost) + router.HandleFunc("/api/v1/route_policies", am.ViewAccess(aH.AlertmanagerAPI.GetAllRoutePolicies)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/route_policies/{id}", am.ViewAccess(aH.AlertmanagerAPI.GetRoutePolicyByID)).Methods(http.MethodGet) + router.HandleFunc("/api/v1/route_policies", am.AdminAccess(aH.AlertmanagerAPI.CreateRoutePolicy)).Methods(http.MethodPost) + router.HandleFunc("/api/v1/route_policies/{id}", am.AdminAccess(aH.AlertmanagerAPI.DeleteRoutePolicyByID)).Methods(http.MethodDelete) + router.HandleFunc("/api/v1/route_policies/{id}", am.AdminAccess(aH.AlertmanagerAPI.UpdateRoutePolicy)).Methods(http.MethodPut) + router.HandleFunc("/api/v1/alerts", am.ViewAccess(aH.AlertmanagerAPI.GetAlerts)).Methods(http.MethodGet) router.HandleFunc("/api/v1/rules", am.ViewAccess(aH.listRules)).Methods(http.MethodGet) @@ -616,6 +621,7 @@ func (aH *APIHandler) RegisterRoutes(router *mux.Router, am *middleware.AuthZ) { // Export router.HandleFunc("/api/v1/export_raw_data", am.ViewAccess(aH.Signoz.Handlers.RawDataExport.ExportRawData)).Methods(http.MethodGet) + } func (ah *APIHandler) MetricExplorerRoutes(router *mux.Router, am 
*middleware.AuthZ) { diff --git a/pkg/query-service/rules/base_rule.go b/pkg/query-service/rules/base_rule.go index a0ddcbf8444d..62669fad172b 100644 --- a/pkg/query-service/rules/base_rule.go +++ b/pkg/query-service/rules/base_rule.go @@ -4,13 +4,11 @@ import ( "context" "fmt" "log/slog" - "math" "net/url" "sync" "time" "github.com/SigNoz/signoz/pkg/errors" - "github.com/SigNoz/signoz/pkg/query-service/converter" "github.com/SigNoz/signoz/pkg/query-service/interfaces" "github.com/SigNoz/signoz/pkg/query-service/model" v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3" @@ -167,22 +165,6 @@ func NewBaseRule(id string, orgID valuer.UUID, p *ruletypes.PostableRule, reader return baseRule, nil } -func (r *BaseRule) targetVal() float64 { - if r.ruleCondition == nil || r.ruleCondition.Target == nil { - return 0 - } - - // get the converter for the target unit - unitConverter := converter.FromUnit(converter.Unit(r.ruleCondition.TargetUnit)) - // convert the target value to the y-axis unit - value := unitConverter.Convert(converter.Value{ - F: *r.ruleCondition.Target, - U: converter.Unit(r.ruleCondition.TargetUnit), - }, converter.Unit(r.Unit())) - - return value.F -} - func (r *BaseRule) matchType() ruletypes.MatchType { if r.ruleCondition == nil { return ruletypes.AtleastOnce @@ -221,10 +203,6 @@ func (r *BaseRule) HoldDuration() time.Duration { return r.holdDuration } -func (r *BaseRule) TargetVal() float64 { - return r.targetVal() -} - func (r *ThresholdRule) hostFromSource() string { parsedUrl, err := url.Parse(r.source) if err != nil { @@ -380,232 +358,6 @@ func (r *BaseRule) ForEachActiveAlert(f func(*ruletypes.Alert)) { } } -func (r *BaseRule) ShouldAlert(series v3.Series) (ruletypes.Sample, bool) { - var alertSmpl ruletypes.Sample - var shouldAlert bool - var lbls qslabels.Labels - - for name, value := range series.Labels { - lbls = append(lbls, qslabels.Label{Name: name, Value: value}) - } - - series.Points = removeGroupinSetPoints(series) - - // nothing to evaluate - if len(series.Points) == 0 { - return alertSmpl, false - } - - if r.ruleCondition.RequireMinPoints { - if len(series.Points) < r.ruleCondition.RequiredNumPoints { - zap.L().Info("not enough data points to evaluate series, skipping", zap.String("ruleid", r.ID()), zap.Int("numPoints", len(series.Points)), zap.Int("requiredPoints", r.ruleCondition.RequiredNumPoints)) - return alertSmpl, false - } - } - - switch r.matchType() { - case ruletypes.AtleastOnce: - // If any sample matches the condition, the rule is firing. 
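The per-match-type evaluation being removed from BaseRule here is not dropped; the same semantics now sit behind the threshold types in pkg/types/ruletypes (see the Threshold.ShouldAlert calls later in this patch). As a rough, self-contained sketch of those semantics only, with hypothetical names rather than the real ruletypes API:

```go
package main

import "fmt"

// fires is a simplified stand-in for the match-type logic shown above:
// the same series can fire or stay quiet depending on whether the target
// is compared against any point, the average of the points, or only the
// most recent point. Hypothetical helper, not the SigNoz implementation.
func fires(points []float64, target float64, matchType string) bool {
	if len(points) == 0 {
		return false
	}
	switch matchType {
	case "atleast_once": // any point above the target fires
		for _, p := range points {
			if p > target {
				return true
			}
		}
		return false
	case "on_average": // the mean of all points must exceed the target
		var sum float64
		for _, p := range points {
			sum += p
		}
		return sum/float64(len(points)) > target
	case "last": // only the newest point is compared
		return points[len(points)-1] > target
	}
	return false
}

func main() {
	points := []float64{0.2, 0.9, 0.3}
	fmt.Println(fires(points, 0.5, "atleast_once")) // true
	fmt.Println(fires(points, 0.5, "on_average"))   // false (mean is roughly 0.47)
	fmt.Println(fires(points, 0.5, "last"))         // false
}
```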
- if r.compareOp() == ruletypes.ValueIsAbove { - for _, smpl := range series.Points { - if smpl.Value > r.targetVal() { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - shouldAlert = true - break - } - } - } else if r.compareOp() == ruletypes.ValueIsBelow { - for _, smpl := range series.Points { - if smpl.Value < r.targetVal() { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - shouldAlert = true - break - } - } - } else if r.compareOp() == ruletypes.ValueIsEq { - for _, smpl := range series.Points { - if smpl.Value == r.targetVal() { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - shouldAlert = true - break - } - } - } else if r.compareOp() == ruletypes.ValueIsNotEq { - for _, smpl := range series.Points { - if smpl.Value != r.targetVal() { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - shouldAlert = true - break - } - } - } else if r.compareOp() == ruletypes.ValueOutsideBounds { - for _, smpl := range series.Points { - if math.Abs(smpl.Value) >= r.targetVal() { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - shouldAlert = true - break - } - } - } - case ruletypes.AllTheTimes: - // If all samples match the condition, the rule is firing. - shouldAlert = true - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: r.targetVal()}, Metric: lbls} - if r.compareOp() == ruletypes.ValueIsAbove { - for _, smpl := range series.Points { - if smpl.Value <= r.targetVal() { - shouldAlert = false - break - } - } - // use min value from the series - if shouldAlert { - var minValue float64 = math.Inf(1) - for _, smpl := range series.Points { - if smpl.Value < minValue { - minValue = smpl.Value - } - } - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: minValue}, Metric: lbls} - } - } else if r.compareOp() == ruletypes.ValueIsBelow { - for _, smpl := range series.Points { - if smpl.Value >= r.targetVal() { - shouldAlert = false - break - } - } - if shouldAlert { - var maxValue float64 = math.Inf(-1) - for _, smpl := range series.Points { - if smpl.Value > maxValue { - maxValue = smpl.Value - } - } - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: maxValue}, Metric: lbls} - } - } else if r.compareOp() == ruletypes.ValueIsEq { - for _, smpl := range series.Points { - if smpl.Value != r.targetVal() { - shouldAlert = false - break - } - } - } else if r.compareOp() == ruletypes.ValueIsNotEq { - for _, smpl := range series.Points { - if smpl.Value == r.targetVal() { - shouldAlert = false - break - } - } - // use any non-inf or nan value from the series - if shouldAlert { - for _, smpl := range series.Points { - if !math.IsInf(smpl.Value, 0) && !math.IsNaN(smpl.Value) { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - break - } - } - } - } else if r.compareOp() == ruletypes.ValueOutsideBounds { - for _, smpl := range series.Points { - if math.Abs(smpl.Value) < r.targetVal() { - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: smpl.Value}, Metric: lbls} - shouldAlert = false - break - } - } - } - case ruletypes.OnAverage: - // If the average of all samples matches the condition, the rule is firing. 
- var sum, count float64 - for _, smpl := range series.Points { - if math.IsNaN(smpl.Value) || math.IsInf(smpl.Value, 0) { - continue - } - sum += smpl.Value - count++ - } - avg := sum / count - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: avg}, Metric: lbls} - if r.compareOp() == ruletypes.ValueIsAbove { - if avg > r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsBelow { - if avg < r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsEq { - if avg == r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsNotEq { - if avg != r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueOutsideBounds { - if math.Abs(avg) >= r.targetVal() { - shouldAlert = true - } - } - case ruletypes.InTotal: - // If the sum of all samples matches the condition, the rule is firing. - var sum float64 - - for _, smpl := range series.Points { - if math.IsNaN(smpl.Value) || math.IsInf(smpl.Value, 0) { - continue - } - sum += smpl.Value - } - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: sum}, Metric: lbls} - if r.compareOp() == ruletypes.ValueIsAbove { - if sum > r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsBelow { - if sum < r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsEq { - if sum == r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsNotEq { - if sum != r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueOutsideBounds { - if math.Abs(sum) >= r.targetVal() { - shouldAlert = true - } - } - case ruletypes.Last: - // If the last sample matches the condition, the rule is firing. 
- shouldAlert = false - alertSmpl = ruletypes.Sample{Point: ruletypes.Point{V: series.Points[len(series.Points)-1].Value}, Metric: lbls} - if r.compareOp() == ruletypes.ValueIsAbove { - if series.Points[len(series.Points)-1].Value > r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsBelow { - if series.Points[len(series.Points)-1].Value < r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsEq { - if series.Points[len(series.Points)-1].Value == r.targetVal() { - shouldAlert = true - } - } else if r.compareOp() == ruletypes.ValueIsNotEq { - if series.Points[len(series.Points)-1].Value != r.targetVal() { - shouldAlert = true - } - } - } - return alertSmpl, shouldAlert -} - func (r *BaseRule) RecordRuleStateHistory(ctx context.Context, prevState, currentState model.AlertState, itemsToAdd []model.RuleStateHistory) error { zap.L().Debug("recording rule state history", zap.String("ruleid", r.ID()), zap.Any("prevState", prevState), zap.Any("currentState", currentState), zap.Any("itemsToAdd", itemsToAdd)) revisedItemsToAdd := map[uint64]model.RuleStateHistory{} diff --git a/pkg/query-service/rules/base_rule_test.go b/pkg/query-service/rules/base_rule_test.go index 33e14b4ea255..8391ded1fcdf 100644 --- a/pkg/query-service/rules/base_rule_test.go +++ b/pkg/query-service/rules/base_rule_test.go @@ -1,6 +1,7 @@ package rules import ( + "github.com/stretchr/testify/require" "testing" v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3" @@ -22,6 +23,15 @@ func TestBaseRule_RequireMinPoints(t *testing.T) { RequireMinPoints: true, RequiredNumPoints: 4, }, + + Threshold: ruletypes.BasicRuleThresholds{ + { + Name: "test-threshold", + TargetValue: &threshold, + CompareOp: ruletypes.ValueIsAbove, + MatchType: ruletypes.AtleastOnce, + }, + }, }, series: &v3.Series{ Points: []v3.Point{ @@ -41,6 +51,14 @@ func TestBaseRule_RequireMinPoints(t *testing.T) { MatchType: ruletypes.AtleastOnce, Target: &threshold, }, + Threshold: ruletypes.BasicRuleThresholds{ + { + Name: "test-threshold", + TargetValue: &threshold, + CompareOp: ruletypes.ValueIsAbove, + MatchType: ruletypes.AtleastOnce, + }, + }, }, series: &v3.Series{ Points: []v3.Point{ @@ -56,10 +74,9 @@ func TestBaseRule_RequireMinPoints(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - _, shouldAlert := test.rule.ShouldAlert(*test.series) - if shouldAlert != test.shouldAlert { - t.Errorf("expected shouldAlert to be %v, got %v", test.shouldAlert, shouldAlert) - } + _, err := test.rule.Threshold.ShouldAlert(*test.series, "") + require.NoError(t, err) + require.Equal(t, len(test.series.Points) >= test.rule.ruleCondition.RequiredNumPoints, test.shouldAlert) }) } } diff --git a/pkg/query-service/rules/manager.go b/pkg/query-service/rules/manager.go index 8263b03d5bbd..a935aa259002 100644 --- a/pkg/query-service/rules/manager.go +++ b/pkg/query-service/rules/manager.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "github.com/SigNoz/signoz/pkg/query-service/utils/labels" "log/slog" "sort" "strings" @@ -350,39 +351,35 @@ func (m *Manager) EditRule(ctx context.Context, ruleStr string, id valuer.UUID) existingRule.Data = ruleStr return m.ruleStore.EditRule(ctx, existingRule, func(ctx context.Context) error { - cfg, err := m.alertmanager.GetConfig(ctx, claims.OrgID) - if err != nil { - return err - } - - var preferredChannels []string - if len(parsedRule.PreferredChannels) == 0 { - channels, err := m.alertmanager.ListChannels(ctx, claims.OrgID) - if 
err != nil { - return err - } - - for _, channel := range channels { - preferredChannels = append(preferredChannels, channel.Name) - } - } else { - preferredChannels = parsedRule.PreferredChannels - } - err = cfg.UpdateRuleIDMatcher(id.StringValue(), preferredChannels) - if err != nil { - return err - } if parsedRule.NotificationSettings != nil { config := parsedRule.NotificationSettings.GetAlertManagerNotificationConfig() - err = m.alertmanager.SetNotificationConfig(ctx, orgID, existingRule.ID.StringValue(), &config) + err = m.alertmanager.SetNotificationConfig(ctx, orgID, id.StringValue(), &config) if err != nil { return err } - } + if !parsedRule.NotificationSettings.UsePolicy { + request, err := parsedRule.GetRuleRouteRequest(id.StringValue()) + if err != nil { + return err + } + err = m.alertmanager.UpdateAllRoutePoliciesByRuleId(ctx, id.StringValue(), request) + if err != nil { + return err + } + err = m.alertmanager.DeleteAllInhibitRulesByRuleId(ctx, orgID, id.StringValue()) + if err != nil { + return err + } - err = m.alertmanager.SetConfig(ctx, cfg) - if err != nil { - return err + inhibitRules, err := parsedRule.GetInhibitRules(id.StringValue()) + if err != nil { + return err + } + err = m.alertmanager.CreateInhibitRules(ctx, orgID, inhibitRules) + if err != nil { + return err + } + } } err = m.syncRuleStateWithTask(ctx, orgID, prepareTaskName(existingRule.ID.StringValue()), &parsedRule) if err != nil { @@ -488,6 +485,19 @@ func (m *Manager) DeleteRule(ctx context.Context, idStr string) error { } err = m.alertmanager.DeleteNotificationConfig(ctx, orgID, id.String()) + if err != nil { + return err + } + + err = m.alertmanager.DeleteAllRoutePoliciesByRuleId(ctx, id.String()) + if err != nil { + return err + } + + err = m.alertmanager.DeleteAllInhibitRulesByRuleId(ctx, orgID, id.String()) + if err != nil { + return err + } taskName := prepareTaskName(id.StringValue()) m.deleteTask(taskName) @@ -548,41 +558,30 @@ func (m *Manager) CreateRule(ctx context.Context, ruleStr string) (*ruletypes.Ge } id, err := m.ruleStore.CreateRule(ctx, storedRule, func(ctx context.Context, id valuer.UUID) error { - cfg, err := m.alertmanager.GetConfig(ctx, claims.OrgID) - if err != nil { - return err - } - - var preferredChannels []string - if len(parsedRule.PreferredChannels) == 0 { - channels, err := m.alertmanager.ListChannels(ctx, claims.OrgID) - if err != nil { - return err - } - - for _, channel := range channels { - preferredChannels = append(preferredChannels, channel.Name) - } - } else { - preferredChannels = parsedRule.PreferredChannels - } - if parsedRule.NotificationSettings != nil { config := parsedRule.NotificationSettings.GetAlertManagerNotificationConfig() - err = m.alertmanager.SetNotificationConfig(ctx, orgID, storedRule.ID.StringValue(), &config) + err = m.alertmanager.SetNotificationConfig(ctx, orgID, id.StringValue(), &config) if err != nil { return err } - } - - err = cfg.CreateRuleIDMatcher(id.StringValue(), preferredChannels) - if err != nil { - return err - } - - err = m.alertmanager.SetConfig(ctx, cfg) - if err != nil { - return err + if !parsedRule.NotificationSettings.UsePolicy { + request, err := parsedRule.GetRuleRouteRequest(id.StringValue()) + if err != nil { + return err + } + _, err = m.alertmanager.CreateRoutePolicies(ctx, request) + if err != nil { + return err + } + inhibitRules, err := parsedRule.GetInhibitRules(id.StringValue()) + if err != nil { + return err + } + err = m.alertmanager.CreateInhibitRules(ctx, orgID, inhibitRules) + if err != nil { + return err + } 
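At notification time, the route policies created above are matched against alert labels by the provider's Match/evaluateExpr added earlier in this patch. A minimal standalone sketch of that evaluation, assuming the expr-lang/expr module path for the expr package used there; dotted label keys such as threshold.name are exposed as nested maps, mirroring evaluateExpr:

```go
package main

import (
	"fmt"
	"strings"

	"github.com/expr-lang/expr" // assumed module path for the expr package
)

// evaluate compiles and runs a route expression against a flat label set,
// turning dotted keys ("threshold.name") into nested maps so that
// `threshold.name == "critical"` reads naturally in the expression.
func evaluate(expression string, labels map[string]string) (bool, error) {
	env := map[string]interface{}{}
	for k, v := range labels {
		parts := strings.Split(k, ".")
		current := env
		for i, part := range parts {
			if i == len(parts)-1 {
				current[part] = v
				break
			}
			next, ok := current[part].(map[string]interface{})
			if !ok {
				next = map[string]interface{}{}
				current[part] = next
			}
			current = next
		}
	}

	program, err := expr.Compile(expression, expr.Env(env))
	if err != nil {
		return false, err
	}
	out, err := expr.Run(program, env)
	if err != nil {
		return false, err
	}
	matched, ok := out.(bool)
	return ok && matched, nil
}

func main() {
	labels := map[string]string{
		"threshold.name": "critical",
		"ruleId":         "rule-123",
		"service":        "auth",
	}
	ok, err := evaluate(`threshold.name == "critical" && service in ["auth", "payment"]`, labels)
	fmt.Println(ok, err) // true <nil>
}
```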
+ } } taskName := prepareTaskName(id.StringValue()) @@ -756,36 +755,30 @@ func (m *Manager) prepareTestNotifyFunc() NotifyFunc { if len(alerts) == 0 { return } + ruleID := alerts[0].Labels.Map()[labels.AlertRuleIdLabel] + receiverMap := make(map[*alertmanagertypes.PostableAlert][]string) + for _, alert := range alerts { + generatorURL := alert.GeneratorURL - alert := alerts[0] - generatorURL := alert.GeneratorURL - - a := &alertmanagertypes.PostableAlert{} - a.Annotations = alert.Annotations.Map() - a.StartsAt = strfmt.DateTime(alert.FiredAt) - a.Alert = alertmanagertypes.AlertModel{ - Labels: alert.Labels.Map(), - GeneratorURL: strfmt.URI(generatorURL), - } - if !alert.ResolvedAt.IsZero() { - a.EndsAt = strfmt.DateTime(alert.ResolvedAt) - } else { - a.EndsAt = strfmt.DateTime(alert.ValidUntil) - } - - if len(alert.Receivers) == 0 { - channels, err := m.alertmanager.ListChannels(ctx, orgID) - if err != nil { - zap.L().Error("failed to list channels while sending test notification", zap.Error(err)) - return + a := &alertmanagertypes.PostableAlert{} + a.Annotations = alert.Annotations.Map() + a.StartsAt = strfmt.DateTime(alert.FiredAt) + a.Alert = alertmanagertypes.AlertModel{ + Labels: alert.Labels.Map(), + GeneratorURL: strfmt.URI(generatorURL), } - - for _, channel := range channels { - alert.Receivers = append(alert.Receivers, channel.Name) + if !alert.ResolvedAt.IsZero() { + a.EndsAt = strfmt.DateTime(alert.ResolvedAt) + } else { + a.EndsAt = strfmt.DateTime(alert.ValidUntil) } + receiverMap[a] = alert.Receivers + } + err := m.alertmanager.TestAlert(ctx, orgID, ruleID, receiverMap) + if err != nil { + zap.L().Error("failed to send test notification", zap.Error(err)) + return } - - m.alertmanager.TestAlert(ctx, orgID, a, alert.Receivers) } } @@ -983,6 +976,17 @@ func (m *Manager) TestNotification(ctx context.Context, orgID valuer.UUID, ruleS if err != nil { return 0, model.BadRequest(err) } + if !parsedRule.NotificationSettings.UsePolicy { + parsedRule.NotificationSettings.GroupBy = append(parsedRule.NotificationSettings.GroupBy, ruletypes.LabelThresholdName) + } + config := parsedRule.NotificationSettings.GetAlertManagerNotificationConfig() + err = m.alertmanager.SetNotificationConfig(ctx, orgID, parsedRule.AlertName, &config) + if err != nil { + return 0, &model.ApiError{ + Typ: model.ErrorBadData, + Err: err, + } + } alertCount, apiErr := m.prepareTestRuleFunc(PrepareTestRuleOptions{ Rule: &parsedRule, diff --git a/pkg/query-service/rules/manager_test.go b/pkg/query-service/rules/manager_test.go index 2702a93153ea..90781e1547d6 100644 --- a/pkg/query-service/rules/manager_test.go +++ b/pkg/query-service/rules/manager_test.go @@ -2,10 +2,15 @@ package rules import ( "context" + "fmt" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/nfroutingstoretest" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification" + "github.com/prometheus/common/model" + "strings" "testing" "time" - "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/stretchr/testify/assert" "go.uber.org/zap" @@ -32,19 +37,38 @@ func TestManager_PatchRule_PayloadVariations(t *testing.T) { Email: "test@example.com", Role: "admin", } - manager, mockSQLRuleStore, orgId := setupTestManager(t) + manager, mockSQLRuleStore, mockRouteStore, nfmanager, orgId := setupTestManager(t) claims.OrgID = orgId testCases := []struct { name string originalData string patchData string + Route 
[]*alertmanagertypes.RoutePolicy + Config *alertmanagertypes.NotificationConfig expectedResult func(*ruletypes.GettableRule) bool expectError bool description string }{ { name: "patch complete rule with task sync validation", + Route: []*alertmanagertypes.RoutePolicy{ + { + Expression: fmt.Sprintf("ruleId == \"{{.ruleId}}\" && threshold.name == \"warning\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "{{.ruleId}}", + Enabled: true, + }, + }, + Config: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{model.LabelName("ruleId"): {}}, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 4 * time.Hour, + NoDataInterval: 4 * time.Hour, + }, + UsePolicy: false, + }, originalData: `{ "schemaVersion":"v1", "alert": "test-original-alert", @@ -95,6 +119,23 @@ func TestManager_PatchRule_PayloadVariations(t *testing.T) { }, { name: "patch rule to disabled state", + Route: []*alertmanagertypes.RoutePolicy{ + { + Expression: fmt.Sprintf("ruleId == \"{{.ruleId}}\" && threshold.name == \"warning\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "{{.ruleId}}", + Enabled: true, + }, + }, + Config: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{model.LabelName("ruleId"): {}}, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 4 * time.Hour, + NoDataInterval: 4 * time.Hour, + }, + UsePolicy: false, + }, originalData: `{ "schemaVersion":"v2", "alert": "test-disable-alert", @@ -179,6 +220,20 @@ func TestManager_PatchRule_PayloadVariations(t *testing.T) { OrgID: claims.OrgID, } + // Update route expectations with actual rule ID + routesWithRuleID := make([]*alertmanagertypes.RoutePolicy, len(tc.Route)) + for i, route := range tc.Route { + routesWithRuleID[i] = &alertmanagertypes.RoutePolicy{ + Expression: strings.Replace(route.Expression, "{{.ruleId}}", ruleID.String(), -1), + ExpressionKind: route.ExpressionKind, + Channels: route.Channels, + Name: strings.Replace(route.Name, "{{.ruleId}}", ruleID.String(), -1), + Enabled: route.Enabled, + } + } + + mockRouteStore.ExpectDeleteRouteByName(existingRule.OrgID, ruleID.String()) + mockRouteStore.ExpectCreateBatch(routesWithRuleID) mockSQLRuleStore.ExpectGetStoredRule(ruleID, existingRule) mockSQLRuleStore.ExpectEditRule(existingRule) @@ -200,6 +255,12 @@ func TestManager_PatchRule_PayloadVariations(t *testing.T) { assert.Nil(t, findTaskByName(manager.RuleTasks(), taskName), "Task should be removed for disabled rule") } else { syncCompleted := waitForTaskSync(manager, taskName, true, 2*time.Second) + + // Verify notification config + config, err := nfmanager.GetNotificationConfig(orgId, result.Id) + assert.NoError(t, err) + assert.Equal(t, tc.Config, config) + assert.True(t, syncCompleted, "Task synchronization should complete within timeout") assert.NotNil(t, findTaskByName(manager.RuleTasks(), taskName), "Task should be created/updated for enabled rule") assert.Greater(t, len(manager.Rules()), 0, "Rules should be updated in manager") @@ -234,7 +295,7 @@ func findTaskByName(tasks []Task, taskName string) Task { return nil } -func setupTestManager(t *testing.T) (*Manager, *rulestoretest.MockSQLRuleStore, string) { +func setupTestManager(t *testing.T) (*Manager, *rulestoretest.MockSQLRuleStore, *nfroutingstoretest.MockSQLRouteStore, nfmanager.NotificationManager, string) { settings := instrumentationtest.New().ToProviderSettings() 
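These test expectations encode the convention used for rule-scoped routing: when a rule does not opt into policy-based routing, one route is created per threshold, named after the rule ID and matching on ruleId plus threshold.name. A small illustrative sketch of that shape, using local stand-in types rather than the actual alertmanagertypes structs:

```go
package main

import "fmt"

// sketchRoute is a stand-in for a rule-based route policy; the real type
// carries more fields (expression kind, org ID, audit fields, and so on).
type sketchRoute struct {
	Name       string
	Expression string
	Channels   []string
}

// buildRuleRoutes builds one route per threshold, named by rule ID so that
// all routes for a rule can later be deleted together by name.
func buildRuleRoutes(ruleID string, thresholdChannels map[string][]string) []sketchRoute {
	routes := make([]sketchRoute, 0, len(thresholdChannels))
	for threshold, channels := range thresholdChannels {
		routes = append(routes, sketchRoute{
			Name:       ruleID,
			Expression: fmt.Sprintf("ruleId == %q && threshold.name == %q", ruleID, threshold),
			Channels:   channels,
		})
	}
	return routes
}

func main() {
	for _, r := range buildRuleRoutes("rule-123", map[string][]string{
		"critical": {"pagerduty"},
		"warning":  {"slack"},
	}) {
		fmt.Printf("%s -> %s %v\n", r.Name, r.Expression, r.Channels)
	}
}
```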
testDB := utils.NewQueryServiceDBForTests(t) @@ -266,7 +327,11 @@ func setupTestManager(t *testing.T) (*Manager, *rulestoretest.MockSQLRuleStore, t.Fatalf("Failed to create noop sharder: %v", err) } orgGetter := implorganization.NewGetter(implorganization.NewStore(testDB), noopSharder) - notificationManager := nfmanagertest.NewMock() + routeStore := nfroutingstoretest.NewMockSQLRouteStore() + notificationManager, err := rulebasednotification.New(t.Context(), settings, nfmanager.Config{}, routeStore) + if err != nil { + t.Fatalf("Failed to create alert manager: %v", err) + } alertManager, err := signozalertmanager.New(context.TODO(), settings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter, notificationManager) if err != nil { t.Fatalf("Failed to create alert manager: %v", err) @@ -290,21 +355,40 @@ func setupTestManager(t *testing.T) (*Manager, *rulestoretest.MockSQLRuleStore, } close(manager.block) - return manager, mockSQLRuleStore, testOrgID.StringValue() + return manager, mockSQLRuleStore, routeStore, notificationManager, testOrgID.StringValue() } func TestCreateRule(t *testing.T) { claims := &authtypes.Claims{ Email: "test@example.com", } - manager, mockSQLRuleStore, orgId := setupTestManager(t) + manager, mockSQLRuleStore, mockRouteStore, nfmanager, orgId := setupTestManager(t) claims.OrgID = orgId testCases := []struct { name string + Route []*alertmanagertypes.RoutePolicy + Config *alertmanagertypes.NotificationConfig ruleStr string }{ { name: "validate stored rule data structure", + Route: []*alertmanagertypes.RoutePolicy{ + { + Expression: fmt.Sprintf("ruleId == \"{{.ruleId}}\" && threshold.name == \"warning\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "{{.ruleId}}", + Enabled: true, + }, + }, + Config: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{model.LabelName("ruleId"): {}}, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 4 * time.Hour, + NoDataInterval: 4 * time.Hour, + }, + UsePolicy: false, + }, ruleStr: `{ "alert": "cpu usage", "ruleType": "threshold_rule", @@ -341,6 +425,30 @@ func TestCreateRule(t *testing.T) { }, { name: "create complete v2 rule with thresholds", + Route: []*alertmanagertypes.RoutePolicy{ + { + Expression: fmt.Sprintf("ruleId == \"{{.ruleId}}\" && threshold.name == \"critical\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "{{.ruleId}}", + Enabled: true, + }, + { + Expression: fmt.Sprintf("ruleId == \"{{.ruleId}}\" && threshold.name == \"warning\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "{{.ruleId}}", + Enabled: true, + }, + }, + Config: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{model.LabelName("k8s.node.name"): {}, model.LabelName("ruleId"): {}}, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 10 * time.Minute, + NoDataInterval: 4 * time.Hour, + }, + UsePolicy: false, + }, ruleStr: `{ "schemaVersion":"v2", "state": "firing", @@ -399,6 +507,18 @@ func TestCreateRule(t *testing.T) { "frequency": "1m" } }, + "notificationSettings": { + "GroupBy": [ + "k8s.node.name" + ], + "renotify": { + "interval": "10m", + "enabled": true, + "alertStates": [ + "firing" + ] + } + }, "labels": { "severity": "warning" }, @@ -429,6 +549,20 @@ 
func TestCreateRule(t *testing.T) { }, OrgID: claims.OrgID, } + + // Update route expectations with actual rule ID + routesWithRuleID := make([]*alertmanagertypes.RoutePolicy, len(tc.Route)) + for i, route := range tc.Route { + routesWithRuleID[i] = &alertmanagertypes.RoutePolicy{ + Expression: strings.Replace(route.Expression, "{{.ruleId}}", rule.ID.String(), -1), + ExpressionKind: route.ExpressionKind, + Channels: route.Channels, + Name: strings.Replace(route.Name, "{{.ruleId}}", rule.ID.String(), -1), + Enabled: route.Enabled, + } + } + + mockRouteStore.ExpectCreateBatch(routesWithRuleID) mockSQLRuleStore.ExpectCreateRule(rule) ctx := authtypes.NewContextWithClaims(context.Background(), *claims) @@ -441,6 +575,12 @@ func TestCreateRule(t *testing.T) { // Wait for task creation with proper synchronization taskName := prepareTaskName(result.Id) syncCompleted := waitForTaskSync(manager, taskName, true, 2*time.Second) + + // Verify notification config + config, err := nfmanager.GetNotificationConfig(orgId, result.Id) + assert.NoError(t, err) + assert.Equal(t, tc.Config, config) + assert.True(t, syncCompleted, "Task creation should complete within timeout") assert.NotNil(t, findTaskByName(manager.RuleTasks(), taskName), "Task should be created with correct name") assert.Greater(t, len(manager.Rules()), 0, "Rules should be added to manager") @@ -455,14 +595,35 @@ func TestEditRule(t *testing.T) { claims := &authtypes.Claims{ Email: "test@example.com", } - manager, mockSQLRuleStore, orgId := setupTestManager(t) + manager, mockSQLRuleStore, mockRouteStore, nfmanager, orgId := setupTestManager(t) claims.OrgID = orgId testCases := []struct { + ruleID string name string + Route []*alertmanagertypes.RoutePolicy + Config *alertmanagertypes.NotificationConfig ruleStr string }{ { - name: "validate edit rule functionality", + ruleID: "12345678-1234-1234-1234-123456789012", + name: "validate edit rule functionality", + Route: []*alertmanagertypes.RoutePolicy{ + { + Expression: fmt.Sprintf("ruleId == \"rule1\" && threshold.name == \"critical\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"critical-alerts"}, + Name: "12345678-1234-1234-1234-123456789012", + Enabled: true, + }, + }, + Config: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{model.LabelName("ruleId"): {}}, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 4 * time.Hour, + NoDataInterval: 4 * time.Hour, + }, + UsePolicy: false, + }, ruleStr: `{ "alert": "updated cpu usage", "ruleType": "threshold_rule", @@ -498,7 +659,32 @@ func TestEditRule(t *testing.T) { }`, }, { - name: "edit complete v2 rule with thresholds", + ruleID: "12345678-1234-1234-1234-123456789013", + name: "edit complete v2 rule with thresholds", + Route: []*alertmanagertypes.RoutePolicy{ + { + Expression: fmt.Sprintf("ruleId == \"rule2\" && threshold.name == \"critical\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "12345678-1234-1234-1234-123456789013", + Enabled: true, + }, + { + Expression: fmt.Sprintf("ruleId == \"rule2\" && threshold.name == \"warning\""), + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: []string{"test-alerts"}, + Name: "12345678-1234-1234-1234-123456789013", + Enabled: true, + }, + }, + Config: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{model.LabelName("ruleId"): {}, model.LabelName("k8s.node.name"): {}}, + Renotify: 
alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 10 * time.Minute, + NoDataInterval: 4 * time.Hour, + }, + UsePolicy: false, + }, ruleStr: `{ "schemaVersion":"v2", "state": "firing", @@ -560,6 +746,18 @@ func TestEditRule(t *testing.T) { "labels": { "severity": "critical" }, + "notificationSettings": { + "GroupBy": [ + "k8s.node.name" + ], + "renotify": { + "interval": "10m", + "enabled": true, + "alertStates": [ + "firing" + ] + } + }, "annotations": { "description": "This alert is fired when memory usage crosses the threshold", "summary": "Memory usage threshold exceeded" @@ -573,11 +771,13 @@ func TestEditRule(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - ruleID := valuer.GenerateUUID() - + ruleId, err := valuer.NewUUID(tc.ruleID) + if err != nil { + t.Errorf("error creating ruleId: %s", err) + } existingRule := &ruletypes.Rule{ Identifiable: types.Identifiable{ - ID: ruleID, + ID: ruleId, }, TimeAuditable: types.TimeAuditable{ CreatedAt: time.Now(), @@ -590,18 +790,24 @@ func TestEditRule(t *testing.T) { Data: `{"alert": "original cpu usage", "disabled": false}`, OrgID: claims.OrgID, } - - mockSQLRuleStore.ExpectGetStoredRule(ruleID, existingRule) + mockRouteStore.ExpectDeleteRouteByName(existingRule.OrgID, ruleId.String()) + mockRouteStore.ExpectCreateBatch(tc.Route) + mockSQLRuleStore.ExpectGetStoredRule(ruleId, existingRule) mockSQLRuleStore.ExpectEditRule(existingRule) ctx := authtypes.NewContextWithClaims(context.Background(), *claims) - err := manager.EditRule(ctx, tc.ruleStr, ruleID) + err = manager.EditRule(ctx, tc.ruleStr, ruleId) assert.NoError(t, err) // Wait for task update with proper synchronization - taskName := prepareTaskName(ruleID.StringValue()) + + taskName := prepareTaskName(ruleId.String()) syncCompleted := waitForTaskSync(manager, taskName, true, 2*time.Second) + + config, err := nfmanager.GetNotificationConfig(orgId, ruleId.String()) + assert.NoError(t, err) + assert.Equal(t, tc.Config, config) assert.True(t, syncCompleted, "Task update should complete within timeout") assert.NotNil(t, findTaskByName(manager.RuleTasks(), taskName), "Task should be updated with correct name") assert.Greater(t, len(manager.Rules()), 0, "Rules should be updated in manager") diff --git a/pkg/query-service/rules/prom_rule.go b/pkg/query-service/rules/prom_rule.go index 773c86a2368b..a880b98d4c9a 100644 --- a/pkg/query-service/rules/prom_rule.go +++ b/pkg/query-service/rules/prom_rule.go @@ -147,13 +147,19 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error) var alerts = make(map[uint64]*ruletypes.Alert, len(res)) + ruleReceivers := r.Threshold.GetRuleReceivers() + ruleReceiverMap := make(map[string][]string) + for _, value := range ruleReceivers { + ruleReceiverMap[value.Name] = value.Channels + } + for _, series := range res { if len(series.Floats) == 0 { continue } - results, err := r.Threshold.ShouldAlert(toCommonSeries(series)) + results, err := r.Threshold.ShouldAlert(toCommonSeries(series), r.Unit()) if err != nil { return nil, err } @@ -165,7 +171,7 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error) } r.logger.DebugContext(ctx, "alerting for series", "rule_name", r.Name(), "series", series) - threshold := valueFormatter.Format(r.targetVal(), r.Unit()) + threshold := valueFormatter.Format(result.Target, result.TargetUnit) tmplData := ruletypes.AlertTemplateData(l, valueFormatter.Format(result.V, r.Unit()), threshold) // Inject some convenience variables that are 
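// --- editor's sketch (not part of the patch): the per-threshold receiver resolution that the
// Eval changes below perform. GetRuleReceivers returns one entry per threshold (name -> channels),
// and the threshold that actually fired is recovered from the "threshold.name" label that
// shouldAlert stamps onto the sample. The helper name is hypothetical; imports assumed are
// "github.com/SigNoz/signoz/pkg/types/ruletypes" and the rule's labels package.
//
// func receiversForSample(th ruletypes.RuleThreshold, lbs labels.Labels) []string {
// 	byName := make(map[string][]string)
// 	for _, rcv := range th.GetRuleReceivers() {
// 		byName[rcv.Name] = rcv.Channels
// 	}
// 	// falls back to nil (no receivers) when the label is missing
// 	return byName[lbs.Map()[ruletypes.LabelThresholdName]]
// }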
easier to remember for users @@ -218,7 +224,6 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error) r.lastError = err return nil, err } - alerts[h] = &ruletypes.Alert{ Labels: lbs, QueryResultLables: resultLabels, @@ -227,13 +232,12 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error) State: model.StatePending, Value: result.V, GeneratorURL: r.GeneratorURL(), - Receivers: r.preferredChannels, + Receivers: ruleReceiverMap[lbs.Map()[ruletypes.LabelThresholdName]], } } } r.logger.InfoContext(ctx, "number of alerts found", "rule_name", r.Name(), "alerts_count", len(alerts)) - // alerts[h] is ready, add or update active list now for h, a := range alerts { // Check whether we already have alerting state for the identifying label set. @@ -241,7 +245,9 @@ func (r *PromRule) Eval(ctx context.Context, ts time.Time) (interface{}, error) if alert, ok := r.Active[h]; ok && alert.State != model.StateInactive { alert.Value = a.Value alert.Annotations = a.Annotations - alert.Receivers = r.preferredChannels + if v, ok := alert.Labels.Map()[ruletypes.LabelThresholdName]; ok { + alert.Receivers = ruleReceiverMap[v] + } continue } diff --git a/pkg/query-service/rules/promrule_test.go b/pkg/query-service/rules/promrule_test.go index 17177de622c9..ef0dbcab32f3 100644 --- a/pkg/query-service/rules/promrule_test.go +++ b/pkg/query-service/rules/promrule_test.go @@ -696,7 +696,7 @@ func TestPromRuleShouldAlert(t *testing.T) { assert.NoError(t, err) } - resultVectors, err := rule.Threshold.ShouldAlert(toCommonSeries(c.values)) + resultVectors, err := rule.Threshold.ShouldAlert(toCommonSeries(c.values), rule.Unit()) assert.NoError(t, err) // Compare full result vector with expected vector diff --git a/pkg/query-service/rules/test_notification.go b/pkg/query-service/rules/test_notification.go index f2a6420a4240..5f72136b99e2 100644 --- a/pkg/query-service/rules/test_notification.go +++ b/pkg/query-service/rules/test_notification.go @@ -38,7 +38,6 @@ func defaultTestNotification(opts PrepareTestRuleOptions) (int, *model.ApiError) if parsedRule.RuleType == ruletypes.RuleTypeThreshold { // add special labels for test alerts - parsedRule.Annotations[labels.AlertSummaryLabel] = fmt.Sprintf("The rule threshold is set to %.4f, and the observed metric value is {{$value}}.", *parsedRule.RuleCondition.Target) parsedRule.Labels[labels.RuleSourceLabel] = "" parsedRule.Labels[labels.AlertRuleIdLabel] = "" diff --git a/pkg/query-service/rules/threshold_rule.go b/pkg/query-service/rules/threshold_rule.go index 536ee1cf2f0f..e881ba4fb184 100644 --- a/pkg/query-service/rules/threshold_rule.go +++ b/pkg/query-service/rules/threshold_rule.go @@ -488,7 +488,7 @@ func (r *ThresholdRule) buildAndRunQuery(ctx context.Context, orgID valuer.UUID, continue } } - resultSeries, err := r.Threshold.ShouldAlert(*series) + resultSeries, err := r.Threshold.ShouldAlert(*series, r.Unit()) if err != nil { return nil, err } @@ -565,7 +565,7 @@ func (r *ThresholdRule) buildAndRunQueryV5(ctx context.Context, orgID valuer.UUI continue } } - resultSeries, err := r.Threshold.ShouldAlert(*series) + resultSeries, err := r.Threshold.ShouldAlert(*series, r.Unit()) if err != nil { return nil, err } @@ -602,6 +602,12 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er resultFPs := map[uint64]struct{}{} var alerts = make(map[uint64]*ruletypes.Alert, len(res)) + ruleReceivers := r.Threshold.GetRuleReceivers() + ruleReceiverMap := make(map[string][]string) + for _, value := 
range ruleReceivers { + ruleReceiverMap[value.Name] = value.Channels + } + for _, smpl := range res { l := make(map[string]string, len(smpl.Metric)) for _, lbl := range smpl.Metric { @@ -610,7 +616,7 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er value := valueFormatter.Format(smpl.V, r.Unit()) //todo(aniket): handle different threshold - threshold := valueFormatter.Format(r.targetVal(), r.Unit()) + threshold := valueFormatter.Format(smpl.Target, smpl.TargetUnit) r.logger.DebugContext(ctx, "Alert template data for rule", "rule_name", r.Name(), "formatter", valueFormatter.Name(), "value", value, "threshold", threshold) tmplData := ruletypes.AlertTemplateData(l, value, threshold) @@ -690,7 +696,7 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er State: model.StatePending, Value: smpl.V, GeneratorURL: r.GeneratorURL(), - Receivers: r.preferredChannels, + Receivers: ruleReceiverMap[lbs.Map()[ruletypes.LabelThresholdName]], Missing: smpl.IsMissing, } } @@ -705,7 +711,9 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er alert.Value = a.Value alert.Annotations = a.Annotations - alert.Receivers = r.preferredChannels + if v, ok := alert.Labels.Map()[ruletypes.LabelThresholdName]; ok { + alert.Receivers = ruleReceiverMap[v] + } continue } diff --git a/pkg/query-service/rules/threshold_rule_test.go b/pkg/query-service/rules/threshold_rule_test.go index d311a47e186e..2e7523669135 100644 --- a/pkg/query-service/rules/threshold_rule_test.go +++ b/pkg/query-service/rules/threshold_rule_test.go @@ -824,7 +824,7 @@ func TestThresholdRuleShouldAlert(t *testing.T) { values.Points[i].Timestamp = time.Now().UnixMilli() } - resultVectors, err := rule.Threshold.ShouldAlert(c.values) + resultVectors, err := rule.Threshold.ShouldAlert(c.values, rule.Unit()) assert.NoError(t, err, "Test case %d", idx) // Compare result vectors with expected behavior @@ -1201,7 +1201,7 @@ func TestThresholdRuleLabelNormalization(t *testing.T) { values.Points[i].Timestamp = time.Now().UnixMilli() } - vector, err := rule.Threshold.ShouldAlert(c.values) + vector, err := rule.Threshold.ShouldAlert(c.values, rule.Unit()) assert.NoError(t, err) for name, value := range c.values.Labels { @@ -1211,7 +1211,7 @@ func TestThresholdRuleLabelNormalization(t *testing.T) { } // Get result vectors from threshold evaluation - resultVectors, err := rule.Threshold.ShouldAlert(c.values) + resultVectors, err := rule.Threshold.ShouldAlert(c.values, rule.Unit()) assert.NoError(t, err, "Test case %d", idx) // Compare result vectors with expected behavior @@ -1501,13 +1501,11 @@ func TestThresholdRuleUnitCombinations(t *testing.T) { Kind: ruletypes.BasicThresholdKind, Spec: ruletypes.BasicRuleThresholds{ { - Name: postableRule.AlertName, - TargetValue: &c.target, - TargetUnit: c.targetUnit, - RuleUnit: postableRule.RuleCondition.CompositeQuery.Unit, - MatchType: ruletypes.MatchType(c.matchType), - CompareOp: ruletypes.CompareOp(c.compareOp), - SelectedQuery: postableRule.RuleCondition.SelectedQuery, + Name: postableRule.AlertName, + TargetValue: &c.target, + TargetUnit: c.targetUnit, + MatchType: ruletypes.MatchType(c.matchType), + CompareOp: ruletypes.CompareOp(c.compareOp), }, }, } @@ -1612,12 +1610,10 @@ func TestThresholdRuleNoData(t *testing.T) { Kind: ruletypes.BasicThresholdKind, Spec: ruletypes.BasicRuleThresholds{ { - Name: postableRule.AlertName, - TargetValue: &target, - RuleUnit: postableRule.RuleCondition.CompositeQuery.Unit, - MatchType: 
ruletypes.AtleastOnce, - CompareOp: ruletypes.ValueIsEq, - SelectedQuery: postableRule.RuleCondition.SelectedQuery, + Name: postableRule.AlertName, + TargetValue: &target, + MatchType: ruletypes.AtleastOnce, + CompareOp: ruletypes.ValueIsEq, }, }, } @@ -1734,13 +1730,11 @@ func TestThresholdRuleTracesLink(t *testing.T) { Kind: ruletypes.BasicThresholdKind, Spec: ruletypes.BasicRuleThresholds{ { - Name: postableRule.AlertName, - TargetValue: &c.target, - TargetUnit: c.targetUnit, - RuleUnit: postableRule.RuleCondition.CompositeQuery.Unit, - MatchType: ruletypes.MatchType(c.matchType), - CompareOp: ruletypes.CompareOp(c.compareOp), - SelectedQuery: postableRule.RuleCondition.SelectedQuery, + Name: postableRule.AlertName, + TargetValue: &c.target, + TargetUnit: c.targetUnit, + MatchType: ruletypes.MatchType(c.matchType), + CompareOp: ruletypes.CompareOp(c.compareOp), }, }, } @@ -1873,13 +1867,11 @@ func TestThresholdRuleLogsLink(t *testing.T) { Kind: ruletypes.BasicThresholdKind, Spec: ruletypes.BasicRuleThresholds{ { - Name: postableRule.AlertName, - TargetValue: &c.target, - TargetUnit: c.targetUnit, - RuleUnit: postableRule.RuleCondition.CompositeQuery.Unit, - MatchType: ruletypes.MatchType(c.matchType), - CompareOp: ruletypes.CompareOp(c.compareOp), - SelectedQuery: postableRule.RuleCondition.SelectedQuery, + Name: postableRule.AlertName, + TargetValue: &c.target, + TargetUnit: c.targetUnit, + MatchType: ruletypes.MatchType(c.matchType), + CompareOp: ruletypes.CompareOp(c.compareOp), }, }, } @@ -2125,22 +2117,18 @@ func TestMultipleThresholdRule(t *testing.T) { Kind: ruletypes.BasicThresholdKind, Spec: ruletypes.BasicRuleThresholds{ { - Name: "first_threshold", - TargetValue: &c.target, - TargetUnit: c.targetUnit, - RuleUnit: postableRule.RuleCondition.CompositeQuery.Unit, - MatchType: ruletypes.MatchType(c.matchType), - CompareOp: ruletypes.CompareOp(c.compareOp), - SelectedQuery: postableRule.RuleCondition.SelectedQuery, + Name: "first_threshold", + TargetValue: &c.target, + TargetUnit: c.targetUnit, + MatchType: ruletypes.MatchType(c.matchType), + CompareOp: ruletypes.CompareOp(c.compareOp), }, { - Name: "second_threshold", - TargetValue: &c.secondTarget, - TargetUnit: c.targetUnit, - RuleUnit: postableRule.RuleCondition.CompositeQuery.Unit, - MatchType: ruletypes.MatchType(c.matchType), - CompareOp: ruletypes.CompareOp(c.compareOp), - SelectedQuery: postableRule.RuleCondition.SelectedQuery, + Name: "second_threshold", + TargetValue: &c.secondTarget, + TargetUnit: c.targetUnit, + MatchType: ruletypes.MatchType(c.matchType), + CompareOp: ruletypes.CompareOp(c.compareOp), }, }, } diff --git a/pkg/signoz/provider.go b/pkg/signoz/provider.go index 72b037ad31f5..d66b2acd461e 100644 --- a/pkg/signoz/provider.go +++ b/pkg/signoz/provider.go @@ -38,6 +38,7 @@ import ( "github.com/SigNoz/signoz/pkg/telemetrystore" "github.com/SigNoz/signoz/pkg/telemetrystore/clickhousetelemetrystore" "github.com/SigNoz/signoz/pkg/telemetrystore/telemetrystorehook" + routeTypes "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" "github.com/SigNoz/signoz/pkg/version" "github.com/SigNoz/signoz/pkg/web" "github.com/SigNoz/signoz/pkg/web/noopweb" @@ -133,6 +134,7 @@ func NewSQLMigrationProviderFactories( sqlmigration.NewQueryBuilderV5MigrationFactory(sqlstore, telemetryStore), sqlmigration.NewAddMeterQuickFiltersFactory(sqlstore, sqlschema), sqlmigration.NewUpdateTTLSettingForCustomRetentionFactory(sqlstore, sqlschema), + sqlmigration.NewAddRoutePolicyFactory(sqlstore, sqlschema), ) } @@ -155,9 +157,9 @@ func 
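// --- editor's sketch (not part of the patch): the slimmed-down threshold spec used by the tests
// above. RuleUnit and SelectedQuery are gone; the query unit is now supplied to ShouldAlert at
// evaluation time, and the new Channels field drives per-threshold routing via GetRuleReceivers.
// Values are illustrative; assumed import: "github.com/SigNoz/signoz/pkg/types/ruletypes".
//
// func exampleThresholdSpec() []ruletypes.RuleReceivers {
// 	target := 80.0
// 	spec := ruletypes.BasicRuleThresholds{{
// 		Name:        "critical",
// 		TargetValue: &target,
// 		TargetUnit:  "%",
// 		MatchType:   ruletypes.AtleastOnce,
// 		CompareOp:   ruletypes.ValueIsAbove,
// 		Channels:    []string{"critical-alerts"},
// 	}}
// 	return spec.GetRuleReceivers() // [{Name: "critical", Channels: ["critical-alerts"]}]
// }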
NewPrometheusProviderFactories(telemetryStore telemetrystore.TelemetryStore ) } -func NewNotificationManagerProviderFactories() factory.NamedMap[factory.ProviderFactory[nfmanager.NotificationManager, nfmanager.Config]] { +func NewNotificationManagerProviderFactories(routeStore routeTypes.RouteStore) factory.NamedMap[factory.ProviderFactory[nfmanager.NotificationManager, nfmanager.Config]] { return factory.MustNewNamedMap( - rulebasednotification.NewFactory(), + rulebasednotification.NewFactory(routeStore), ) } diff --git a/pkg/signoz/signoz.go b/pkg/signoz/signoz.go index 08fd5cb60c4b..6d531e6fe3da 100644 --- a/pkg/signoz/signoz.go +++ b/pkg/signoz/signoz.go @@ -4,6 +4,7 @@ import ( "context" "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfroutingstore/sqlroutingstore" "github.com/SigNoz/signoz/pkg/analytics" "github.com/SigNoz/signoz/pkg/cache" "github.com/SigNoz/signoz/pkg/emailing" @@ -230,12 +231,14 @@ func New( // Initialize user getter userGetter := impluser.NewGetter(impluser.NewStore(sqlstore, providerSettings)) + // will need to create factory for all stores + routeStore := sqlroutingstore.NewStore(sqlstore) // shared NotificationManager instance for both alertmanager and rules notificationManager, err := factory.NewProviderFromNamedMap( ctx, providerSettings, nfmanager.Config{}, - NewNotificationManagerProviderFactories(), + NewNotificationManagerProviderFactories(routeStore), "rulebased", ) if err != nil { diff --git a/pkg/sqlmigration/049_add_route_policy.go b/pkg/sqlmigration/049_add_route_policy.go new file mode 100644 index 000000000000..c59207b4fda4 --- /dev/null +++ b/pkg/sqlmigration/049_add_route_policy.go @@ -0,0 +1,260 @@ +package sqlmigration + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "github.com/SigNoz/signoz/pkg/errors" + "github.com/SigNoz/signoz/pkg/factory" + "github.com/SigNoz/signoz/pkg/sqlschema" + "github.com/SigNoz/signoz/pkg/sqlstore" + "github.com/SigNoz/signoz/pkg/types" + "github.com/SigNoz/signoz/pkg/types/ruletypes" + "github.com/SigNoz/signoz/pkg/valuer" + "github.com/uptrace/bun" + "github.com/uptrace/bun/migrate" + "log/slog" + "time" +) + +// Shared types for migration + +type expressionRoute struct { + bun.BaseModel `bun:"table:route_policy"` + types.Identifiable + types.TimeAuditable + types.UserAuditable + + Expression string `bun:"expression,type:text"` + ExpressionKind string `bun:"kind,type:text"` + + Channels []string `bun:"channels,type:text"` + + Name string `bun:"name,type:text"` + Description string `bun:"description,type:text"` + Enabled bool `bun:"enabled,type:boolean,default:true"` + Tags []string `bun:"tags,type:text"` + + OrgID string `bun:"org_id,type:text"` +} + +type rule struct { + bun.BaseModel `bun:"table:rule"` + types.Identifiable + types.TimeAuditable + types.UserAuditable + Deleted int `bun:"deleted,default:0"` + Data string `bun:"data,type:text"` + OrgID string `bun:"org_id,type:text"` +} + +type addRoutePolicies struct { + sqlstore sqlstore.SQLStore + sqlschema sqlschema.SQLSchema + logger *slog.Logger +} + +func NewAddRoutePolicyFactory(sqlstore sqlstore.SQLStore, sqlschema sqlschema.SQLSchema) factory.ProviderFactory[SQLMigration, Config] { + return factory.NewProviderFactory(factory.MustNewName("add_route_policy"), func(ctx context.Context, providerSettings factory.ProviderSettings, config Config) (SQLMigration, error) { + return newAddRoutePolicy(ctx, providerSettings, config, sqlstore, 
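// --- editor's sketch (not part of the patch): the wiring signoz.go performs above, shown in
// isolation. All names come from this patch; the helper name and the exact return types are
// assumptions, and ctx, providerSettings, and sqlstore are taken from the surrounding setup.
//
// func buildNotificationManager(ctx context.Context, providerSettings factory.ProviderSettings, sqlstore sqlstore.SQLStore) (nfmanager.NotificationManager, error) {
// 	routeStore := sqlroutingstore.NewStore(sqlstore)
// 	return factory.NewProviderFromNamedMap(
// 		ctx, providerSettings, nfmanager.Config{},
// 		NewNotificationManagerProviderFactories(routeStore), // registers rulebasednotification.NewFactory(routeStore)
// 		"rulebased",
// 	)
// }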
sqlschema) + }) +} + +func newAddRoutePolicy(_ context.Context, settings factory.ProviderSettings, _ Config, sqlstore sqlstore.SQLStore, sqlschema sqlschema.SQLSchema) (SQLMigration, error) { + return &addRoutePolicies{ + sqlstore: sqlstore, + sqlschema: sqlschema, + logger: settings.Logger, + }, nil +} + +func (migration *addRoutePolicies) Register(migrations *migrate.Migrations) error { + if err := migrations.Register(migration.Up, migration.Down); err != nil { + return err + } + + return nil +} + +func (migration *addRoutePolicies) Up(ctx context.Context, db *bun.DB) error { + _, _, err := migration.sqlschema.GetTable(ctx, sqlschema.TableName("route_policy")) + if err == nil { + return nil + } + + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return err + } + + defer func() { + _ = tx.Rollback() + }() + + sqls := [][]byte{} + + // Create the route_policy table + table := &sqlschema.Table{ + Name: "route_policy", + Columns: []*sqlschema.Column{ + {Name: "id", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "created_at", DataType: sqlschema.DataTypeTimestamp, Nullable: false}, + {Name: "updated_at", DataType: sqlschema.DataTypeTimestamp, Nullable: false}, + {Name: "created_by", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "updated_by", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "expression", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "kind", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "channels", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "name", DataType: sqlschema.DataTypeText, Nullable: false}, + {Name: "description", DataType: sqlschema.DataTypeText, Nullable: true}, + {Name: "enabled", DataType: sqlschema.DataTypeBoolean, Nullable: false, Default: "true"}, + {Name: "tags", DataType: sqlschema.DataTypeText, Nullable: true}, + {Name: "org_id", DataType: sqlschema.DataTypeText, Nullable: false}, + }, + PrimaryKeyConstraint: &sqlschema.PrimaryKeyConstraint{ + ColumnNames: []sqlschema.ColumnName{"id"}, + }, + ForeignKeyConstraints: []*sqlschema.ForeignKeyConstraint{ + { + ReferencingColumnName: "org_id", + ReferencedTableName: "organizations", + ReferencedColumnName: "id", + }, + }, + } + + tableSQLs := migration.sqlschema.Operator().CreateTable(table) + sqls = append(sqls, tableSQLs...) + + for _, sqlStmt := range sqls { + if _, err := tx.ExecContext(ctx, string(sqlStmt)); err != nil { + return err + } + } + + err = migration.migrateRulesToRoutePolicies(ctx, tx) + if err != nil { + return err + } + + if err := tx.Commit(); err != nil { + return err + } + + return nil +} + +func (migration *addRoutePolicies) migrateRulesToRoutePolicies(ctx context.Context, tx bun.Tx) error { + var rules []*rule + err := tx.NewSelect(). + Model(&rules). + Where("deleted = ?", 0). + Scan(ctx) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return nil // No rules to migrate + } + return errors.NewInternalf(errors.CodeInternal, "failed to fetch rules") + } + + channelsByOrg, err := migration.getAllChannels(ctx, tx) + if err != nil { + return errors.NewInternalf(errors.CodeInternal, "fetching channels error: %v", err) + } + + var routesToInsert []*expressionRoute + + routesToInsert, err = migration.convertRulesToRoutes(rules, channelsByOrg) + if err != nil { + return errors.NewInternalf(errors.CodeInternal, "converting rules to routes error: %v", err) + } + + // Insert all routes in a single batch operation + if len(routesToInsert) > 0 { + _, err = tx.NewInsert(). + Model(&routesToInsert). 
+ Exec(ctx) + if err != nil { + return errors.NewInternalf(errors.CodeInternal, "failed to insert notification routes") + } + } + + return nil +} + +func (migration *addRoutePolicies) convertRulesToRoutes(rules []*rule, channelsByOrg map[string][]string) ([]*expressionRoute, error) { + var routes []*expressionRoute + for _, r := range rules { + var gettableRule ruletypes.GettableRule + if err := json.Unmarshal([]byte(r.Data), &gettableRule); err != nil { + return nil, errors.NewInternalf(errors.CodeInternal, "failed to unmarshal rule data for rule ID %s: %v", r.ID, err) + } + + if len(gettableRule.PreferredChannels) == 0 { + channels, exists := channelsByOrg[r.OrgID] + if !exists || len(channels) == 0 { + continue + } + gettableRule.PreferredChannels = channels + } + severity := "critical" + if v, ok := gettableRule.Labels["severity"]; ok { + severity = v + } + expression := fmt.Sprintf(`%s == "%s" && %s == "%s"`, "threshold.name", severity, "ruleId", r.ID.String()) + route := &expressionRoute{ + Identifiable: types.Identifiable{ + ID: valuer.GenerateUUID(), + }, + TimeAuditable: types.TimeAuditable{ + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + }, + UserAuditable: types.UserAuditable{ + CreatedBy: r.CreatedBy, + UpdatedBy: r.UpdatedBy, + }, + Expression: expression, + ExpressionKind: "rule", + Channels: gettableRule.PreferredChannels, + Name: r.ID.StringValue(), + Enabled: true, + OrgID: r.OrgID, + } + routes = append(routes, route) + } + return routes, nil +} + +func (migration *addRoutePolicies) getAllChannels(ctx context.Context, tx bun.Tx) (map[string][]string, error) { + type channel struct { + bun.BaseModel `bun:"table:notification_channel"` + types.Identifiable + types.TimeAuditable + Name string `json:"name" bun:"name"` + Type string `json:"type" bun:"type"` + Data string `json:"data" bun:"data"` + OrgID string `json:"org_id" bun:"org_id"` + } + + var channels []*channel + err := tx.NewSelect(). + Model(&channels). + Scan(ctx) + if err != nil { + return nil, errors.NewInternalf(errors.CodeInternal, "failed to fetch all channels") + } + + // Group channels by org ID + channelsByOrg := make(map[string][]string) + for _, ch := range channels { + channelsByOrg[ch.OrgID] = append(channelsByOrg[ch.OrgID], ch.Name) + } + + return channelsByOrg, nil +} + +func (migration *addRoutePolicies) Down(ctx context.Context, db *bun.DB) error { + return nil +} diff --git a/pkg/types/alertmanagertypes/alert.go b/pkg/types/alertmanagertypes/alert.go index 971ec23b1ccd..02d3ee2fe039 100644 --- a/pkg/types/alertmanagertypes/alert.go +++ b/pkg/types/alertmanagertypes/alert.go @@ -27,6 +27,8 @@ type ( // An alias for the Alert type from the alertmanager package. 
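// --- editor's sketch (not part of the patch): the expression the migration above backfills for
// every pre-existing rule, built from the rule's severity label (defaulting to "critical") and
// its ID. The helper name and sample values are illustrative; assumed import: "fmt".
//
// func backfillExpression(ruleID, severity string) string {
// 	if severity == "" {
// 		severity = "critical"
// 	}
// 	return fmt.Sprintf(`threshold.name == "%s" && ruleId == "%s"`, severity, ruleID)
// }
//
// backfillExpression("abc-123", "warning") -> threshold.name == "warning" && ruleId == "abc-123"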
Alert = types.Alert + AlertSlice = types.AlertSlice + PostableAlert = models.PostableAlert PostableAlerts = models.PostableAlerts @@ -38,6 +40,10 @@ type ( GettableAlerts = models.GettableAlerts ) +const ( + NoDataLabel = model.LabelName("nodata") +) + type DeprecatedGettableAlert struct { *model.Alert Status types.AlertStatus `json:"status"` @@ -307,3 +313,11 @@ func receiversMatchFilter(receivers []string, filter *regexp.Regexp) bool { return false } + +func NoDataAlert(alert *types.Alert) bool { + if _, ok := alert.Labels[NoDataLabel]; ok { + return true + } else { + return false + } +} diff --git a/pkg/types/alertmanagertypes/config.go b/pkg/types/alertmanagertypes/config.go index a438afc7ee12..1b394fb0acfc 100644 --- a/pkg/types/alertmanagertypes/config.go +++ b/pkg/types/alertmanagertypes/config.go @@ -21,6 +21,7 @@ import ( const ( DefaultReceiverName string = "default-receiver" DefaultGroupBy string = "ruleId" + DefaultGroupByAll string = "__all__" ) var ( @@ -193,6 +194,20 @@ func (c *Config) SetRouteConfig(routeConfig RouteConfig) error { return nil } +func (c *Config) AddInhibitRules(rules []config.InhibitRule) error { + if c.alertmanagerConfig == nil { + return errors.New(errors.TypeInvalidInput, ErrCodeAlertmanagerConfigInvalid, "config is nil") + } + + c.alertmanagerConfig.InhibitRules = append(c.alertmanagerConfig.InhibitRules, rules...) + + c.storeableConfig.Config = string(newRawFromConfig(c.alertmanagerConfig)) + c.storeableConfig.Hash = fmt.Sprintf("%x", newConfigHash(c.storeableConfig.Config)) + c.storeableConfig.UpdatedAt = time.Now() + + return nil +} + func (c *Config) AlertmanagerConfig() *config.Config { return c.alertmanagerConfig } @@ -304,6 +319,27 @@ func (c *Config) CreateRuleIDMatcher(ruleID string, receiverNames []string) erro return nil } +func (c *Config) DeleteRuleIDInhibitor(ruleID string) error { + if c.alertmanagerConfig.InhibitRules == nil { + return nil // already nil + } + + var filteredRules []config.InhibitRule + for _, inhibitor := range c.alertmanagerConfig.InhibitRules { + sourceContainsRuleID := matcherContainsRuleID(inhibitor.SourceMatchers, ruleID) + targetContainsRuleID := matcherContainsRuleID(inhibitor.TargetMatchers, ruleID) + if !sourceContainsRuleID && !targetContainsRuleID { + filteredRules = append(filteredRules, inhibitor) + } + } + c.alertmanagerConfig.InhibitRules = filteredRules + c.storeableConfig.Config = string(newRawFromConfig(c.alertmanagerConfig)) + c.storeableConfig.Hash = fmt.Sprintf("%x", newConfigHash(c.storeableConfig.Config)) + c.storeableConfig.UpdatedAt = time.Now() + + return nil +} + func (c *Config) UpdateRuleIDMatcher(ruleID string, receiverNames []string) error { err := c.DeleteRuleIDMatcher(ruleID) if err != nil { @@ -405,6 +441,8 @@ func init() { type NotificationConfig struct { NotificationGroup map[model.LabelName]struct{} Renotify ReNotificationConfig + UsePolicy bool + GroupByAll bool } func (nc *NotificationConfig) DeepCopy() NotificationConfig { @@ -415,6 +453,7 @@ func (nc *NotificationConfig) DeepCopy() NotificationConfig { for k, v := range nc.NotificationGroup { deepCopy.NotificationGroup[k] = v } + deepCopy.UsePolicy = nc.UsePolicy return deepCopy } @@ -423,7 +462,7 @@ type ReNotificationConfig struct { RenotifyInterval time.Duration } -func NewNotificationConfig(groups []string, renotifyInterval time.Duration, noDataRenotifyInterval time.Duration) NotificationConfig { +func NewNotificationConfig(groups []string, renotifyInterval time.Duration, noDataRenotifyInterval time.Duration, policy bool) 
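// --- editor's sketch (not part of the patch): how the new NotificationConfig flags are derived
// by NewNotificationConfig below. Passing the special "__all__" group (DefaultGroupByAll) turns
// on GroupByAll, and the final bool carries the rule's usePolicy setting. Intervals are
// illustrative; assumed imports: "time", "github.com/SigNoz/signoz/pkg/types/alertmanagertypes".
//
// cfg := alertmanagertypes.NewNotificationConfig(
// 	[]string{alertmanagertypes.DefaultGroupByAll}, // group-by: the "__all__" marker
// 	10*time.Minute,                                // renotify interval for firing alerts
// 	4*time.Hour,                                   // renotify interval for no-data
// 	false,                                         // usePolicy flag from the rule's notification settings
// )
// // cfg.GroupByAll == true, cfg.UsePolicy == false, and "__all__" is also added to
// // cfg.NotificationGroup by the loop over groups.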
NotificationConfig { notificationConfig := GetDefaultNotificationConfig() if renotifyInterval != 0 { @@ -435,8 +474,13 @@ func NewNotificationConfig(groups []string, renotifyInterval time.Duration, noDa } for _, group := range groups { notificationConfig.NotificationGroup[model.LabelName(group)] = struct{}{} + if group == DefaultGroupByAll { + notificationConfig.GroupByAll = true + } } + notificationConfig.UsePolicy = policy + return notificationConfig } diff --git a/pkg/types/alertmanagertypes/expressionroute.go b/pkg/types/alertmanagertypes/expressionroute.go new file mode 100644 index 000000000000..858864c681cc --- /dev/null +++ b/pkg/types/alertmanagertypes/expressionroute.go @@ -0,0 +1,139 @@ +package alertmanagertypes + +import ( + "context" + "github.com/expr-lang/expr" + "time" + + "github.com/SigNoz/signoz/pkg/errors" + "github.com/SigNoz/signoz/pkg/types" + "github.com/SigNoz/signoz/pkg/valuer" + "github.com/uptrace/bun" +) + +type PostableRoutePolicy struct { + Expression string `json:"expression"` + ExpressionKind ExpressionKind `json:"kind"` + Channels []string `json:"channels"` + Name string `json:"name"` + Description string `json:"description"` + Tags []string `json:"tags,omitempty"` +} + +func (p *PostableRoutePolicy) Validate() error { + if p.Expression == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "expression is required") + } + + if p.Name == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "name is required") + } + + if len(p.Channels) == 0 { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "at least one channel is required") + } + + // Validate channels are not empty + for i, channel := range p.Channels { + if channel == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "channel at index %d cannot be empty", i) + } + } + + if p.ExpressionKind != PolicyBasedExpression && p.ExpressionKind != RuleBasedExpression { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "unsupported expression kind: %s", p.ExpressionKind.StringValue()) + } + + _, err := expr.Compile(p.Expression) + if err != nil { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "invalid expression syntax: %v", err) + } + + return nil +} + +type GettableRoutePolicy struct { + PostableRoutePolicy // Embedded + + ID string `json:"id"` + + // Audit fields + CreatedAt *time.Time `json:"createdAt"` + UpdatedAt *time.Time `json:"updatedAt"` + CreatedBy *string `json:"createdBy"` + UpdatedBy *string `json:"updatedBy"` +} + +type ExpressionKind struct { + valuer.String +} + +var ( + RuleBasedExpression = ExpressionKind{valuer.NewString("rule")} + PolicyBasedExpression = ExpressionKind{valuer.NewString("policy")} +) + +// RoutePolicy represents the database model for expression routes +type RoutePolicy struct { + bun.BaseModel `bun:"table:route_policy"` + types.Identifiable + types.TimeAuditable + types.UserAuditable + + Expression string `bun:"expression,type:text,notnull" json:"expression"` + ExpressionKind ExpressionKind `bun:"kind,type:text" json:"kind"` + + Channels []string `bun:"channels,type:jsonb" json:"channels"` + + Name string `bun:"name,type:text" json:"name"` + Description string `bun:"description,type:text" json:"description"` + Enabled bool `bun:"enabled,type:boolean,default:true" json:"enabled"` + Tags []string `bun:"tags,type:jsonb" json:"tags,omitempty"` + + OrgID string `bun:"org_id,type:text,notnull" json:"orgId"` +} + +func (er *RoutePolicy) Validate() error { + if er == nil { + return 
errors.NewInvalidInputf(errors.CodeInvalidInput, "route_policy cannot be nil") + } + + if er.Expression == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "expression is required") + } + + if er.Name == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "name is required") + } + + if er.OrgID == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "organization ID is required") + } + + if len(er.Channels) == 0 { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "at least one channel is required") + } + + // Validate channels are not empty + for i, channel := range er.Channels { + if channel == "" { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "channel at index %d cannot be empty", i) + } + } + + if er.ExpressionKind != PolicyBasedExpression && er.ExpressionKind != RuleBasedExpression { + return errors.NewInvalidInputf(errors.CodeInvalidInput, "unsupported expression kind: %s", er.ExpressionKind.StringValue()) + } + + return nil +} + +type RouteStore interface { + GetByID(ctx context.Context, orgId string, id string) (*RoutePolicy, error) + Create(ctx context.Context, route *RoutePolicy) error + CreateBatch(ctx context.Context, routes []*RoutePolicy) error + Delete(ctx context.Context, orgId string, id string) error + GetAllByKind(ctx context.Context, orgID string, kind ExpressionKind) ([]*RoutePolicy, error) + GetAllByName(ctx context.Context, orgID string, name string) ([]*RoutePolicy, error) + DeleteRouteByName(ctx context.Context, orgID string, name string) error +} diff --git a/pkg/types/alertmanagertypes/receiver.go b/pkg/types/alertmanagertypes/receiver.go index 83cae2931b8d..3916c150eecf 100644 --- a/pkg/types/alertmanagertypes/receiver.go +++ b/pkg/types/alertmanagertypes/receiver.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "fmt" + "github.com/prometheus/common/model" "log/slog" "time" @@ -49,9 +50,9 @@ func NewReceiver(input string) (Receiver, error) { return receiverWithDefaults, nil } -func TestReceiver(ctx context.Context, receiver Receiver, receiverIntegrationsFunc ReceiverIntegrationsFunc, config *Config, tmpl *template.Template, logger *slog.Logger, alert *Alert) error { - ctx = notify.WithGroupKey(ctx, fmt.Sprintf("%s-%s-%d", receiver.Name, alert.Labels.Fingerprint(), time.Now().Unix())) - ctx = notify.WithGroupLabels(ctx, alert.Labels) +func TestReceiver(ctx context.Context, receiver Receiver, receiverIntegrationsFunc ReceiverIntegrationsFunc, config *Config, tmpl *template.Template, logger *slog.Logger, lSet model.LabelSet, alert ...*Alert) error { + ctx = notify.WithGroupKey(ctx, fmt.Sprintf("%s-%s-%d", receiver.Name, lSet.Fingerprint(), time.Now().Unix())) + ctx = notify.WithGroupLabels(ctx, lSet) ctx = notify.WithReceiverName(ctx, receiver.Name) // We need to create a new config with the same global and route config but empty receivers and routes @@ -80,7 +81,7 @@ func TestReceiver(ctx context.Context, receiver Receiver, receiverIntegrationsFu return errors.Newf(errors.TypeNotFound, errors.CodeNotFound, "no integrations found for receiver %s", receiver.Name) } - if _, err = integrations[0].Notify(ctx, alert); err != nil { + if _, err = integrations[0].Notify(ctx, alert...); err != nil { return err } diff --git a/pkg/types/ruletypes/api_params.go b/pkg/types/ruletypes/api_params.go index 4cc44ed90230..df04c68382cf 100644 --- a/pkg/types/ruletypes/api_params.go +++ b/pkg/types/ruletypes/api_params.go @@ -15,6 +15,8 @@ import ( "github.com/SigNoz/signoz/pkg/query-service/utils/times" 
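// --- editor's sketch (not part of the patch): validating a user-defined, policy-kind route
// before it is stored. Validate rejects empty expressions, names, or channels, unknown kinds,
// and expressions that fail expr.Compile. Field values are illustrative; assumed import:
// "github.com/SigNoz/signoz/pkg/types/alertmanagertypes".
//
// p := &alertmanagertypes.PostableRoutePolicy{
// 	Expression:     `severity == "critical" && service.name == "checkout"`,
// 	ExpressionKind: alertmanagertypes.PolicyBasedExpression,
// 	Channels:       []string{"pagerduty-checkout"},
// 	Name:           "checkout-critical",
// 	Description:    "page the checkout on-call for critical alerts",
// }
// if err := p.Validate(); err != nil {
// 	// handle invalid routing policy
// }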
"github.com/SigNoz/signoz/pkg/query-service/utils/timestamp" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + + "github.com/prometheus/alertmanager/config" ) type AlertType string @@ -65,21 +67,95 @@ type PostableRule struct { } type NotificationSettings struct { - NotificationGroupBy []string `json:"notificationGroupBy,omitempty"` - ReNotifyInterval Duration `json:"renotify,omitempty"` - AlertStates []model.AlertState `json:"alertStates,omitempty"` + GroupBy []string `json:"groupBy,omitempty"` + Renotify Renotify `json:"renotify,omitempty"` + UsePolicy bool `json:"usePolicy,omitempty"` +} + +type Renotify struct { + Enabled bool `json:"enabled"` + ReNotifyInterval Duration `json:"interval,omitempty"` + AlertStates []model.AlertState `json:"alertStates,omitempty"` } func (ns *NotificationSettings) GetAlertManagerNotificationConfig() alertmanagertypes.NotificationConfig { - var renotifyInterval Duration - var noDataRenotifyInterval Duration - if slices.Contains(ns.AlertStates, model.StateNoData) { - noDataRenotifyInterval = ns.ReNotifyInterval + var renotifyInterval time.Duration + var noDataRenotifyInterval time.Duration + if ns.Renotify.Enabled { + if slices.Contains(ns.Renotify.AlertStates, model.StateNoData) { + noDataRenotifyInterval = time.Duration(ns.Renotify.ReNotifyInterval) + } + if slices.Contains(ns.Renotify.AlertStates, model.StateFiring) { + renotifyInterval = time.Duration(ns.Renotify.ReNotifyInterval) + } + } else { + renotifyInterval = 8760 * time.Hour //1 year for no renotify substitute + noDataRenotifyInterval = 8760 * time.Hour } - if slices.Contains(ns.AlertStates, model.StateFiring) { - renotifyInterval = ns.ReNotifyInterval + return alertmanagertypes.NewNotificationConfig(ns.GroupBy, renotifyInterval, noDataRenotifyInterval, ns.UsePolicy) +} + +func (r *PostableRule) GetRuleRouteRequest(ruleId string) ([]*alertmanagertypes.PostableRoutePolicy, error) { + threshold, err := r.RuleCondition.Thresholds.GetRuleThreshold() + if err != nil { + return nil, err } - return alertmanagertypes.NewNotificationConfig(ns.NotificationGroupBy, time.Duration(renotifyInterval), time.Duration(noDataRenotifyInterval)) + receivers := threshold.GetRuleReceivers() + routeRequests := make([]*alertmanagertypes.PostableRoutePolicy, 0) + for _, receiver := range receivers { + expression := fmt.Sprintf(`%s == "%s" && %s == "%s"`, LabelThresholdName, receiver.Name, LabelRuleId, ruleId) + routeRequests = append(routeRequests, &alertmanagertypes.PostableRoutePolicy{ + Expression: expression, + ExpressionKind: alertmanagertypes.RuleBasedExpression, + Channels: receiver.Channels, + Name: ruleId, + Description: fmt.Sprintf("Auto-generated route for rule %s", ruleId), + Tags: []string{"auto-generated", "rule-based"}, + }) + } + return routeRequests, nil +} + +func (r *PostableRule) GetInhibitRules(ruleId string) ([]config.InhibitRule, error) { + threshold, err := r.RuleCondition.Thresholds.GetRuleThreshold() + if err != nil { + return nil, err + } + var groups []string + if r.NotificationSettings != nil { + for k := range r.NotificationSettings.GetAlertManagerNotificationConfig().NotificationGroup { + groups = append(groups, string(k)) + } + } + receivers := threshold.GetRuleReceivers() + var inhibitRules []config.InhibitRule + for i := 0; i < len(receivers)-1; i++ { + rule := config.InhibitRule{ + SourceMatchers: config.Matchers{ + { + Name: LabelThresholdName, + Value: receivers[i].Name, + }, + { + Name: LabelRuleId, + Value: ruleId, + }, + }, + TargetMatchers: config.Matchers{ + { + Name: 
LabelThresholdName, + Value: receivers[i+1].Name, + }, + { + Name: LabelRuleId, + Value: ruleId, + }, + }, + Equal: groups, + } + inhibitRules = append(inhibitRules, rule) + } + return inhibitRules, nil } func (ns *NotificationSettings) UnmarshalJSON(data []byte) error { @@ -95,7 +171,7 @@ func (ns *NotificationSettings) UnmarshalJSON(data []byte) error { } // Validate states after unmarshaling - for _, state := range ns.AlertStates { + for _, state := range ns.Renotify.AlertStates { if state != model.StateFiring && state != model.StateNoData { return fmt.Errorf("invalid alert state: %s", state) } @@ -143,15 +219,25 @@ func (r *PostableRule) processRuleDefaults() error { Kind: BasicThresholdKind, Spec: BasicRuleThresholds{{ Name: thresholdName, - RuleUnit: r.RuleCondition.CompositeQuery.Unit, TargetUnit: r.RuleCondition.TargetUnit, TargetValue: r.RuleCondition.Target, MatchType: r.RuleCondition.MatchType, CompareOp: r.RuleCondition.CompareOp, + Channels: r.PreferredChannels, }}, } r.RuleCondition.Thresholds = &thresholdData r.Evaluation = &EvaluationEnvelope{RollingEvaluation, RollingWindow{EvalWindow: r.EvalWindow, Frequency: r.Frequency}} + r.NotificationSettings = &NotificationSettings{ + Renotify: Renotify{ + Enabled: true, + ReNotifyInterval: Duration(4 * time.Hour), + AlertStates: []model.AlertState{model.StateFiring}, + }, + } + if r.RuleCondition.AlertOnAbsent { + r.NotificationSettings.Renotify.AlertStates = append(r.NotificationSettings.Renotify.AlertStates, model.StateNoData) + } } } @@ -170,6 +256,7 @@ func (r *PostableRule) MarshalJSON() ([]byte, error) { } aux.Evaluation = nil aux.SchemaVersion = "" + aux.NotificationSettings = nil return json.Marshal(aux) default: copyStruct := *r @@ -192,7 +279,7 @@ func isValidLabelName(ln string) bool { return false } for i, b := range ln { - if !((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || (b >= '0' && b <= '9' && i > 0)) { + if !((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || b == '.' 
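// --- editor's sketch (not part of the patch): how the notificationSettings block used in the
// rule payloads above maps onto the alertmanager NotificationConfig. The JSON is the same shape
// as in the tests; assumed imports: "encoding/json",
// "github.com/SigNoz/signoz/pkg/types/ruletypes".
//
// var ns ruletypes.NotificationSettings
// _ = json.Unmarshal([]byte(`{
// 	"groupBy": ["k8s.node.name"],
// 	"renotify": {"interval": "10m", "enabled": true, "alertStates": ["firing"]}
// }`), &ns)
// cfg := ns.GetAlertManagerNotificationConfig()
// // "firing" is listed, so cfg.Renotify.RenotifyInterval becomes 10m; "nodata" is not listed,
// // so the no-data interval keeps its default (4h in the tests above).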
|| (b >= '0' && b <= '9' && i > 0)) { return false } } @@ -347,6 +434,7 @@ func (g *GettableRule) MarshalJSON() ([]byte, error) { } aux.Evaluation = nil aux.SchemaVersion = "" + aux.NotificationSettings = nil return json.Marshal(aux) default: copyStruct := *g diff --git a/pkg/types/ruletypes/api_params_test.go b/pkg/types/ruletypes/api_params_test.go index 74d58fdb39a0..5b33d0b72e0d 100644 --- a/pkg/types/ruletypes/api_params_test.go +++ b/pkg/types/ruletypes/api_params_test.go @@ -2,10 +2,11 @@ package ruletypes import ( "encoding/json" - "github.com/stretchr/testify/assert" "testing" "time" + "github.com/stretchr/testify/assert" + v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3" ) @@ -303,10 +304,6 @@ func TestParseIntoRuleSchemaVersioning(t *testing.T) { t.Errorf("Expected threshold name 'warning' from severity label, got '%s'", spec.Name) } - // Verify all fields are copied from RuleCondition - if spec.RuleUnit != "percent" { - t.Errorf("Expected RuleUnit 'percent', got '%s'", spec.RuleUnit) - } if spec.TargetUnit != "%" { t.Errorf("Expected TargetUnit '%%', got '%s'", spec.TargetUnit) } @@ -455,9 +452,6 @@ func TestParseIntoRuleSchemaVersioning(t *testing.T) { if spec.TargetUnit != "%" { t.Errorf("Expected TargetUnit '%%' (overwritten), got '%s'", spec.TargetUnit) } - if spec.RuleUnit != "percent" { - t.Errorf("Expected RuleUnit 'percent' (overwritten), got '%s'", spec.RuleUnit) - } if rule.Evaluation == nil { t.Fatal("Expected Evaluation to be populated") @@ -630,9 +624,9 @@ func TestParseIntoRuleThresholdGeneration(t *testing.T) { vector, err := threshold.ShouldAlert(v3.Series{ Points: []v3.Point{{Value: 0.15, Timestamp: 1000}}, // 150ms in seconds Labels: map[string]string{"test": "label"}, - }) + }, "") if err != nil { - t.Fatalf("Unexpected error in ShouldAlert: %v", err) + t.Fatalf("Unexpected error in shouldAlert: %v", err) } if len(vector) == 0 { @@ -707,9 +701,9 @@ func TestParseIntoRuleMultipleThresholds(t *testing.T) { vector, err := threshold.ShouldAlert(v3.Series{ Points: []v3.Point{{Value: 95.0, Timestamp: 1000}}, // 95% CPU usage Labels: map[string]string{"service": "test"}, - }) + }, "") if err != nil { - t.Fatalf("Unexpected error in ShouldAlert: %v", err) + t.Fatalf("Unexpected error in shouldAlert: %v", err) } assert.Equal(t, 2, len(vector)) @@ -717,9 +711,9 @@ func TestParseIntoRuleMultipleThresholds(t *testing.T) { vector, err = threshold.ShouldAlert(v3.Series{ Points: []v3.Point{{Value: 75.0, Timestamp: 1000}}, // 75% CPU usage Labels: map[string]string{"service": "test"}, - }) + }, "") if err != nil { - t.Fatalf("Unexpected error in ShouldAlert: %v", err) + t.Fatalf("Unexpected error in shouldAlert: %v", err) } assert.Equal(t, 1, len(vector)) diff --git a/pkg/types/ruletypes/constants.go b/pkg/types/ruletypes/constants.go index 1851ef919a11..43f97055a931 100644 --- a/pkg/types/ruletypes/constants.go +++ b/pkg/types/ruletypes/constants.go @@ -2,3 +2,4 @@ package ruletypes const CriticalThresholdName = "CRITICAL" const LabelThresholdName = "threshold.name" +const LabelRuleId = "ruleId" diff --git a/pkg/types/ruletypes/result_types.go b/pkg/types/ruletypes/result_types.go index 0eda5c8aaaf2..2460322a6dcd 100644 --- a/pkg/types/ruletypes/result_types.go +++ b/pkg/types/ruletypes/result_types.go @@ -18,6 +18,10 @@ type Sample struct { Metric labels.Labels IsMissing bool + + Target float64 + + TargetUnit string } func (s Sample) String() string { diff --git a/pkg/types/ruletypes/threshold.go b/pkg/types/ruletypes/threshold.go index fba9765d5793..87f771b2fe4a 
100644 --- a/pkg/types/ruletypes/threshold.go +++ b/pkg/types/ruletypes/threshold.go @@ -51,23 +51,41 @@ func (r *RuleThresholdData) UnmarshalJSON(data []byte) error { return nil } +type RuleReceivers struct { + Channels []string `json:"channels"` + Name string `json:"name"` +} + type RuleThreshold interface { - ShouldAlert(series v3.Series) (Vector, error) + ShouldAlert(series v3.Series, unit string) (Vector, error) + GetRuleReceivers() []RuleReceivers } type BasicRuleThreshold struct { Name string `json:"name"` TargetValue *float64 `json:"target"` TargetUnit string `json:"targetUnit"` - RuleUnit string `json:"ruleUnit"` RecoveryTarget *float64 `json:"recoveryTarget"` MatchType MatchType `json:"matchType"` CompareOp CompareOp `json:"op"` - SelectedQuery string `json:"selectedQuery"` + Channels []string `json:"channels"` } type BasicRuleThresholds []BasicRuleThreshold +func (r BasicRuleThresholds) GetRuleReceivers() []RuleReceivers { + thresholds := []BasicRuleThreshold(r) + var receiverRoutes []RuleReceivers + sortThresholds(thresholds) + for _, threshold := range thresholds { + receiverRoutes = append(receiverRoutes, RuleReceivers{ + Name: threshold.Name, + Channels: threshold.Channels, + }) + } + return receiverRoutes +} + func (r BasicRuleThresholds) Validate() error { var errs []error for _, basicThreshold := range r { @@ -78,13 +96,27 @@ func (r BasicRuleThresholds) Validate() error { return errors.Join(errs...) } -func (r BasicRuleThresholds) ShouldAlert(series v3.Series) (Vector, error) { +func (r BasicRuleThresholds) ShouldAlert(series v3.Series, unit string) (Vector, error) { var resultVector Vector thresholds := []BasicRuleThreshold(r) + sortThresholds(thresholds) + for _, threshold := range thresholds { + smpl, shouldAlert := threshold.shouldAlert(series, unit) + if shouldAlert { + smpl.Target = threshold.target(unit) + smpl.TargetUnit = threshold.TargetUnit + resultVector = append(resultVector, smpl) + } + } + return resultVector, nil +} + +func sortThresholds(thresholds []BasicRuleThreshold) { sort.Slice(thresholds, func(i, j int) bool { - compareOp := thresholds[i].GetCompareOp() - targetI := thresholds[i].Target() - targetJ := thresholds[j].Target() + + compareOp := thresholds[i].getCompareOp() + targetI := thresholds[i].target(thresholds[i].TargetUnit) //for sorting we dont need rule unit + targetJ := thresholds[j].target(thresholds[j].TargetUnit) switch compareOp { case ValueIsAbove, ValueAboveOrEq, ValueOutsideBounds: @@ -98,49 +130,22 @@ func (r BasicRuleThresholds) ShouldAlert(series v3.Series) (Vector, error) { return targetI > targetJ } }) - for _, threshold := range thresholds { - smpl, shouldAlert := threshold.ShouldAlert(series) - if shouldAlert { - resultVector = append(resultVector, smpl) - } - } - return resultVector, nil } -func (b BasicRuleThreshold) GetName() string { - return b.Name -} - -func (b BasicRuleThreshold) Target() float64 { +func (b BasicRuleThreshold) target(ruleUnit string) float64 { unitConverter := converter.FromUnit(converter.Unit(b.TargetUnit)) // convert the target value to the y-axis unit value := unitConverter.Convert(converter.Value{ F: *b.TargetValue, U: converter.Unit(b.TargetUnit), - }, converter.Unit(b.RuleUnit)) + }, converter.Unit(ruleUnit)) return value.F } -func (b BasicRuleThreshold) GetRecoveryTarget() float64 { - if b.RecoveryTarget == nil { - return 0 - } else { - return *b.RecoveryTarget - } -} - -func (b BasicRuleThreshold) GetMatchType() MatchType { - return b.MatchType -} - -func (b BasicRuleThreshold) GetCompareOp() 
CompareOp { +func (b BasicRuleThreshold) getCompareOp() CompareOp { return b.CompareOp } -func (b BasicRuleThreshold) GetSelectedQuery() string { - return b.SelectedQuery -} - func (b BasicRuleThreshold) Validate() error { var errs []error if b.Name == "" { @@ -182,7 +187,7 @@ func removeGroupinSetPoints(series v3.Series) []v3.Point { return result } -func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { +func (b BasicRuleThreshold) shouldAlert(series v3.Series, ruleUnit string) (Sample, bool) { var shouldAlert bool var alertSmpl Sample var lbls labels.Labels @@ -191,6 +196,8 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { lbls = append(lbls, labels.Label{Name: name, Value: value}) } + target := b.target(ruleUnit) + lbls = append(lbls, labels.Label{Name: LabelThresholdName, Value: b.Name}) series.Points = removeGroupinSetPoints(series) @@ -205,7 +212,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { // If any sample matches the condition, the rule is firing. if b.CompareOp == ValueIsAbove { for _, smpl := range series.Points { - if smpl.Value > b.Target() { + if smpl.Value > target { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lbls} shouldAlert = true break @@ -213,7 +220,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueIsBelow { for _, smpl := range series.Points { - if smpl.Value < b.Target() { + if smpl.Value < target { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lbls} shouldAlert = true break @@ -221,7 +228,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueIsEq { for _, smpl := range series.Points { - if smpl.Value == b.Target() { + if smpl.Value == target { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lbls} shouldAlert = true break @@ -229,7 +236,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueIsNotEq { for _, smpl := range series.Points { - if smpl.Value != b.Target() { + if smpl.Value != target { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lbls} shouldAlert = true break @@ -237,7 +244,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueOutsideBounds { for _, smpl := range series.Points { - if math.Abs(smpl.Value) >= b.Target() { + if math.Abs(smpl.Value) >= target { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lbls} shouldAlert = true break @@ -247,10 +254,10 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { case AllTheTimes: // If all samples match the condition, the rule is firing. 
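// --- editor's sketch (not part of the patch): the unit conversion performed by target(ruleUnit)
// above, now that the rule's unit is passed in at evaluation time instead of being stored on the
// threshold. A 200 ms target compared against a query reporting seconds becomes 0.2 before any
// comparison. Unit identifiers are illustrative; assumed import:
// "github.com/SigNoz/signoz/pkg/query-service/converter".
//
// v := converter.FromUnit(converter.Unit("ms")).Convert(
// 	converter.Value{F: 200, U: converter.Unit("ms")},
// 	converter.Unit("s"),
// )
// _ = v // v.F == 0.2, the value each point is compared against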
shouldAlert = true - alertSmpl = Sample{Point: Point{V: b.Target()}, Metric: lbls} + alertSmpl = Sample{Point: Point{V: target}, Metric: lbls} if b.CompareOp == ValueIsAbove { for _, smpl := range series.Points { - if smpl.Value <= b.Target() { + if smpl.Value <= target { shouldAlert = false break } @@ -267,7 +274,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueIsBelow { for _, smpl := range series.Points { - if smpl.Value >= b.Target() { + if smpl.Value >= target { shouldAlert = false break } @@ -283,14 +290,14 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueIsEq { for _, smpl := range series.Points { - if smpl.Value != b.Target() { + if smpl.Value != target { shouldAlert = false break } } } else if b.CompareOp == ValueIsNotEq { for _, smpl := range series.Points { - if smpl.Value == b.Target() { + if smpl.Value == target { shouldAlert = false break } @@ -306,7 +313,7 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } } else if b.CompareOp == ValueOutsideBounds { for _, smpl := range series.Points { - if math.Abs(smpl.Value) < b.Target() { + if math.Abs(smpl.Value) < target { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lbls} shouldAlert = false break @@ -326,23 +333,23 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { avg := sum / count alertSmpl = Sample{Point: Point{V: avg}, Metric: lbls} if b.CompareOp == ValueIsAbove { - if avg > b.Target() { + if avg > target { shouldAlert = true } } else if b.CompareOp == ValueIsBelow { - if avg < b.Target() { + if avg < target { shouldAlert = true } } else if b.CompareOp == ValueIsEq { - if avg == b.Target() { + if avg == target { shouldAlert = true } } else if b.CompareOp == ValueIsNotEq { - if avg != b.Target() { + if avg != target { shouldAlert = true } } else if b.CompareOp == ValueOutsideBounds { - if math.Abs(avg) >= b.Target() { + if math.Abs(avg) >= target { shouldAlert = true } } @@ -358,23 +365,23 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { } alertSmpl = Sample{Point: Point{V: sum}, Metric: lbls} if b.CompareOp == ValueIsAbove { - if sum > b.Target() { + if sum > target { shouldAlert = true } } else if b.CompareOp == ValueIsBelow { - if sum < b.Target() { + if sum < target { shouldAlert = true } } else if b.CompareOp == ValueIsEq { - if sum == b.Target() { + if sum == target { shouldAlert = true } } else if b.CompareOp == ValueIsNotEq { - if sum != b.Target() { + if sum != target { shouldAlert = true } } else if b.CompareOp == ValueOutsideBounds { - if math.Abs(sum) >= b.Target() { + if math.Abs(sum) >= target { shouldAlert = true } } @@ -383,19 +390,19 @@ func (b BasicRuleThreshold) ShouldAlert(series v3.Series) (Sample, bool) { shouldAlert = false alertSmpl = Sample{Point: Point{V: series.Points[len(series.Points)-1].Value}, Metric: lbls} if b.CompareOp == ValueIsAbove { - if series.Points[len(series.Points)-1].Value > b.Target() { + if series.Points[len(series.Points)-1].Value > target { shouldAlert = true } } else if b.CompareOp == ValueIsBelow { - if series.Points[len(series.Points)-1].Value < b.Target() { + if series.Points[len(series.Points)-1].Value < target { shouldAlert = true } } else if b.CompareOp == ValueIsEq { - if series.Points[len(series.Points)-1].Value == b.Target() { + if series.Points[len(series.Points)-1].Value == target { shouldAlert = true } } else if b.CompareOp == ValueIsNotEq { - if 
series.Points[len(series.Points)-1].Value != b.Target() { + if series.Points[len(series.Points)-1].Value != target { shouldAlert = true } }
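// --- editor's sketch (not part of the patch): consuming the enriched result vector. Each fired
// Sample now carries the matched threshold's name (as the "threshold.name" label) plus its
// Target/TargetUnit, which Eval uses for the alert template and for channel selection. Values
// are illustrative and the query unit is left empty, as in the unit tests above; assumed
// imports: "fmt", "github.com/SigNoz/signoz/pkg/types/ruletypes",
// v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3".
//
// target := 90.0
// spec := ruletypes.BasicRuleThresholds{{
// 	Name: "critical", TargetValue: &target,
// 	MatchType: ruletypes.AtleastOnce, CompareOp: ruletypes.ValueIsAbove,
// 	Channels: []string{"critical-alerts"},
// }}
// series := v3.Series{
// 	Labels: map[string]string{"service": "checkout"},
// 	Points: []v3.Point{{Value: 95, Timestamp: 1000}},
// }
// vector, err := spec.ShouldAlert(series, "")
// if err == nil {
// 	for _, smpl := range vector {
// 		name := smpl.Metric.Map()[ruletypes.LabelThresholdName] // "critical"
// 		fmt.Printf("fired %q: value=%v target=%v %s\n", name, smpl.V, smpl.Target, smpl.TargetUnit)
// 	}
// }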