317 lines
10 KiB
Go
Raw Normal View History

package alertmanagerserver
import (
"bytes"
"context"
"io"
"log/slog"
"net"
"net/http"
"net/url"
"testing"
"time"
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest"
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
"github.com/SigNoz/signoz/pkg/types/alertmanagertypes/alertmanagertypestest"
"github.com/go-openapi/strfmt"
"github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/client_golang/prometheus"
commoncfg "github.com/prometheus/common/config"
chore: notification routing | added notificaiton routing via expression based routes (#9195) * chore: added custom distpatcher * feat(notification-grouping): added notification grouping * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): linting and test cases * feat(notification-grouping): linting and test cases * feat(notification-grouping): linting and test cases * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): debug log lines * feat(notification-grouping): debug log lines * feat(notification-grouping): debug log lines * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): added structure changes * feat(notification-grouping): added structure changes * feat(notification-routing): added notification routing * chore(notification-grouping): added notificaiton grouping * Update pkg/alertmanager/nfmanager/rulebasednotification/provider.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-grouping): added renotification interval * fix(notification-grouping): added fix for renotification * chore(notificaiton-grouping): added no data renotify * chore(notificaiton-grouping): added no data renotify * chore(notificaiton-grouping): added no data renotify * chore(notification-grouping): added no data renotify interval * chore(notification-grouping): removed errors package from dispatcher * chore(notification-grouping): removed errors package from dispatcher * chore(notification-grouping): removed unwanted tests * chore(notification-grouping): removed unwanted pkg name * chore(notification-grouping): added delete notification setting * chore(notification-grouping): added delete notification setting * Update pkg/alertmanager/nfmanager/nfmanagertest/provider.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-grouping): removed nfmanager config| notification settings in postable rule * chore(notification-grouping): removed nfmanager config| notification settings in postable rule * chore(notification-grouping): added test for dispatcher * chore(notification-grouping): added test for dispatcher * chore(notification-grouping): go linting errors * chore(notification-grouping): added test cases for aggGroupPerRoute * chore(notification-grouping): added test cases for aggGroupPerRoute * chore(notification-grouping): corrected get notification config logic * Update pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-routing): added notification routing policies * feat(notification-routing): added test cases for dispatcher * chore(notification-routing): added notification routing policies * chore(notification-routing): added notification routing policies * Apply suggestions from code review Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-routing): added notification routing policies * chore(notification-routing): added notification routing policies * Update pkg/alertmanager/alertmanagerserver/distpatcher_test.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-routing): sorted imports * chore(notification-routing): minor edit |pr resolve comments * chore(notification-grouping): corrected dispatcher test cases * chore(notification-routing): added notification routing policies * chore(notification-routing): corrected race condition in test * chore: resolved pr comments * chore: passing threshold value to tempalte * chore: completed delete rule functionality * chore: added grouping disabled functionality * chore: added grouping disabled functionality * chore(notification-routing): resolved pr comments * chore(notification-routing): resolved pr comments * chore(notification-routing): resolved pr comments * chore(notification-routing): sorted imports * chore(notification-routing): fix linting errors * chore(notification-routing): removed enabled flags * fix: test rule multiple threhsold (#9224) * chore: corrected linting errors * chore: corrected linting errors * chore: corrected linting errors * chore: corrected linting errors * chore: corrected migration errors * chore: corrected migration errors * chore: corrected migration errors * chore: corrected migration errors * Update pkg/sqlmigration/049_add_route_policy.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore: added org_is as foreign key * chore: resolved pr comments * chore: removed route store unused --------- Co-authored-by: Srikanth Chekuri <srikanth.chekuri92@gmail.com> Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
2025-10-03 19:47:15 +05:30
"github.com/prometheus/common/model"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestServerSetConfigAndStop(t *testing.T) {
notificationManager := nfmanagertest.NewMock()
server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager)
require.NoError(t, err)
feat(alertmanager): integrate with ruler (#7222) ### Summary Integrate the new implementations of the alertmanager along with changes to the ruler. This change can be broadly categoried into 3 parts: #### Frontend - The earlier `/api/v1/alerts` api was double encoding the response in json and sending it to the frontend. This PR fixes the json response object. For instance, we have gone from the response `{ "status": "success", "data": "{\"status\":\"success\",\"data\":[{\"labels\":{\"alertname\":\"[platform][consumer] consumer is above 100% memory utilization\",\"bu\":\"platform\",\"...... }` to the response `{"status":"success","data":[{"labels":{"alertname":"[Metrics] Pod CP......` - `msteams` has been changed to `msteamsv2` wherever applicable #### Ruler The following changes have been done in the ruler component: - Removal of the old alertmanager and notifier - The RuleDB methods `Create`, `Edit` and `Delete` have been made transactional - Introduction of a new `testPrepareNotifyFunc` for sending test notifications - Integration with the new alertmanager #### Alertmanager Although a huge chunk of the alertmanagers have been merged in previous PRs (the list can be found at https://github.com/SigNoz/platform-pod/issues/404), this PR takes care of changes needed in order to incorporate it with the ruler - Addition of ruleId based matching - Support for marshalling the global configuration directly from the upstream alertmanager - Addition of orgId to the legacy alertmanager - Support for always adding defaults to both routes and receivers while creating them - Migration to create the required alertmanager tables - Migration for msteams to msteamsv2 has been added. We will start using msteamv2 config for the new alertmanager and keep using msteams for the old one. #### Related Issues / PR's Closes https://github.com/SigNoz/platform-pod/issues/404 Closes https://github.com/SigNoz/platform-pod/issues/176
2025-03-10 01:30:42 +05:30
amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1")
require.NoError(t, err)
assert.NoError(t, server.SetConfig(context.Background(), amConfig))
assert.NoError(t, server.Stop(context.Background()))
}
func TestServerTestReceiverTypeWebhook(t *testing.T) {
notificationManager := nfmanagertest.NewMock()
server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager)
require.NoError(t, err)
feat(alertmanager): integrate with ruler (#7222) ### Summary Integrate the new implementations of the alertmanager along with changes to the ruler. This change can be broadly categoried into 3 parts: #### Frontend - The earlier `/api/v1/alerts` api was double encoding the response in json and sending it to the frontend. This PR fixes the json response object. For instance, we have gone from the response `{ "status": "success", "data": "{\"status\":\"success\",\"data\":[{\"labels\":{\"alertname\":\"[platform][consumer] consumer is above 100% memory utilization\",\"bu\":\"platform\",\"...... }` to the response `{"status":"success","data":[{"labels":{"alertname":"[Metrics] Pod CP......` - `msteams` has been changed to `msteamsv2` wherever applicable #### Ruler The following changes have been done in the ruler component: - Removal of the old alertmanager and notifier - The RuleDB methods `Create`, `Edit` and `Delete` have been made transactional - Introduction of a new `testPrepareNotifyFunc` for sending test notifications - Integration with the new alertmanager #### Alertmanager Although a huge chunk of the alertmanagers have been merged in previous PRs (the list can be found at https://github.com/SigNoz/platform-pod/issues/404), this PR takes care of changes needed in order to incorporate it with the ruler - Addition of ruleId based matching - Support for marshalling the global configuration directly from the upstream alertmanager - Addition of orgId to the legacy alertmanager - Support for always adding defaults to both routes and receivers while creating them - Migration to create the required alertmanager tables - Migration for msteams to msteamsv2 has been added. We will start using msteamv2 config for the new alertmanager and keep using msteams for the old one. #### Related Issues / PR's Closes https://github.com/SigNoz/platform-pod/issues/404 Closes https://github.com/SigNoz/platform-pod/issues/176
2025-03-10 01:30:42 +05:30
amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1")
require.NoError(t, err)
webhookListener, err := net.Listen("tcp", "localhost:0")
require.NoError(t, err)
requestBody := new(bytes.Buffer)
webhookServer := &http.Server{
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, err := requestBody.ReadFrom(r.Body)
require.NoError(t, err)
w.WriteHeader(http.StatusOK)
}),
}
go func() {
require.NoError(t, webhookServer.Serve(webhookListener))
}()
require.NoError(t, server.SetConfig(context.Background(), amConfig))
defer require.NoError(t, server.Stop(context.Background()))
webhookURL, err := url.Parse("http://" + webhookListener.Addr().String() + "/webhook")
require.NoError(t, err)
err = server.TestReceiver(context.Background(), alertmanagertypes.Receiver{
Name: "test-receiver",
WebhookConfigs: []*config.WebhookConfig{
{
HTTPConfig: &commoncfg.HTTPClientConfig{},
URL: &config.SecretURL{URL: webhookURL},
},
},
})
assert.NoError(t, err)
assert.Contains(t, requestBody.String(), "test-receiver")
assert.Contains(t, requestBody.String(), "firing")
}
func TestServerPutAlerts(t *testing.T) {
stateStore := alertmanagertypestest.NewStateStore()
srvCfg := NewConfig()
srvCfg.Route.GroupInterval = 1 * time.Second
notificationManager := nfmanagertest.NewMock()
server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
require.NoError(t, err)
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
require.NoError(t, err)
require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{
Name: "test-receiver",
WebhookConfigs: []*config.WebhookConfig{
{
HTTPConfig: &commoncfg.HTTPClientConfig{},
URL: &config.SecretURL{URL: &url.URL{Host: "localhost", Path: "/test-receiver"}},
},
},
}))
require.NoError(t, server.SetConfig(context.Background(), amConfig))
require.NoError(t, server.PutAlerts(context.Background(), alertmanagertypes.PostableAlerts{
{
Annotations: models.LabelSet{"alertname": "test-alert"},
StartsAt: strfmt.DateTime(time.Now().Add(-time.Hour)),
EndsAt: strfmt.DateTime(time.Now().Add(time.Hour)),
Alert: models.Alert{
GeneratorURL: "http://localhost:8080/test-alert",
Labels: models.LabelSet{"alertname": "test-alert"},
},
},
}))
dummyRequest, err := http.NewRequest(http.MethodGet, "/alerts", nil)
require.NoError(t, err)
params, err := alertmanagertypes.NewGettableAlertsParams(dummyRequest)
require.NoError(t, err)
gettableAlerts, err := server.GetAlerts(context.Background(), params)
require.NoError(t, err)
assert.Equal(t, 1, len(gettableAlerts))
assert.Equal(t, gettableAlerts[0].Alert.Labels["alertname"], "test-alert")
assert.NoError(t, server.Stop(context.Background()))
}
chore: notification routing | added notificaiton routing via expression based routes (#9195) * chore: added custom distpatcher * feat(notification-grouping): added notification grouping * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): linting and test cases * feat(notification-grouping): linting and test cases * feat(notification-grouping): linting and test cases * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): debug log lines * feat(notification-grouping): debug log lines * feat(notification-grouping): debug log lines * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): addded integration test dependency * feat(notification-grouping): added structure changes * feat(notification-grouping): added structure changes * feat(notification-routing): added notification routing * chore(notification-grouping): added notificaiton grouping * Update pkg/alertmanager/nfmanager/rulebasednotification/provider.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-grouping): added renotification interval * fix(notification-grouping): added fix for renotification * chore(notificaiton-grouping): added no data renotify * chore(notificaiton-grouping): added no data renotify * chore(notificaiton-grouping): added no data renotify * chore(notification-grouping): added no data renotify interval * chore(notification-grouping): removed errors package from dispatcher * chore(notification-grouping): removed errors package from dispatcher * chore(notification-grouping): removed unwanted tests * chore(notification-grouping): removed unwanted pkg name * chore(notification-grouping): added delete notification setting * chore(notification-grouping): added delete notification setting * Update pkg/alertmanager/nfmanager/nfmanagertest/provider.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-grouping): removed nfmanager config| notification settings in postable rule * chore(notification-grouping): removed nfmanager config| notification settings in postable rule * chore(notification-grouping): added test for dispatcher * chore(notification-grouping): added test for dispatcher * chore(notification-grouping): go linting errors * chore(notification-grouping): added test cases for aggGroupPerRoute * chore(notification-grouping): added test cases for aggGroupPerRoute * chore(notification-grouping): corrected get notification config logic * Update pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-routing): added notification routing policies * feat(notification-routing): added test cases for dispatcher * chore(notification-routing): added notification routing policies * chore(notification-routing): added notification routing policies * Apply suggestions from code review Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-routing): added notification routing policies * chore(notification-routing): added notification routing policies * Update pkg/alertmanager/alertmanagerserver/distpatcher_test.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore(notification-routing): sorted imports * chore(notification-routing): minor edit |pr resolve comments * chore(notification-grouping): corrected dispatcher test cases * chore(notification-routing): added notification routing policies * chore(notification-routing): corrected race condition in test * chore: resolved pr comments * chore: passing threshold value to tempalte * chore: completed delete rule functionality * chore: added grouping disabled functionality * chore: added grouping disabled functionality * chore(notification-routing): resolved pr comments * chore(notification-routing): resolved pr comments * chore(notification-routing): resolved pr comments * chore(notification-routing): sorted imports * chore(notification-routing): fix linting errors * chore(notification-routing): removed enabled flags * fix: test rule multiple threhsold (#9224) * chore: corrected linting errors * chore: corrected linting errors * chore: corrected linting errors * chore: corrected linting errors * chore: corrected migration errors * chore: corrected migration errors * chore: corrected migration errors * chore: corrected migration errors * Update pkg/sqlmigration/049_add_route_policy.go Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> * chore: added org_is as foreign key * chore: resolved pr comments * chore: removed route store unused --------- Co-authored-by: Srikanth Chekuri <srikanth.chekuri92@gmail.com> Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
2025-10-03 19:47:15 +05:30
func TestServerTestAlert(t *testing.T) {
stateStore := alertmanagertypestest.NewStateStore()
srvCfg := NewConfig()
srvCfg.Route.GroupInterval = 1 * time.Second
notificationManager := nfmanagertest.NewMock()
server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
require.NoError(t, err)
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
require.NoError(t, err)
webhook1Listener, err := net.Listen("tcp", "localhost:0")
require.NoError(t, err)
webhook2Listener, err := net.Listen("tcp", "localhost:0")
require.NoError(t, err)
requestCount1 := 0
requestCount2 := 0
webhook1Server := &http.Server{
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestCount1++
w.WriteHeader(http.StatusOK)
}),
}
webhook2Server := &http.Server{
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestCount2++
w.WriteHeader(http.StatusOK)
}),
}
go func() {
_ = webhook1Server.Serve(webhook1Listener)
}()
go func() {
_ = webhook2Server.Serve(webhook2Listener)
}()
webhook1URL, err := url.Parse("http://" + webhook1Listener.Addr().String() + "/webhook")
require.NoError(t, err)
webhook2URL, err := url.Parse("http://" + webhook2Listener.Addr().String() + "/webhook")
require.NoError(t, err)
require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{
Name: "receiver-1",
WebhookConfigs: []*config.WebhookConfig{
{
HTTPConfig: &commoncfg.HTTPClientConfig{},
URL: &config.SecretURL{URL: webhook1URL},
},
},
}))
require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{
Name: "receiver-2",
WebhookConfigs: []*config.WebhookConfig{
{
HTTPConfig: &commoncfg.HTTPClientConfig{},
URL: &config.SecretURL{URL: webhook2URL},
},
},
}))
require.NoError(t, server.SetConfig(context.Background(), amConfig))
defer func() {
_ = server.Stop(context.Background())
_ = webhook1Server.Close()
_ = webhook2Server.Close()
}()
// Test with multiple alerts going to different receivers
alert1 := &alertmanagertypes.PostableAlert{
Annotations: models.LabelSet{"alertname": "test-alert-1"},
StartsAt: strfmt.DateTime(time.Now()),
Alert: models.Alert{
Labels: models.LabelSet{"alertname": "test-alert-1", "severity": "critical"},
},
}
alert2 := &alertmanagertypes.PostableAlert{
Annotations: models.LabelSet{"alertname": "test-alert-2"},
StartsAt: strfmt.DateTime(time.Now()),
Alert: models.Alert{
Labels: models.LabelSet{"alertname": "test-alert-2", "severity": "warning"},
},
}
receiversMap := map[*alertmanagertypes.PostableAlert][]string{
alert1: {"receiver-1", "receiver-2"},
alert2: {"receiver-2"},
}
config := &alertmanagertypes.NotificationConfig{
NotificationGroup: make(map[model.LabelName]struct{}),
GroupByAll: false,
}
err = server.TestAlert(context.Background(), receiversMap, config)
require.NoError(t, err)
time.Sleep(100 * time.Millisecond)
assert.Greater(t, requestCount1, 0, "receiver-1 should have received at least one request")
assert.Greater(t, requestCount2, 0, "receiver-2 should have received at least one request")
}
func TestServerTestAlertContinuesOnFailure(t *testing.T) {
stateStore := alertmanagertypestest.NewStateStore()
srvCfg := NewConfig()
srvCfg.Route.GroupInterval = 1 * time.Second
notificationManager := nfmanagertest.NewMock()
server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager)
require.NoError(t, err)
amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1")
require.NoError(t, err)
// Create one working webhook and one failing receiver (non-existent)
webhookListener, err := net.Listen("tcp", "localhost:0")
require.NoError(t, err)
requestCount := 0
webhookServer := &http.Server{
Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
requestCount++
w.WriteHeader(http.StatusOK)
}),
}
go func() {
_ = webhookServer.Serve(webhookListener)
}()
webhookURL, err := url.Parse("http://" + webhookListener.Addr().String() + "/webhook")
require.NoError(t, err)
require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{
Name: "working-receiver",
WebhookConfigs: []*config.WebhookConfig{
{
HTTPConfig: &commoncfg.HTTPClientConfig{},
URL: &config.SecretURL{URL: webhookURL},
},
},
}))
require.NoError(t, amConfig.CreateReceiver(alertmanagertypes.Receiver{
Name: "failing-receiver",
WebhookConfigs: []*config.WebhookConfig{
{
HTTPConfig: &commoncfg.HTTPClientConfig{},
URL: &config.SecretURL{URL: &url.URL{Scheme: "http", Host: "localhost:1", Path: "/webhook"}},
},
},
}))
require.NoError(t, server.SetConfig(context.Background(), amConfig))
defer func() {
_ = server.Stop(context.Background())
_ = webhookServer.Close()
}()
alert := &alertmanagertypes.PostableAlert{
Annotations: models.LabelSet{"alertname": "test-alert"},
StartsAt: strfmt.DateTime(time.Now()),
Alert: models.Alert{
Labels: models.LabelSet{"alertname": "test-alert"},
},
}
receiversMap := map[*alertmanagertypes.PostableAlert][]string{
alert: {"working-receiver", "failing-receiver"},
}
config := &alertmanagertypes.NotificationConfig{
NotificationGroup: make(map[model.LabelName]struct{}),
GroupByAll: false,
}
err = server.TestAlert(context.Background(), receiversMap, config)
assert.Error(t, err)
time.Sleep(100 * time.Millisecond)
assert.Greater(t, requestCount, 0, "working-receiver should have received at least one request even though failing-receiver failed")
}