2024-09-24 10:22:52 +05:30
package rules
import (
"context"
"encoding/json"
"fmt"
2025-07-30 19:25:27 +05:30
"log/slog"
2024-09-24 10:22:52 +05:30
"math"
"strings"
"sync"
"time"
2025-03-20 21:01:41 +05:30
"github.com/SigNoz/signoz/ee/query-service/anomaly"
2025-05-03 18:30:07 +05:30
"github.com/SigNoz/signoz/pkg/cache"
2025-03-20 21:01:41 +05:30
"github.com/SigNoz/signoz/pkg/query-service/common"
"github.com/SigNoz/signoz/pkg/query-service/model"
2025-07-30 19:25:27 +05:30
"github.com/SigNoz/signoz/pkg/transition"
2025-04-18 00:04:25 +05:30
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
2025-05-03 18:30:07 +05:30
"github.com/SigNoz/signoz/pkg/valuer"
2025-03-20 21:01:41 +05:30
querierV2 "github.com/SigNoz/signoz/pkg/query-service/app/querier/v2"
"github.com/SigNoz/signoz/pkg/query-service/app/queryBuilder"
"github.com/SigNoz/signoz/pkg/query-service/interfaces"
v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3"
"github.com/SigNoz/signoz/pkg/query-service/utils/labels"
"github.com/SigNoz/signoz/pkg/query-service/utils/times"
"github.com/SigNoz/signoz/pkg/query-service/utils/timestamp"
"github.com/SigNoz/signoz/pkg/query-service/formatter"
baserules "github.com/SigNoz/signoz/pkg/query-service/rules"
2024-09-24 10:22:52 +05:30
2025-07-30 19:25:27 +05:30
querierV5 "github.com/SigNoz/signoz/pkg/querier"
anomalyV2 "github.com/SigNoz/signoz/ee/anomaly"
qbtypes "github.com/SigNoz/signoz/pkg/types/querybuildertypes/querybuildertypesv5"
2024-09-24 10:22:52 +05:30
)
// RuleTypeAnomaly is the rule type identifier returned by AnomalyRule.Type.
const RuleTypeAnomaly = "anomaly_rule"
type AnomalyRule struct {
* baserules . BaseRule
mtx sync . Mutex
reader interfaces . Reader
// querierV2 is used for alerts created after the introduction of new metrics query builder
querierV2 interfaces . Querier
2025-07-30 19:25:27 +05:30
// querierV5 is used for alerts migrated after the introduction of new query builder
querierV5 querierV5 . Querier
provider anomaly . Provider
providerV2 anomalyV2 . Provider
version string
logger * slog . Logger
2024-09-24 10:22:52 +05:30
seasonality anomaly . Seasonality
}
func NewAnomalyRule (
id string ,
2025-05-03 18:30:07 +05:30
orgID valuer . UUID ,
2025-04-18 00:04:25 +05:30
p * ruletypes . PostableRule ,
2024-09-24 10:22:52 +05:30
reader interfaces . Reader ,
2025-07-30 19:25:27 +05:30
querierV5 querierV5 . Querier ,
logger * slog . Logger ,
2024-09-24 10:22:52 +05:30
cache cache . Cache ,
opts ... baserules . RuleOption ,
) ( * AnomalyRule , error ) {
2025-07-30 19:25:27 +05:30
logger . Info ( "creating new AnomalyRule" , "rule_id" , id )
opts = append ( opts , baserules . WithLogger ( logger ) )
2024-09-24 10:22:52 +05:30
2025-04-18 00:04:25 +05:30
if p . RuleCondition . CompareOp == ruletypes . ValueIsBelow {
2024-11-16 20:17:34 +05:30
target := - 1 * * p . RuleCondition . Target
p . RuleCondition . Target = & target
}
2025-05-03 18:30:07 +05:30
baseRule , err := baserules . NewBaseRule ( id , orgID , p , reader , opts ... )
2024-09-24 10:22:52 +05:30
if err != nil {
return nil , err
}
t := AnomalyRule {
BaseRule : baseRule ,
}
switch strings . ToLower ( p . RuleCondition . Seasonality ) {
case "hourly" :
t . seasonality = anomaly . SeasonalityHourly
case "daily" :
t . seasonality = anomaly . SeasonalityDaily
case "weekly" :
t . seasonality = anomaly . SeasonalityWeekly
default :
t . seasonality = anomaly . SeasonalityDaily
}
2025-07-30 19:25:27 +05:30
logger . Info ( "using seasonality" , "seasonality" , t . seasonality . String ( ) )
2024-09-24 10:22:52 +05:30
querierOptsV2 := querierV2 . QuerierOptions {
2025-04-05 18:22:26 +05:30
Reader : reader ,
Cache : cache ,
KeyGenerator : queryBuilder . NewKeyGenerator ( ) ,
2024-09-24 10:22:52 +05:30
}
t . querierV2 = querierV2 . NewQuerier ( querierOptsV2 )
t . reader = reader
if t . seasonality == anomaly . SeasonalityHourly {
t . provider = anomaly . NewHourlyProvider (
anomaly . WithCache [ * anomaly . HourlyProvider ] ( cache ) ,
anomaly . WithKeyGenerator [ * anomaly . HourlyProvider ] ( queryBuilder . NewKeyGenerator ( ) ) ,
anomaly . WithReader [ * anomaly . HourlyProvider ] ( reader ) ,
)
} else if t . seasonality == anomaly . SeasonalityDaily {
t . provider = anomaly . NewDailyProvider (
anomaly . WithCache [ * anomaly . DailyProvider ] ( cache ) ,
anomaly . WithKeyGenerator [ * anomaly . DailyProvider ] ( queryBuilder . NewKeyGenerator ( ) ) ,
anomaly . WithReader [ * anomaly . DailyProvider ] ( reader ) ,
)
} else if t . seasonality == anomaly . SeasonalityWeekly {
t . provider = anomaly . NewWeeklyProvider (
anomaly . WithCache [ * anomaly . WeeklyProvider ] ( cache ) ,
anomaly . WithKeyGenerator [ * anomaly . WeeklyProvider ] ( queryBuilder . NewKeyGenerator ( ) ) ,
anomaly . WithReader [ * anomaly . WeeklyProvider ] ( reader ) ,
)
}
2025-07-30 19:25:27 +05:30
if t . seasonality == anomaly . SeasonalityHourly {
t . providerV2 = anomalyV2 . NewHourlyProvider (
anomalyV2 . WithQuerier [ * anomalyV2 . HourlyProvider ] ( querierV5 ) ,
anomalyV2 . WithLogger [ * anomalyV2 . HourlyProvider ] ( logger ) ,
)
} else if t . seasonality == anomaly . SeasonalityDaily {
t . providerV2 = anomalyV2 . NewDailyProvider (
anomalyV2 . WithQuerier [ * anomalyV2 . DailyProvider ] ( querierV5 ) ,
anomalyV2 . WithLogger [ * anomalyV2 . DailyProvider ] ( logger ) ,
)
} else if t . seasonality == anomaly . SeasonalityWeekly {
t . providerV2 = anomalyV2 . NewWeeklyProvider (
anomalyV2 . WithQuerier [ * anomalyV2 . WeeklyProvider ] ( querierV5 ) ,
anomalyV2 . WithLogger [ * anomalyV2 . WeeklyProvider ] ( logger ) ,
)
}
t . querierV5 = querierV5
t . version = p . Version
t . logger = logger
2024-09-24 10:22:52 +05:30
return & t , nil
}
2025-04-18 00:04:25 +05:30
func ( r * AnomalyRule ) Type ( ) ruletypes . RuleType {
2024-09-24 10:22:52 +05:30
return RuleTypeAnomaly
}
2025-07-30 19:25:27 +05:30
func ( r * AnomalyRule ) prepareQueryRange ( ctx context . Context , ts time . Time ) ( * v3 . QueryRangeParamsV3 , error ) {
2024-09-24 10:22:52 +05:30
2025-07-30 19:25:27 +05:30
r . logger . InfoContext (
ctx , "prepare query range request v4" , "ts" , ts . UnixMilli ( ) , "eval_window" , r . EvalWindow ( ) . Milliseconds ( ) , "eval_delay" , r . EvalDelay ( ) . Milliseconds ( ) ,
)
2024-09-24 10:22:52 +05:30
2025-09-15 15:00:12 +05:30
st , en := r . Timestamps ( ts )
start := st . UnixMilli ( )
end := en . UnixMilli ( )
2024-09-24 10:22:52 +05:30
compositeQuery := r . Condition ( ) . CompositeQuery
if compositeQuery . PanelType != v3 . PanelTypeGraph {
compositeQuery . PanelType = v3 . PanelTypeGraph
}
// default mode
return & v3 . QueryRangeParamsV3 {
Start : start ,
End : end ,
Step : int64 ( math . Max ( float64 ( common . MinAllowedStepInterval ( start , end ) ) , 60 ) ) ,
CompositeQuery : compositeQuery ,
Variables : make ( map [ string ] interface { } , 0 ) ,
NoCache : false ,
} , nil
}
2025-07-30 19:25:27 +05:30
func ( r * AnomalyRule ) prepareQueryRangeV5 ( ctx context . Context , ts time . Time ) ( * qbtypes . QueryRangeRequest , error ) {
r . logger . InfoContext ( ctx , "prepare query range request v5" , "ts" , ts . UnixMilli ( ) , "eval_window" , r . EvalWindow ( ) . Milliseconds ( ) , "eval_delay" , r . EvalDelay ( ) . Milliseconds ( ) )
startTs , endTs := r . Timestamps ( ts )
start , end := startTs . UnixMilli ( ) , endTs . UnixMilli ( )
req := & qbtypes . QueryRangeRequest {
Start : uint64 ( start ) ,
End : uint64 ( end ) ,
RequestType : qbtypes . RequestTypeTimeSeries ,
CompositeQuery : qbtypes . CompositeQuery {
Queries : make ( [ ] qbtypes . QueryEnvelope , 0 ) ,
} ,
NoCache : true ,
}
2025-08-04 21:02:54 +05:30
req . CompositeQuery . Queries = make ( [ ] qbtypes . QueryEnvelope , len ( r . Condition ( ) . CompositeQuery . Queries ) )
copy ( req . CompositeQuery . Queries , r . Condition ( ) . CompositeQuery . Queries )
2025-07-30 19:25:27 +05:30
return req , nil
}
2024-09-24 10:22:52 +05:30
// GetSelectedQuery returns the selected query name of the rule condition,
// as reported by the condition's GetSelectedQueryName.
func (r *AnomalyRule) GetSelectedQuery() string {
	cond := r.Condition()
	return cond.GetSelectedQueryName()
}
2025-05-03 18:30:07 +05:30
func ( r * AnomalyRule ) buildAndRunQuery ( ctx context . Context , orgID valuer . UUID , ts time . Time ) ( ruletypes . Vector , error ) {
2024-09-24 10:22:52 +05:30
2025-07-30 19:25:27 +05:30
params , err := r . prepareQueryRange ( ctx , ts )
2024-09-24 10:22:52 +05:30
if err != nil {
return nil , err
}
2025-05-03 18:30:07 +05:30
err = r . PopulateTemporality ( ctx , orgID , params )
2024-09-24 10:22:52 +05:30
if err != nil {
return nil , fmt . Errorf ( "internal error while setting temporality" )
}
2025-05-03 18:30:07 +05:30
anomalies , err := r . provider . GetAnomalies ( ctx , orgID , & anomaly . GetAnomaliesRequest {
2024-09-24 10:22:52 +05:30
Params : params ,
Seasonality : r . seasonality ,
} )
if err != nil {
return nil , err
}
var queryResult * v3 . Result
for _ , result := range anomalies . Results {
if result . QueryName == r . GetSelectedQuery ( ) {
queryResult = result
break
}
}
2025-04-18 00:04:25 +05:30
var resultVector ruletypes . Vector
2024-09-24 10:22:52 +05:30
scoresJSON , _ := json . Marshal ( queryResult . AnomalyScores )
2025-07-30 19:25:27 +05:30
r . logger . InfoContext ( ctx , "anomaly scores" , "scores" , string ( scoresJSON ) )
for _ , series := range queryResult . AnomalyScores {
2025-09-12 13:11:54 +05:30
if r . Condition ( ) != nil && r . Condition ( ) . RequireMinPoints {
if len ( series . Points ) < r . Condition ( ) . RequiredNumPoints {
r . logger . InfoContext ( ctx , "not enough data points to evaluate series, skipping" , "ruleid" , r . ID ( ) , "numPoints" , len ( series . Points ) , "requiredPoints" , r . Condition ( ) . RequiredNumPoints )
continue
}
}
2025-10-03 19:47:15 +05:30
results , err := r . Threshold . ShouldAlert ( * series , r . Unit ( ) )
2025-09-12 13:11:54 +05:30
if err != nil {
return nil , err
2025-07-30 19:25:27 +05:30
}
2025-09-12 13:11:54 +05:30
resultVector = append ( resultVector , results ... )
2025-07-30 19:25:27 +05:30
}
return resultVector , nil
}
func ( r * AnomalyRule ) buildAndRunQueryV5 ( ctx context . Context , orgID valuer . UUID , ts time . Time ) ( ruletypes . Vector , error ) {
params , err := r . prepareQueryRangeV5 ( ctx , ts )
if err != nil {
return nil , err
}
anomalies , err := r . providerV2 . GetAnomalies ( ctx , orgID , & anomalyV2 . AnomaliesRequest {
Params : * params ,
Seasonality : anomalyV2 . Seasonality { String : valuer . NewString ( r . seasonality . String ( ) ) } ,
} )
if err != nil {
return nil , err
}
var qbResult * qbtypes . TimeSeriesData
for _ , result := range anomalies . Results {
if result . QueryName == r . GetSelectedQuery ( ) {
qbResult = result
break
}
}
if qbResult == nil {
r . logger . WarnContext ( ctx , "nil qb result" , "ts" , ts . UnixMilli ( ) )
}
queryResult := transition . ConvertV5TimeSeriesDataToV4Result ( qbResult )
var resultVector ruletypes . Vector
scoresJSON , _ := json . Marshal ( queryResult . AnomalyScores )
r . logger . InfoContext ( ctx , "anomaly scores" , "scores" , string ( scoresJSON ) )
2024-09-24 10:22:52 +05:30
for _ , series := range queryResult . AnomalyScores {
2025-09-12 13:11:54 +05:30
if r . Condition ( ) . RequireMinPoints {
if len ( series . Points ) < r . Condition ( ) . RequiredNumPoints {
r . logger . InfoContext ( ctx , "not enough data points to evaluate series, skipping" , "ruleid" , r . ID ( ) , "numPoints" , len ( series . Points ) , "requiredPoints" , r . Condition ( ) . RequiredNumPoints )
continue
}
}
2025-10-03 19:47:15 +05:30
results , err := r . Threshold . ShouldAlert ( * series , r . Unit ( ) )
2025-09-12 13:11:54 +05:30
if err != nil {
return nil , err
2024-09-24 10:22:52 +05:30
}
2025-09-12 13:11:54 +05:30
resultVector = append ( resultVector , results ... )
2024-09-24 10:22:52 +05:30
}
return resultVector , nil
}
func ( r * AnomalyRule ) Eval ( ctx context . Context , ts time . Time ) ( interface { } , error ) {
prevState := r . State ( )
valueFormatter := formatter . FromUnit ( r . Unit ( ) )
2025-07-30 19:25:27 +05:30
var res ruletypes . Vector
var err error
if r . version == "v5" {
r . logger . InfoContext ( ctx , "running v5 query" )
res , err = r . buildAndRunQueryV5 ( ctx , r . OrgID ( ) , ts )
} else {
r . logger . InfoContext ( ctx , "running v4 query" )
res , err = r . buildAndRunQuery ( ctx , r . OrgID ( ) , ts )
}
2024-09-24 10:22:52 +05:30
if err != nil {
return nil , err
}
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
resultFPs := map [ uint64 ] struct { } { }
2025-04-18 00:04:25 +05:30
var alerts = make ( map [ uint64 ] * ruletypes . Alert , len ( res ) )
2024-09-24 10:22:52 +05:30
2025-10-03 19:47:15 +05:30
ruleReceivers := r . Threshold . GetRuleReceivers ( )
ruleReceiverMap := make ( map [ string ] [ ] string )
for _ , value := range ruleReceivers {
ruleReceiverMap [ value . Name ] = value . Channels
}
2024-09-24 10:22:52 +05:30
for _ , smpl := range res {
l := make ( map [ string ] string , len ( smpl . Metric ) )
for _ , lbl := range smpl . Metric {
l [ lbl . Name ] = lbl . Value
}
value := valueFormatter . Format ( smpl . V , r . Unit ( ) )
2025-10-03 19:47:15 +05:30
threshold := valueFormatter . Format ( smpl . Target , smpl . TargetUnit )
2025-07-30 19:25:27 +05:30
r . logger . DebugContext ( ctx , "Alert template data for rule" , "rule_name" , r . Name ( ) , "formatter" , valueFormatter . Name ( ) , "value" , value , "threshold" , threshold )
2024-09-24 10:22:52 +05:30
2025-04-18 00:04:25 +05:30
tmplData := ruletypes . AlertTemplateData ( l , value , threshold )
2024-09-24 10:22:52 +05:30
// Inject some convenience variables that are easier to remember for users
// who are not used to Go's templating system.
defs := "{{$labels := .Labels}}{{$value := .Value}}{{$threshold := .Threshold}}"
// utility function to apply go template on labels and annotations
expand := func ( text string ) string {
2025-04-18 00:04:25 +05:30
tmpl := ruletypes . NewTemplateExpander (
2024-09-24 10:22:52 +05:30
ctx ,
defs + text ,
"__alert_" + r . Name ( ) ,
tmplData ,
times . Time ( timestamp . FromTime ( ts ) ) ,
nil ,
)
result , err := tmpl . Expand ( )
if err != nil {
result = fmt . Sprintf ( "<error expanding template: %s>" , err )
2025-07-30 19:25:27 +05:30
r . logger . ErrorContext ( ctx , "Expanding alert template failed" , "error" , err , "data" , tmplData , "rule_name" , r . Name ( ) )
2024-09-24 10:22:52 +05:30
}
return result
}
lb := labels . NewBuilder ( smpl . Metric ) . Del ( labels . MetricNameLabel ) . Del ( labels . TemporalityLabel )
2024-10-03 16:56:58 +05:30
resultLabels := labels . NewBuilder ( smpl . Metric ) . Del ( labels . MetricNameLabel ) . Del ( labels . TemporalityLabel ) . Labels ( )
2024-09-24 10:22:52 +05:30
for name , value := range r . Labels ( ) . Map ( ) {
lb . Set ( name , expand ( value ) )
}
lb . Set ( labels . AlertNameLabel , r . Name ( ) )
lb . Set ( labels . AlertRuleIdLabel , r . ID ( ) )
lb . Set ( labels . RuleSourceLabel , r . GeneratorURL ( ) )
annotations := make ( labels . Labels , 0 , len ( r . Annotations ( ) . Map ( ) ) )
for name , value := range r . Annotations ( ) . Map ( ) {
2024-10-03 16:56:58 +05:30
annotations = append ( annotations , labels . Label { Name : name , Value : expand ( value ) } )
2024-09-24 10:22:52 +05:30
}
if smpl . IsMissing {
lb . Set ( labels . AlertNameLabel , "[No data] " + r . Name ( ) )
2025-09-26 18:54:58 +05:30
lb . Set ( labels . NoDataLabel , "true" )
2024-09-24 10:22:52 +05:30
}
lbs := lb . Labels ( )
h := lbs . Hash ( )
resultFPs [ h ] = struct { } { }
if _ , ok := alerts [ h ] ; ok {
2025-07-30 19:25:27 +05:30
r . logger . ErrorContext ( ctx , "the alert query returns duplicate records" , "rule_id" , r . ID ( ) , "alert" , alerts [ h ] )
2024-09-24 10:22:52 +05:30
err = fmt . Errorf ( "duplicate alert found, vector contains metrics with the same labelset after applying alert labels" )
return nil , err
}
2025-04-18 00:04:25 +05:30
alerts [ h ] = & ruletypes . Alert {
2024-09-24 10:22:52 +05:30
Labels : lbs ,
QueryResultLables : resultLabels ,
Annotations : annotations ,
ActiveAt : ts ,
State : model . StatePending ,
Value : smpl . V ,
GeneratorURL : r . GeneratorURL ( ) ,
2025-10-03 19:47:15 +05:30
Receivers : ruleReceiverMap [ lbs . Map ( ) [ ruletypes . LabelThresholdName ] ] ,
2024-09-24 10:22:52 +05:30
Missing : smpl . IsMissing ,
}
}
2025-07-30 19:25:27 +05:30
r . logger . InfoContext ( ctx , "number of alerts found" , "rule_name" , r . Name ( ) , "alerts_count" , len ( alerts ) )
2024-09-24 10:22:52 +05:30
// alerts[h] is ready, add or update active list now
for h , a := range alerts {
// Check whether we already have alerting state for the identifying label set.
// Update the last value and annotations if so, create a new alert entry otherwise.
if alert , ok := r . Active [ h ] ; ok && alert . State != model . StateInactive {
alert . Value = a . Value
alert . Annotations = a . Annotations
2025-10-03 19:47:15 +05:30
if v , ok := alert . Labels . Map ( ) [ ruletypes . LabelThresholdName ] ; ok {
alert . Receivers = ruleReceiverMap [ v ]
}
2024-09-24 10:22:52 +05:30
continue
}
r . Active [ h ] = a
}
itemsToAdd := [ ] model . RuleStateHistory { }
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
for fp , a := range r . Active {
labelsJSON , err := json . Marshal ( a . QueryResultLables )
if err != nil {
2025-07-30 19:25:27 +05:30
r . logger . ErrorContext ( ctx , "error marshaling labels" , "error" , err , "labels" , a . Labels )
2024-09-24 10:22:52 +05:30
}
if _ , ok := resultFPs [ fp ] ; ! ok {
// If the alert was previously firing, keep it around for a given
// retention time so it is reported as resolved to the AlertManager.
2025-04-18 00:04:25 +05:30
if a . State == model . StatePending || ( ! a . ResolvedAt . IsZero ( ) && ts . Sub ( a . ResolvedAt ) > ruletypes . ResolvedRetention ) {
2024-09-24 10:22:52 +05:30
delete ( r . Active , fp )
}
if a . State != model . StateInactive {
a . State = model . StateInactive
a . ResolvedAt = ts
itemsToAdd = append ( itemsToAdd , model . RuleStateHistory {
RuleID : r . ID ( ) ,
RuleName : r . Name ( ) ,
State : model . StateInactive ,
StateChanged : true ,
UnixMilli : ts . UnixMilli ( ) ,
Labels : model . LabelsString ( labelsJSON ) ,
Fingerprint : a . QueryResultLables . Hash ( ) ,
Value : a . Value ,
} )
}
continue
}
if a . State == model . StatePending && ts . Sub ( a . ActiveAt ) >= r . HoldDuration ( ) {
a . State = model . StateFiring
a . FiredAt = ts
state := model . StateFiring
if a . Missing {
state = model . StateNoData
}
itemsToAdd = append ( itemsToAdd , model . RuleStateHistory {
RuleID : r . ID ( ) ,
RuleName : r . Name ( ) ,
State : state ,
StateChanged : true ,
UnixMilli : ts . UnixMilli ( ) ,
Labels : model . LabelsString ( labelsJSON ) ,
Fingerprint : a . QueryResultLables . Hash ( ) ,
Value : a . Value ,
} )
}
}
currentState := r . State ( )
overallStateChanged := currentState != prevState
for idx , item := range itemsToAdd {
item . OverallStateChanged = overallStateChanged
item . OverallState = currentState
itemsToAdd [ idx ] = item
}
r . RecordRuleStateHistory ( ctx , prevState , currentState , itemsToAdd )
return len ( r . Active ) , nil
}
func ( r * AnomalyRule ) String ( ) string {
2025-04-18 00:04:25 +05:30
ar := ruletypes . PostableRule {
2024-09-24 10:22:52 +05:30
AlertName : r . Name ( ) ,
RuleCondition : r . Condition ( ) ,
2025-04-18 00:04:25 +05:30
EvalWindow : ruletypes . Duration ( r . EvalWindow ( ) ) ,
2024-09-24 10:22:52 +05:30
Labels : r . Labels ( ) . Map ( ) ,
Annotations : r . Annotations ( ) . Map ( ) ,
PreferredChannels : r . PreferredChannels ( ) ,
}
2025-09-12 13:11:54 +05:30
byt , err := json . Marshal ( ar )
2024-09-24 10:22:52 +05:30
if err != nil {
return fmt . Sprintf ( "error marshaling alerting rule: %s" , err . Error ( ) )
}
return string ( byt )
}