2024-09-11 09:56:59 +05:30
package rules
import (
"context"
"fmt"
2025-07-30 19:25:27 +05:30
"log/slog"
2024-09-11 09:56:59 +05:30
"net/url"
"sync"
"time"
2025-09-15 15:00:12 +05:30
"github.com/SigNoz/signoz/pkg/errors"
2025-03-20 21:01:41 +05:30
"github.com/SigNoz/signoz/pkg/query-service/interfaces"
"github.com/SigNoz/signoz/pkg/query-service/model"
v3 "github.com/SigNoz/signoz/pkg/query-service/model/v3"
qslabels "github.com/SigNoz/signoz/pkg/query-service/utils/labels"
"github.com/SigNoz/signoz/pkg/sqlstore"
2025-04-18 00:04:25 +05:30
ruletypes "github.com/SigNoz/signoz/pkg/types/ruletypes"
2025-05-03 18:30:07 +05:30
"github.com/SigNoz/signoz/pkg/valuer"
2024-09-11 09:56:59 +05:30
"go.uber.org/zap"
)
// BaseRule contains common fields and methods for all rule types
type BaseRule struct {
id string
name string
2025-05-03 18:30:07 +05:30
orgID valuer . UUID
2024-09-11 09:56:59 +05:30
source string
handledRestart bool
// Type of the rule
2025-04-18 00:04:25 +05:30
typ ruletypes . AlertType
2024-09-11 09:56:59 +05:30
2025-04-18 00:04:25 +05:30
ruleCondition * ruletypes . RuleCondition
2025-09-12 13:11:54 +05:30
Threshold ruletypes . RuleThreshold
2024-09-11 09:56:59 +05:30
// evalWindow is the time window used for evaluating the rule
// i.e each time we lookback from the current time, we look at data for the last
// evalWindow duration
evalWindow time . Duration
// holdDuration is the duration for which the alert waits before firing
holdDuration time . Duration
// evalDelay is the delay in evaluation of the rule
// this is useful in cases where the data is not available immediately
evalDelay time . Duration
// holds the static set of labels and annotations for the rule
// these are the same for all alerts created for this rule
labels qslabels . BaseLabels
annotations qslabels . BaseLabels
// preferredChannels is the list of channels to send the alert to
// if the rule is triggered
preferredChannels [ ] string
mtx sync . Mutex
// the time it took to evaluate the rule (most recent evaluation)
evaluationDuration time . Duration
// the timestamp of the last evaluation
evaluationTimestamp time . Time
2025-04-18 00:04:25 +05:30
health ruletypes . RuleHealth
2024-09-11 09:56:59 +05:30
lastError error
2025-04-18 00:04:25 +05:30
Active map [ uint64 ] * ruletypes . Alert
2024-09-11 09:56:59 +05:30
// lastTimestampWithDatapoints is the timestamp of the last datapoint we observed
// for this rule
// this is used for missing data alerts
lastTimestampWithDatapoints time . Time
reader interfaces . Reader
2025-07-30 19:25:27 +05:30
logger * slog . Logger
2024-09-11 09:56:59 +05:30
// sendUnmatched sends observed metric values
// even if they dont match the rule condition. this is
// useful in testing the rule
sendUnmatched bool
// sendAlways will send alert irresepective of resendDelay
// or other params
sendAlways bool
2024-09-24 10:22:52 +05:30
// TemporalityMap is a map of metric name to temporality
// to avoid fetching temporality for the same metric multiple times
// querying the v4 table on low cardinal temporality column
// should be fast but we can still avoid the query if we have the data in memory
TemporalityMap map [ string ] map [ v3 . Temporality ] bool
2025-03-10 01:30:42 +05:30
sqlstore sqlstore . SQLStore
2025-09-15 15:00:12 +05:30
evaluation ruletypes . Evaluation
2024-09-11 09:56:59 +05:30
}
// RuleOption is a function that mutates a BaseRule. NewBaseRule applies every
// option it receives, in order, after the rule has been constructed.
type RuleOption func ( * BaseRule )
// WithSendAlways returns a RuleOption that switches on the rule's sendAlways
// flag.
func WithSendAlways() RuleOption {
	enable := func(rule *BaseRule) {
		rule.sendAlways = true
	}
	return enable
}
// WithSendUnmatched returns a RuleOption that switches on the rule's
// sendUnmatched flag.
func WithSendUnmatched() RuleOption {
	enable := func(rule *BaseRule) {
		rule.sendUnmatched = true
	}
	return enable
}
// WithEvalDelay returns a RuleOption that stores dur as the rule's evalDelay.
func WithEvalDelay(dur time.Duration) RuleOption {
	apply := func(rule *BaseRule) {
		rule.evalDelay = dur
	}
	return apply
}
2025-07-30 19:25:27 +05:30
func WithLogger ( logger * slog . Logger ) RuleOption {
2024-09-11 09:56:59 +05:30
return func ( r * BaseRule ) {
r . logger = logger
}
}
2025-03-10 01:30:42 +05:30
// WithSQLStore returns a RuleOption that stores the given SQL store on the
// rule.
func WithSQLStore(sqlstore sqlstore.SQLStore) RuleOption {
	apply := func(rule *BaseRule) {
		rule.sqlstore = sqlstore
	}
	return apply
}
2025-05-03 18:30:07 +05:30
func NewBaseRule ( id string , orgID valuer . UUID , p * ruletypes . PostableRule , reader interfaces . Reader , opts ... RuleOption ) ( * BaseRule , error ) {
2024-09-11 09:56:59 +05:30
if p . RuleCondition == nil || ! p . RuleCondition . IsValid ( ) {
return nil , fmt . Errorf ( "invalid rule condition" )
}
2025-09-12 13:11:54 +05:30
threshold , err := p . RuleCondition . Thresholds . GetRuleThreshold ( )
if err != nil {
return nil , err
}
2025-09-15 15:00:12 +05:30
evaluation , err := p . Evaluation . GetEvaluation ( )
if err != nil {
return nil , errors . NewInvalidInputf ( errors . CodeInvalidInput , "failed to get evaluation: %v" , err )
}
2024-09-11 09:56:59 +05:30
baseRule := & BaseRule {
id : id ,
2025-05-03 18:30:07 +05:30
orgID : orgID ,
2024-09-11 09:56:59 +05:30
name : p . AlertName ,
source : p . Source ,
2024-09-13 17:30:02 +05:30
typ : p . AlertType ,
2024-09-11 09:56:59 +05:30
ruleCondition : p . RuleCondition ,
evalWindow : time . Duration ( p . EvalWindow ) ,
labels : qslabels . FromMap ( p . Labels ) ,
annotations : qslabels . FromMap ( p . Annotations ) ,
preferredChannels : p . PreferredChannels ,
2025-04-18 00:04:25 +05:30
health : ruletypes . HealthUnknown ,
Active : map [ uint64 ] * ruletypes . Alert { } ,
2024-09-11 09:56:59 +05:30
reader : reader ,
2024-09-24 10:22:52 +05:30
TemporalityMap : make ( map [ string ] map [ v3 . Temporality ] bool ) ,
2025-09-12 13:11:54 +05:30
Threshold : threshold ,
2025-09-15 15:00:12 +05:30
evaluation : evaluation ,
2024-09-11 09:56:59 +05:30
}
if baseRule . evalWindow == 0 {
baseRule . evalWindow = 5 * time . Minute
}
for _ , opt := range opts {
opt ( baseRule )
}
return baseRule , nil
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) matchType ( ) ruletypes . MatchType {
2024-09-11 09:56:59 +05:30
if r . ruleCondition == nil {
2025-04-18 00:04:25 +05:30
return ruletypes . AtleastOnce
2024-09-11 09:56:59 +05:30
}
return r . ruleCondition . MatchType
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) compareOp ( ) ruletypes . CompareOp {
2024-09-11 09:56:59 +05:30
if r . ruleCondition == nil {
2025-04-18 00:04:25 +05:30
return ruletypes . ValueIsEq
2024-09-11 09:56:59 +05:30
}
return r . ruleCondition . CompareOp
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) currentAlerts ( ) [ ] * ruletypes . Alert {
2024-09-11 09:56:59 +05:30
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
2025-04-18 00:04:25 +05:30
alerts := make ( [ ] * ruletypes . Alert , 0 , len ( r . Active ) )
2024-09-24 10:22:52 +05:30
for _ , a := range r . Active {
2024-09-11 09:56:59 +05:30
anew := * a
alerts = append ( alerts , & anew )
}
return alerts
}
2024-09-24 10:22:52 +05:30
// EvalDelay returns the evaluation delay stored on the rule.
func (r *BaseRule) EvalDelay() time.Duration {
	delay := r.evalDelay
	return delay
}
// EvalWindow returns the evaluation window stored on the rule.
func (r *BaseRule) EvalWindow() time.Duration {
	window := r.evalWindow
	return window
}
// HoldDuration returns the hold duration stored on the rule.
func (r *BaseRule) HoldDuration() time.Duration {
	hold := r.holdDuration
	return hold
}
2024-09-11 09:56:59 +05:30
// hostFromSource reduces the rule's source URL to "scheme://host", with
// ":port" appended when the URL carries an explicit port. It returns the empty
// string when the source cannot be parsed as a URL.
func (r *ThresholdRule) hostFromSource() string {
	u, err := url.Parse(r.source)
	if err != nil {
		return ""
	}

	host := u.Hostname()
	if port := u.Port(); port != "" {
		host = host + ":" + port
	}
	return u.Scheme + "://" + host
}
2025-04-18 00:04:25 +05:30
// ID returns the rule's identifier.
func (r *BaseRule) ID() string {
	return r.id
}
2025-05-03 18:30:07 +05:30
// OrgID returns the organization ID stored on the rule.
func (r *BaseRule) OrgID() valuer.UUID {
	return r.orgID
}
2025-04-18 00:04:25 +05:30
// Name returns the rule's name.
func (r *BaseRule) Name() string {
	return r.name
}
// Condition returns the rule condition pointer held by the rule; the result
// is not a copy.
func (r *BaseRule) Condition() *ruletypes.RuleCondition {
	return r.ruleCondition
}
// Labels returns the static labels attached to the rule.
func (r *BaseRule) Labels() qslabels.BaseLabels {
	return r.labels
}
// Annotations returns the static annotations attached to the rule.
func (r *BaseRule) Annotations() qslabels.BaseLabels {
	return r.annotations
}
// PreferredChannels returns the rule's preferred channel list. The slice is
// returned as stored, not copied.
func (r *BaseRule) PreferredChannels() []string {
	return r.preferredChannels
}
2024-09-11 09:56:59 +05:30
func ( r * BaseRule ) GeneratorURL ( ) string {
2025-04-18 00:04:25 +05:30
return ruletypes . PrepareRuleGeneratorURL ( r . ID ( ) , r . source )
2024-09-11 09:56:59 +05:30
}
// Unit returns the unit of the rule condition's composite query, or the empty
// string when the rule has no condition or the condition has no composite
// query.
func (r *BaseRule) Unit() string {
	cond := r.ruleCondition
	if cond == nil || cond.CompositeQuery == nil {
		return ""
	}
	return cond.CompositeQuery.Unit
}
2024-09-17 15:33:17 +05:30
func ( r * BaseRule ) Timestamps ( ts time . Time ) ( time . Time , time . Time ) {
2025-09-15 15:00:12 +05:30
st , en := r . evaluation . NextWindowFor ( ts )
start := st . UnixMilli ( )
end := en . UnixMilli ( )
2024-09-17 15:33:17 +05:30
if r . evalDelay > 0 {
start = start - int64 ( r . evalDelay . Milliseconds ( ) )
end = end - int64 ( r . evalDelay . Milliseconds ( ) )
}
// round to minute otherwise we could potentially miss data
start = start - ( start % ( 60 * 1000 ) )
end = end - ( end % ( 60 * 1000 ) )
return time . UnixMilli ( start ) , time . UnixMilli ( end )
}
2024-09-11 09:56:59 +05:30
// SetLastError stores err as the rule's last error while holding r.mtx.
func (r *BaseRule) SetLastError(err error) {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	r.lastError = err
}
// LastError returns the rule's last error, read while holding r.mtx.
func (r *BaseRule) LastError() error {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	last := r.lastError
	return last
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) SetHealth ( health ruletypes . RuleHealth ) {
2024-09-11 09:56:59 +05:30
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
r . health = health
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) Health ( ) ruletypes . RuleHealth {
2024-09-11 09:56:59 +05:30
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
return r . health
}
// SetEvaluationDuration records dur as the duration of the most recent
// evaluation while holding r.mtx.
func (r *BaseRule) SetEvaluationDuration(dur time.Duration) {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	r.evaluationDuration = dur
}
// GetEvaluationDuration returns the recorded duration of the most recent
// evaluation, read while holding r.mtx.
func (r *BaseRule) GetEvaluationDuration() time.Duration {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	took := r.evaluationDuration
	return took
}
// SetEvaluationTimestamp records ts as the timestamp of the last evaluation
// while holding r.mtx.
func (r *BaseRule) SetEvaluationTimestamp(ts time.Time) {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	r.evaluationTimestamp = ts
}
// GetEvaluationTimestamp returns the recorded timestamp of the last
// evaluation, read while holding r.mtx.
func (r *BaseRule) GetEvaluationTimestamp() time.Time {
	r.mtx.Lock()
	defer r.mtx.Unlock()

	at := r.evaluationTimestamp
	return at
}
func ( r * BaseRule ) State ( ) model . AlertState {
maxState := model . StateInactive
2024-09-24 10:22:52 +05:30
for _ , a := range r . Active {
2024-09-11 09:56:59 +05:30
if a . State > maxState {
maxState = a . State
}
}
return maxState
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) ActiveAlerts ( ) [ ] * ruletypes . Alert {
var res [ ] * ruletypes . Alert
2024-09-11 09:56:59 +05:30
for _ , a := range r . currentAlerts ( ) {
if a . ResolvedAt . IsZero ( ) {
res = append ( res , a )
}
}
return res
}
func ( r * BaseRule ) SendAlerts ( ctx context . Context , ts time . Time , resendDelay time . Duration , interval time . Duration , notifyFunc NotifyFunc ) {
2025-03-10 01:30:42 +05:30
var orgID string
err := r .
sqlstore .
BunDB ( ) .
NewSelect ( ) .
Table ( "organizations" ) .
ColumnExpr ( "id" ) .
Limit ( 1 ) .
Scan ( ctx , & orgID )
if err != nil {
2025-07-30 19:25:27 +05:30
r . logger . ErrorContext ( ctx , "failed to get org ids" , "error" , err )
2025-03-10 01:30:42 +05:30
return
}
2025-04-18 00:04:25 +05:30
alerts := [ ] * ruletypes . Alert { }
r . ForEachActiveAlert ( func ( alert * ruletypes . Alert ) {
if alert . NeedsSending ( ts , resendDelay ) {
2024-09-11 09:56:59 +05:30
alert . LastSentAt = ts
delta := resendDelay
if interval > resendDelay {
delta = interval
}
alert . ValidUntil = ts . Add ( 4 * delta )
anew := * alert
alerts = append ( alerts , & anew )
}
} )
2025-03-10 01:30:42 +05:30
notifyFunc ( ctx , orgID , "" , alerts ... )
2024-09-11 09:56:59 +05:30
}
2025-04-18 00:04:25 +05:30
func ( r * BaseRule ) ForEachActiveAlert ( f func ( * ruletypes . Alert ) ) {
2024-09-11 09:56:59 +05:30
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
2024-09-24 10:22:52 +05:30
for _ , a := range r . Active {
2024-09-11 09:56:59 +05:30
f ( a )
}
}
2024-09-13 18:10:49 +05:30
func ( r * BaseRule ) RecordRuleStateHistory ( ctx context . Context , prevState , currentState model . AlertState , itemsToAdd [ ] model . RuleStateHistory ) error {
2024-09-11 09:56:59 +05:30
zap . L ( ) . Debug ( "recording rule state history" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "prevState" , prevState ) , zap . Any ( "currentState" , currentState ) , zap . Any ( "itemsToAdd" , itemsToAdd ) )
2024-09-13 18:10:49 +05:30
revisedItemsToAdd := map [ uint64 ] model . RuleStateHistory { }
2024-09-11 09:56:59 +05:30
lastSavedState , err := r . reader . GetLastSavedRuleStateHistory ( ctx , r . ID ( ) )
if err != nil {
return err
}
// if the query-service has been restarted, or the rule has been modified (which re-initializes the rule),
// the state would reset so we need to add the corresponding state changes to previously saved states
if ! r . handledRestart && len ( lastSavedState ) > 0 {
zap . L ( ) . Debug ( "handling restart" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "lastSavedState" , lastSavedState ) )
2024-09-13 18:10:49 +05:30
l := map [ uint64 ] model . RuleStateHistory { }
2024-09-11 09:56:59 +05:30
for _ , item := range itemsToAdd {
l [ item . Fingerprint ] = item
}
shouldSkip := map [ uint64 ] bool { }
for _ , item := range lastSavedState {
// for the last saved item with fingerprint, check if there is a corresponding entry in the current state
currentState , ok := l [ item . Fingerprint ]
if ! ok {
// there was a state change in the past, but not in the current state
// if the state was firing, then we should add a resolved state change
if item . State == model . StateFiring || item . State == model . StateNoData {
item . State = model . StateInactive
item . StateChanged = true
item . UnixMilli = time . Now ( ) . UnixMilli ( )
revisedItemsToAdd [ item . Fingerprint ] = item
}
// there is nothing to do if the prev state was normal
} else {
if item . State != currentState . State {
item . State = currentState . State
item . StateChanged = true
item . UnixMilli = time . Now ( ) . UnixMilli ( )
revisedItemsToAdd [ item . Fingerprint ] = item
}
}
// do not add this item to revisedItemsToAdd as it is already processed
shouldSkip [ item . Fingerprint ] = true
}
zap . L ( ) . Debug ( "after lastSavedState loop" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "revisedItemsToAdd" , revisedItemsToAdd ) )
// if there are any new state changes that were not saved, add them to the revised items
for _ , item := range itemsToAdd {
if _ , ok := revisedItemsToAdd [ item . Fingerprint ] ; ! ok && ! shouldSkip [ item . Fingerprint ] {
revisedItemsToAdd [ item . Fingerprint ] = item
}
}
zap . L ( ) . Debug ( "after itemsToAdd loop" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "revisedItemsToAdd" , revisedItemsToAdd ) )
newState := model . StateInactive
for _ , item := range revisedItemsToAdd {
if item . State == model . StateFiring || item . State == model . StateNoData {
newState = model . StateFiring
break
}
}
zap . L ( ) . Debug ( "newState" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "newState" , newState ) )
// if there is a change in the overall state, update the overall state
if lastSavedState [ 0 ] . OverallState != newState {
for fingerprint , item := range revisedItemsToAdd {
item . OverallState = newState
item . OverallStateChanged = true
revisedItemsToAdd [ fingerprint ] = item
}
}
zap . L ( ) . Debug ( "revisedItemsToAdd after newState" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "revisedItemsToAdd" , revisedItemsToAdd ) )
} else {
for _ , item := range itemsToAdd {
revisedItemsToAdd [ item . Fingerprint ] = item
}
}
if len ( revisedItemsToAdd ) > 0 && r . reader != nil {
zap . L ( ) . Debug ( "writing rule state history" , zap . String ( "ruleid" , r . ID ( ) ) , zap . Any ( "revisedItemsToAdd" , revisedItemsToAdd ) )
2024-09-13 18:10:49 +05:30
entries := make ( [ ] model . RuleStateHistory , 0 , len ( revisedItemsToAdd ) )
2024-09-11 09:56:59 +05:30
for _ , item := range revisedItemsToAdd {
entries = append ( entries , item )
}
err := r . reader . AddRuleStateHistory ( ctx , entries )
if err != nil {
zap . L ( ) . Error ( "error while inserting rule state history" , zap . Error ( err ) , zap . Any ( "itemsToAdd" , itemsToAdd ) )
}
}
r . handledRestart = true
return nil
}
2024-09-24 10:22:52 +05:30
2025-05-03 18:30:07 +05:30
func ( r * BaseRule ) PopulateTemporality ( ctx context . Context , orgID valuer . UUID , qp * v3 . QueryRangeParamsV3 ) error {
2024-09-24 10:22:52 +05:30
missingTemporality := make ( [ ] string , 0 )
metricNameToTemporality := make ( map [ string ] map [ v3 . Temporality ] bool )
if qp . CompositeQuery != nil && len ( qp . CompositeQuery . BuilderQueries ) > 0 {
for _ , query := range qp . CompositeQuery . BuilderQueries {
// if there is no temporality specified in the query but we have it in the map
// then use the value from the map
if query . Temporality == "" && r . TemporalityMap [ query . AggregateAttribute . Key ] != nil {
// We prefer delta if it is available
if r . TemporalityMap [ query . AggregateAttribute . Key ] [ v3 . Delta ] {
query . Temporality = v3 . Delta
} else if r . TemporalityMap [ query . AggregateAttribute . Key ] [ v3 . Cumulative ] {
query . Temporality = v3 . Cumulative
} else {
query . Temporality = v3 . Unspecified
}
}
// we don't have temporality for this metric
if query . DataSource == v3 . DataSourceMetrics && query . Temporality == "" {
missingTemporality = append ( missingTemporality , query . AggregateAttribute . Key )
}
if _ , ok := metricNameToTemporality [ query . AggregateAttribute . Key ] ; ! ok {
metricNameToTemporality [ query . AggregateAttribute . Key ] = make ( map [ v3 . Temporality ] bool )
}
}
}
var nameToTemporality map [ string ] map [ v3 . Temporality ] bool
var err error
if len ( missingTemporality ) > 0 {
2025-05-03 18:30:07 +05:30
nameToTemporality , err = r . reader . FetchTemporality ( ctx , orgID , missingTemporality )
2024-09-24 10:22:52 +05:30
if err != nil {
return err
}
}
if qp . CompositeQuery != nil && len ( qp . CompositeQuery . BuilderQueries ) > 0 {
for name := range qp . CompositeQuery . BuilderQueries {
query := qp . CompositeQuery . BuilderQueries [ name ]
if query . DataSource == v3 . DataSourceMetrics && query . Temporality == "" {
if nameToTemporality [ query . AggregateAttribute . Key ] [ v3 . Delta ] {
query . Temporality = v3 . Delta
} else if nameToTemporality [ query . AggregateAttribute . Key ] [ v3 . Cumulative ] {
query . Temporality = v3 . Cumulative
} else {
query . Temporality = v3 . Unspecified
}
r . TemporalityMap [ query . AggregateAttribute . Key ] = nameToTemporality [ query . AggregateAttribute . Key ]
}
}
}
return nil
}