From 8b485de584b9500869533432d1221dea90eff318 Mon Sep 17 00:00:00 2001
From: Yunus M
Date: Fri, 26 Sep 2025 18:07:59 +0530
Subject: [PATCH 1/4] chore: create a HOC to wrap components with ErrorBoundary (#9096)

* chore: create a HOC to wrap components with ErrorBoundary

* feat: move svg to public, use render from test-utils
---
 .gitignore                                    |   4 +-
 frontend/jest.config.ts                       |   1 +
 frontend/public/Images/cloud.svg              |  81 +++++++
 .../src/components/ErrorBoundaryHOC/README.md | 117 ++++++++++
 .../__tests__/withErrorBoundary.test.tsx      | 211 ++++++++++++++++++
 .../src/components/ErrorBoundaryHOC/index.ts  |   2 +
 .../withErrorBoundary.example.tsx             | 143 ++++++++++++
 .../ErrorBoundaryHOC/withErrorBoundary.tsx    |  99 ++++++++
 .../Domains/DomainDetails/TopErrors.tsx       |   3 +-
 .../Summary/__tests__/Summary.test.tsx        |   2 +-
 .../ErrorBoundaryFallback.styles.scss         |  41 +++-
 .../ErrorBoundaryFallback.tsx                 |  77 +++----
 .../src/pages/Support/Support.styles.scss     | 120 ++++++++++-
 frontend/src/pages/Support/Support.tsx        |  77 ++++---
 14 files changed, 887 insertions(+), 91 deletions(-)
 create mode 100644 frontend/public/Images/cloud.svg
 create mode 100644 frontend/src/components/ErrorBoundaryHOC/README.md
 create mode 100644 frontend/src/components/ErrorBoundaryHOC/__tests__/withErrorBoundary.test.tsx
 create mode 100644 frontend/src/components/ErrorBoundaryHOC/index.ts
 create mode 100644 frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.example.tsx
 create mode 100644 frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.tsx

diff --git a/.gitignore b/.gitignore
index 014c7c2800bc..c002fbe276c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -230,6 +230,6 @@ poetry.toml
 # LSP config files
 pyrightconfig.json
 
-# End of https://www.toptal.com/developers/gitignore/api/python
-frontend/.cursor/rules/
\ No newline at end of file
+# cursor files
+frontend/.cursor/

diff --git a/frontend/jest.config.ts b/frontend/jest.config.ts
index 1d9255a329e8..a5e8d86c3fce 100644
--- a/frontend/jest.config.ts
+++ b/frontend/jest.config.ts
@@ -3,6 +3,7 @@ import type { Config } from '@jest/types';
 const USE_SAFE_NAVIGATE_MOCK_PATH = '/__mocks__/useSafeNavigate.ts';
 
 const config: Config.InitialOptions = {
+	silent: true,
 	clearMocks: true,
 	coverageDirectory: 'coverage',
 	coverageReporters: ['text', 'cobertura', 'html', 'json-summary'],

diff --git a/frontend/public/Images/cloud.svg b/frontend/public/Images/cloud.svg
new file mode 100644
index 000000000000..c7138d589b2f
--- /dev/null
+++ b/frontend/public/Images/cloud.svg
@@ -0,0 +1,81 @@
+[81 lines of SVG markup for the cloud error illustration; element tags lost in extraction]
\ No newline at end of file

diff --git a/frontend/src/components/ErrorBoundaryHOC/README.md b/frontend/src/components/ErrorBoundaryHOC/README.md
new file mode 100644
index 000000000000..4022cc96681d
--- /dev/null
+++ b/frontend/src/components/ErrorBoundaryHOC/README.md
@@ -0,0 +1,117 @@
# withErrorBoundary HOC

A Higher-Order Component (HOC) that wraps React components with ErrorBoundary to provide error handling and recovery.
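
Under the hood, wrapping a component is shorthand for rendering it inside Sentry's `ErrorBoundary` (see `withErrorBoundary.tsx`). A minimal sketch of what the wrapper renders, with `MyComponent` standing in for the wrapped component:

```tsx
<Sentry.ErrorBoundary fallback={<ErrorBoundaryFallback />}>
  <MyComponent {...props} />
</Sentry.ErrorBoundary>
```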

## Features

- **Automatic Error Catching**: Catches JavaScript errors in any component tree
- **Sentry Integration**: Automatically reports errors to Sentry with context
- **Custom Fallback UI**: Supports custom error fallback components
- **Error Logging**: Optional custom error handlers for additional logging
- **TypeScript Support**: Fully typed with proper generics
- **Component Context**: Automatically adds component name to tags

## Basic Usage

```tsx
import { withErrorBoundary } from 'components/ErrorBoundaryHOC';

// Wrap any component
const SafeComponent = withErrorBoundary(MyComponent);

// Use it like any other component
<SafeComponent />;
```

## Advanced Usage

### Custom Fallback Component

```tsx
const CustomFallback = () => (
  <div className="custom-error-fallback">
    <h3>Oops! Something went wrong</h3>
    <button type="button" onClick={() => window.location.reload()}>
      Reload
    </button>
  </div>
);

const SafeComponent = withErrorBoundary(MyComponent, {
  fallback: <CustomFallback />
});
```

### Custom Error Handler

```tsx
const SafeComponent = withErrorBoundary(MyComponent, {
  onError: (error, componentStack, eventId) => {
    console.error('Component error:', error);
    // Send to analytics, logging service, etc.
  }
});
```

### Sentry Configuration

```tsx
const SafeComponent = withErrorBoundary(MyComponent, {
  sentryOptions: {
    tags: {
      section: 'dashboard',
      priority: 'high',
      feature: 'metrics'
    },
    level: 'error'
  }
});
```

## API Reference

### `withErrorBoundary<P>(component, options?)`

#### Parameters

- `component: ComponentType<P>
` - The React component to wrap
- `options?: WithErrorBoundaryOptions` - Configuration options

#### Options

```tsx
interface WithErrorBoundaryOptions {
  /** Custom fallback component to render when an error occurs */
  fallback?: ReactElement;

  /** Custom error handler function */
  onError?: (
    error: unknown,
    componentStack: string | undefined,
    eventId: string
  ) => void;

  /** Additional props to pass to the Sentry ErrorBoundary */
  sentryOptions?: {
    tags?: Record<string, string>;
    level?: Sentry.SeverityLevel;
  };
}
```

## When to Use

- **Critical Components**: Wrap important UI components that shouldn't crash the entire app
- **Third-party Integrations**: Wrap components that use external libraries
- **Data-heavy Components**: Wrap components that process complex data
- **Route Components**: Wrap page-level components to prevent navigation issues

## Best Practices

1. **Use Sparingly**: Don't wrap every component - focus on critical ones
2. **Meaningful Fallbacks**: Provide helpful fallback UI that guides users
3. **Log Errors**: Always implement error logging for debugging
4. **Component Names**: Ensure components have proper `displayName` for debugging
5. **Test Error Scenarios**: Test that your error boundaries work as expected

## Examples

See `withErrorBoundary.example.tsx` for complete usage examples.
diff --git a/frontend/src/components/ErrorBoundaryHOC/__tests__/withErrorBoundary.test.tsx b/frontend/src/components/ErrorBoundaryHOC/__tests__/withErrorBoundary.test.tsx
new file mode 100644
index 000000000000..3cec7083326a
--- /dev/null
+++ b/frontend/src/components/ErrorBoundaryHOC/__tests__/withErrorBoundary.test.tsx
@@ -0,0 +1,211 @@
import { render, screen } from '@testing-library/react';
import React from 'react';

import withErrorBoundary, {
	WithErrorBoundaryOptions,
} from '../withErrorBoundary';

// Mock dependencies before imports
jest.mock('@sentry/react', () => {
	const ReactMock = jest.requireActual('react');

	class MockErrorBoundary extends ReactMock.Component<
		{
			children: React.ReactNode;
			fallback: React.ReactElement;
			onError?: (error: Error, componentStack: string, eventId: string) => void;
			beforeCapture?: (scope: {
				setTag: (key: string, value: string) => void;
				setLevel: (level: string) => void;
			}) => void;
		},
		{ hasError: boolean }
	> {
		constructor(props: MockErrorBoundary['props']) {
			super(props);
			this.state = { hasError: false };
		}

		static getDerivedStateFromError(): { hasError: boolean } {
			return { hasError: true };
		}

		componentDidCatch(error: Error, errorInfo: { componentStack: string }): void {
			const { beforeCapture, onError } = this.props;
			if (beforeCapture) {
				const mockScope = {
					setTag: jest.fn(),
					setLevel: jest.fn(),
				};
				beforeCapture(mockScope);
			}
			if (onError) {
				onError(error, errorInfo.componentStack, 'mock-event-id');
			}
		}

		render(): React.ReactNode {
			const { hasError } = this.state;
			const { fallback, children } = this.props;
			if (hasError) {
				return <div data-testid="error-boundary-fallback">{fallback}</div>;
			}
			return <div data-testid="app-error-boundary">{children}</div>
; + } + } + + return { + ErrorBoundary: MockErrorBoundary, + SeverityLevel: { + error: 'error', + warning: 'warning', + info: 'info', + }, + }; +}); + +jest.mock( + '../../../pages/ErrorBoundaryFallback/ErrorBoundaryFallback', + () => + function MockErrorBoundaryFallback(): JSX.Element { + return ( +
<div data-testid="default-error-fallback">Default Error Fallback</div>
+ ); + }, +); + +// Test component that can throw errors +interface TestComponentProps { + shouldThrow?: boolean; + message?: string; +} + +function TestComponent({ + shouldThrow = false, + message = 'Test Component', +}: TestComponentProps): JSX.Element { + if (shouldThrow) { + throw new Error('Test error'); + } + return
 <div data-testid="test-component">{message}</div>
; +} + +TestComponent.defaultProps = { + shouldThrow: false, + message: 'Test Component', +}; + +// Test component with display name +function NamedComponent(): JSX.Element { + return
 <div>Named Component</div>
;
}
NamedComponent.displayName = 'NamedComponent';

describe('withErrorBoundary', () => {
	// Suppress console errors for cleaner test output
	const originalError = console.error;
	beforeAll(() => {
		console.error = jest.fn();
	});

	afterAll(() => {
		console.error = originalError;
	});

	beforeEach(() => {
		jest.clearAllMocks();
	});

	it('should wrap component with ErrorBoundary and render successfully', () => {
		// Arrange
		const SafeComponent = withErrorBoundary(TestComponent);

		// Act
		render(<SafeComponent message="Hello World" />);

		// Assert
		expect(screen.getByTestId('app-error-boundary')).toBeInTheDocument();
		expect(screen.getByTestId('test-component')).toBeInTheDocument();
		expect(screen.getByText('Hello World')).toBeInTheDocument();
	});

	it('should render fallback UI when component throws error', () => {
		// Arrange
		const SafeComponent = withErrorBoundary(TestComponent);

		// Act
		render(<SafeComponent shouldThrow />);

		// Assert
		expect(screen.getByTestId('error-boundary-fallback')).toBeInTheDocument();
		expect(screen.getByTestId('default-error-fallback')).toBeInTheDocument();
	});

	it('should render custom fallback component when provided', () => {
		// Arrange
		const customFallback = (
<div data-testid="custom-fallback">Custom Error UI</div>
+ ); + const options: WithErrorBoundaryOptions = { + fallback: customFallback, + }; + const SafeComponent = withErrorBoundary(TestComponent, options); + + // Act + render(); + + // Assert + expect(screen.getByTestId('error-boundary-fallback')).toBeInTheDocument(); + expect(screen.getByTestId('custom-fallback')).toBeInTheDocument(); + expect(screen.getByText('Custom Error UI')).toBeInTheDocument(); + }); + + it('should call custom error handler when error occurs', () => { + // Arrange + const mockErrorHandler = jest.fn(); + const options: WithErrorBoundaryOptions = { + onError: mockErrorHandler, + }; + const SafeComponent = withErrorBoundary(TestComponent, options); + + // Act + render(); + + // Assert + expect(mockErrorHandler).toHaveBeenCalledWith( + expect.any(Error), + expect.any(String), + 'mock-event-id', + ); + expect(mockErrorHandler).toHaveBeenCalledTimes(1); + }); + + it('should set correct display name for debugging', () => { + // Arrange & Act + const SafeTestComponent = withErrorBoundary(TestComponent); + const SafeNamedComponent = withErrorBoundary(NamedComponent); + + // Assert + expect(SafeTestComponent.displayName).toBe( + 'withErrorBoundary(TestComponent)', + ); + expect(SafeNamedComponent.displayName).toBe( + 'withErrorBoundary(NamedComponent)', + ); + }); + + it('should handle component without display name', () => { + // Arrange + function AnonymousComponent(): JSX.Element { + return
Anonymous
; + } + + // Act + const SafeAnonymousComponent = withErrorBoundary(AnonymousComponent); + + // Assert + expect(SafeAnonymousComponent.displayName).toBe( + 'withErrorBoundary(AnonymousComponent)', + ); + }); +}); diff --git a/frontend/src/components/ErrorBoundaryHOC/index.ts b/frontend/src/components/ErrorBoundaryHOC/index.ts new file mode 100644 index 000000000000..1e7e5a6ae10c --- /dev/null +++ b/frontend/src/components/ErrorBoundaryHOC/index.ts @@ -0,0 +1,2 @@ +export type { WithErrorBoundaryOptions } from './withErrorBoundary'; +export { default as withErrorBoundary } from './withErrorBoundary'; diff --git a/frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.example.tsx b/frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.example.tsx new file mode 100644 index 000000000000..ce0c83fa537b --- /dev/null +++ b/frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.example.tsx @@ -0,0 +1,143 @@ +import { Button } from 'antd'; +import { useState } from 'react'; + +import { withErrorBoundary } from './index'; + +/** + * Example component that can throw errors + */ +function ProblematicComponent(): JSX.Element { + const [shouldThrow, setShouldThrow] = useState(false); + + if (shouldThrow) { + throw new Error('This is a test error from ProblematicComponent!'); + } + + return ( +
		<div>
			<h3>Problematic Component</h3>
			<p>This component can throw errors when the button is clicked.</p>
			<Button danger onClick={(): void => setShouldThrow(true)}>
				Throw Error
			</Button>
		</div>
	);
}

/**
 * Basic usage - wraps component with default error boundary
 */
export const SafeProblematicComponent = withErrorBoundary(ProblematicComponent);

/**
 * Usage with custom fallback component
 */
function CustomErrorFallback(): JSX.Element {
	return (
		<div>
			<h3>Custom Error Fallback</h3>
			<p>Something went wrong in this specific component!</p>
			<Button onClick={(): void => window.location.reload()}>Reload</Button>
		</div>
	);
}

export const SafeProblematicComponentWithCustomFallback = withErrorBoundary(
	ProblematicComponent,
	{
		fallback: <CustomErrorFallback />,
	},
);

/**
 * Usage with custom error handler
 */
export const SafeProblematicComponentWithErrorHandler = withErrorBoundary(
	ProblematicComponent,
	{
		onError: (error, errorInfo) => {
			console.error('Custom error handler:', error);
			console.error('Error info:', errorInfo);
			// You could also send to analytics, logging service, etc.
		},
		sentryOptions: {
			tags: {
				section: 'dashboard',
				priority: 'high',
			},
			level: 'error',
		},
	},
);

/**
 * Example of wrapping an existing component from the codebase
 */
function ExistingComponent({
	title,
	data,
}: {
	title: string;
	data: any[];
}): JSX.Element {
	// This could be any existing component that might throw errors
	return (
		<div>
			<h4>{title}</h4>
			{data.map((item, index) => (
				// eslint-disable-next-line react/no-array-index-key
				<div key={index}>• {item.name}</div>
			))}
		</div>
	);
}

export const SafeExistingComponent = withErrorBoundary(ExistingComponent, {
	sentryOptions: {
		tags: {
			component: 'ExistingComponent',
			feature: 'data-display',
		},
	},
});

/**
 * Usage examples in a container component
 */
export function ErrorBoundaryExamples(): JSX.Element {
	const sampleData = [
		{ name: 'Item 1' },
		{ name: 'Item 2' },
		{ name: 'Item 3' },
	];

	return (
		<div>
			<h2>Error Boundary HOC Examples</h2>

			<div>
				<h3>1. Basic Usage</h3>
				<SafeProblematicComponent />
			</div>

			<div>
				<h3>2. With Custom Fallback</h3>
				<SafeProblematicComponentWithCustomFallback />
			</div>

			<div>
				<h3>3. With Custom Error Handler</h3>
				<SafeProblematicComponentWithErrorHandler />
			</div>

			<div>
				<h3>4. Wrapped Existing Component</h3>
				<SafeExistingComponent title="Existing component" data={sampleData} />
			</div>
		</div>
+ ); +} diff --git a/frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.tsx b/frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.tsx new file mode 100644 index 000000000000..62c552641506 --- /dev/null +++ b/frontend/src/components/ErrorBoundaryHOC/withErrorBoundary.tsx @@ -0,0 +1,99 @@ +import * as Sentry from '@sentry/react'; +import { ComponentType, ReactElement } from 'react'; + +import ErrorBoundaryFallback from '../../pages/ErrorBoundaryFallback/ErrorBoundaryFallback'; + +/** + * Configuration options for the ErrorBoundary HOC + */ +interface WithErrorBoundaryOptions { + /** Custom fallback component to render when an error occurs */ + fallback?: ReactElement; + /** Custom error handler function */ + onError?: ( + error: unknown, + componentStack: string | undefined, + eventId: string, + ) => void; + /** Additional props to pass to the ErrorBoundary */ + sentryOptions?: { + tags?: Record; + level?: Sentry.SeverityLevel; + }; +} + +/** + * Higher-Order Component that wraps a component with ErrorBoundary + * + * @param WrappedComponent - The component to wrap with error boundary + * @param options - Configuration options for the error boundary + * + * @example + * // Basic usage + * const SafeComponent = withErrorBoundary(MyComponent); + * + * @example + * // With custom fallback + * const SafeComponent = withErrorBoundary(MyComponent, { + * fallback:
<div>Something went wrong!</div>
+ * }); + * + * @example + * // With custom error handler + * const SafeComponent = withErrorBoundary(MyComponent, { + * onError: (error, errorInfo) => { + * console.error('Component error:', error, errorInfo); + * } + * }); + */ +function withErrorBoundary
<P extends object>(
	WrappedComponent: ComponentType<P>,
	options: WithErrorBoundaryOptions = {},
): ComponentType<P> {
	const {
		fallback = <ErrorBoundaryFallback />,
		onError,
		sentryOptions = {},
	} = options;

	function WithErrorBoundaryComponent(props: P): JSX.Element {
		return (
			<Sentry.ErrorBoundary
				fallback={fallback}
				beforeCapture={(scope): void => {
					// Add component name to context
					scope.setTag(
						'component',
						WrappedComponent.displayName || WrappedComponent.name || 'Unknown',
					);

					// Add any custom tags
					if (sentryOptions.tags) {
						Object.entries(sentryOptions.tags).forEach(([key, value]) => {
							scope.setTag(key, value);
						});
					}

					// Set severity level if provided
					if (sentryOptions.level) {
						scope.setLevel(sentryOptions.level);
					}
				}}
				onError={onError}
			>
				{/* eslint-disable-next-line react/jsx-props-no-spreading */}
				<WrappedComponent {...props} />
			</Sentry.ErrorBoundary>
		);
	}

	// Set display name for debugging purposes
	WithErrorBoundaryComponent.displayName = `withErrorBoundary(${
		WrappedComponent.displayName || WrappedComponent.name || 'Component'
	})`;

	return WithErrorBoundaryComponent;
}

export default withErrorBoundary;
export type { WithErrorBoundaryOptions };
diff --git a/frontend/src/container/ApiMonitoring/Explorer/Domains/DomainDetails/TopErrors.tsx b/frontend/src/container/ApiMonitoring/Explorer/Domains/DomainDetails/TopErrors.tsx
index 909a46558106..bc9c839e642b 100644
--- a/frontend/src/container/ApiMonitoring/Explorer/Domains/DomainDetails/TopErrors.tsx
+++ b/frontend/src/container/ApiMonitoring/Explorer/Domains/DomainDetails/TopErrors.tsx
@@ -1,6 +1,7 @@
 import { LoadingOutlined } from '@ant-design/icons';
 import { Spin, Switch, Table, Tooltip, Typography } from 'antd';
 import { useNavigateToExplorer } from 'components/CeleryTask/useNavigateToExplorer';
+import { withErrorBoundary } from 'components/ErrorBoundaryHOC';
 import { DEFAULT_ENTITY_VERSION, ENTITY_VERSION_V4 } from 'constants/app';
 import { REACT_QUERY_KEY } from 'constants/reactQueryKeys';
 import {
@@ -248,4 +249,4 @@ function TopErrors({
 	);
 }
 
-export default TopErrors;
+export default withErrorBoundary(TopErrors);
diff --git a/frontend/src/container/MetricsExplorer/Summary/__tests__/Summary.test.tsx b/frontend/src/container/MetricsExplorer/Summary/__tests__/Summary.test.tsx
index 48522c686875..0923e405cf47 100644
--- a/frontend/src/container/MetricsExplorer/Summary/__tests__/Summary.test.tsx
+++ b/frontend/src/container/MetricsExplorer/Summary/__tests__/Summary.test.tsx
@@ -1,4 +1,3 @@
-import { render, screen } from '@testing-library/react';
 import { MetricType } from 'api/metricsExplorer/getMetricsList';
 import ROUTES from 'constants/routes';
 import * as useGetMetricsListHooks from 'hooks/metricsExplorer/useGetMetricsList';
@@ -7,6 +6,7 @@ import { QueryClient, QueryClientProvider } from 'react-query';
 import { Provider } from 'react-redux';
 import { useSearchParams } from 'react-router-dom-v5-compat';
 import store from 'store';
+import { render, screen } from 'tests/test-utils';
 
 import Summary from '../Summary';
 import { TreemapViewType } from '../types';
diff --git a/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.styles.scss b/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.styles.scss
index 33f0121b974b..768713bb5c78 100644
--- a/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.styles.scss
+++ b/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.styles.scss
@@ -9,18 +9,39 @@
 
 	color: var(--bg-vanilla-100);
 
-	.error-icon {
-		margin-bottom: 16px;
-	}
-
-	.title,
-	.actions {
+	.error-boundary-fallback-content {
 		display: flex;
-		align-items: center;
+		flex-direction: column;
+		max-width: 520px;
 		gap: 8px;
-	}
 
-	.actions {
-		margin-top: 16px;
+		.title,
.actions { + display: flex; + align-items: center; + gap: 8px; + } + + .title { + color: var(--bg-vanilla-100); + font-size: 14px; + font-style: normal; + font-weight: 500; + line-height: 18px; /* 128.571% */ + letter-spacing: -0.07px; + } + + .description { + color: var(--bg-vanilla-400); + font-size: 13px; + font-style: normal; + font-weight: 400; + line-height: 18px; + letter-spacing: -0.07px; + } + + .actions { + margin-top: 16px; + } } } diff --git a/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.tsx b/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.tsx index 3bf26e0db1f4..9aa1eba5187d 100644 --- a/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.tsx +++ b/frontend/src/pages/ErrorBoundaryFallback/ErrorBoundaryFallback.tsx @@ -1,55 +1,56 @@ import './ErrorBoundaryFallback.styles.scss'; -import { BugOutlined } from '@ant-design/icons'; -import { Button, Typography } from 'antd'; +import { Button } from 'antd'; import ROUTES from 'constants/routes'; -import Slack from 'container/SideNav/Slack'; -import { Home, TriangleAlert } from 'lucide-react'; -import { useTranslation } from 'react-i18next'; +import { useGetTenantLicense } from 'hooks/useGetTenantLicense'; +import { Home, LifeBuoy } from 'lucide-react'; +import { handleContactSupport } from 'pages/Integrations/utils'; +import { useCallback } from 'react'; function ErrorBoundaryFallback(): JSX.Element { - const { t } = useTranslation(['errorDetails']); - - const onClickSlackHandler = (): void => { - window.open('https://signoz.io/slack', '_blank'); - }; - const handleReload = (): void => { // Go to home page window.location.href = ROUTES.HOME; }; + + const { isCloudUser: isCloudUserVal } = useGetTenantLicense(); + + const handleSupport = useCallback(() => { + handleContactSupport(isCloudUserVal); + }, [isCloudUserVal]); + return (
		<div className="error-boundary-fallback-container">
-			<div className="error-icon">
-				<TriangleAlert size={48} />
-			</div>
-			<div className="title">
-				<BugOutlined />
-				<Typography.Text>
-					{t('something_went_wrong')}
-				</Typography.Text>
-			</div>
+			<div className="error-boundary-fallback-content">
+				<div className="error-icon">
+					<img src="/Images/cloud.svg" alt="error-cloud-icon" />
+				</div>
+				<div className="title">Something went wrong :/</div>
 
-			<Typography>{t('contact_if_issue_exists')}</Typography>
+				<div className="description">
+					Our team is getting on top to resolve this. Please reach out to support if
+					the issue persists.
+				</div>
 
-			<div className="actions">
-				<Button type="default" onClick={onClickSlackHandler}>
-					<Slack />
-					Slack Support
-				</Button>
-				<Button type="primary" icon={<Home size={14} />} onClick={handleReload}>
-					Go Home
-				</Button>
-			</div>
+				<div className="actions">
+					<Button
+						type="default"
+						icon={<LifeBuoy size={14} />}
+						onClick={handleSupport}
+					>
+						Contact Support
+					</Button>
+					<Button type="primary" icon={<Home size={14} />} onClick={handleReload}>
+						Go Home
+					</Button>
+				</div>
+			</div>
 		</div>
); diff --git a/frontend/src/pages/Support/Support.styles.scss b/frontend/src/pages/Support/Support.styles.scss index 4d63414a9089..1c47b29f5108 100644 --- a/frontend/src/pages/Support/Support.styles.scss +++ b/frontend/src/pages/Support/Support.styles.scss @@ -1,10 +1,49 @@ .support-page-container { - color: white; - padding-left: 48px; - padding-right: 48px; + max-height: 100vh; + overflow: hidden; - max-width: 1400px; - margin: 64px auto; + .support-page-header { + border-bottom: 1px solid var(--bg-slate-500); + background: rgba(11, 12, 14, 0.7); + backdrop-filter: blur(20px); + + .support-page-header-title { + color: var(--bg-vanilla-100); + text-align: center; + font-family: Inter; + font-size: 13px; + font-style: normal; + line-height: 14px; + letter-spacing: 0.4px; + + display: flex; + align-items: center; + gap: 8px; + padding: 16px; + } + } + + .support-page-content { + padding: 16px; + + .support-page-content-description { + color: var(--bg-vanilla-100); + text-align: left; + font-family: Inter; + font-size: 16px; + font-style: normal; + line-height: 24px; + letter-spacing: 0.4px; + + display: flex; + align-items: center; + gap: 8px; + } + + .support-channels { + margin: 24px 0; + } + } } .support-channels { @@ -21,6 +60,16 @@ position: relative; border: none !important; + border-radius: 4px; + border: 1px solid var(--bg-slate-400); + background: linear-gradient( + 139deg, + rgba(18, 19, 23, 0.8) 0%, + rgba(18, 19, 23, 0.9) 98.68% + ); + box-shadow: 4px 10px 16px 2px rgba(0, 0, 0, 0.2); + backdrop-filter: blur(20px); + .support-channel-title { width: 100%; display: flex; @@ -37,6 +86,21 @@ button { max-width: 100%; + padding: 4px 16px; + + .ant-typography { + font-size: 11px; + font-weight: 400; + line-height: 24px; + letter-spacing: -0.07px; + } + } + + .support-channel-btn { + display: flex; + align-items: center; + justify-content: center; + gap: 8px; } } } @@ -47,8 +111,50 @@ } } -@media screen and (min-width: 1440px) { +.lightMode { .support-page-container { - width: 80%; + .support-page-header { + border-bottom: 1px solid var(--bg-vanilla-300); + background: var(--bg-vanilla-100); + + .support-page-header-title { + color: var(--bg-ink-400); + } + } + } + + .support-page-content { + .support-page-content-description { + color: var(--bg-ink-400); + } + + .support-channels { + .support-channel { + border: 1px solid var(--bg-vanilla-300); + background: linear-gradient( + 139deg, + rgba(255, 255, 255, 0.8) 0%, + rgba(255, 255, 255, 0.9) 98.68% + ); + box-shadow: 4px 10px 16px 2px rgba(255, 255, 255, 0.2); + backdrop-filter: blur(20px); + } + + .support-channel-title { + color: var(--bg-ink-400); + } + + .support-channel-action { + button { + .ant-typography { + color: var(--bg-ink-400); + } + } + + .support-channel-btn { + color: var(--bg-ink-400); + } + } + } } } diff --git a/frontend/src/pages/Support/Support.tsx b/frontend/src/pages/Support/Support.tsx index 0b3dad39fff1..19faad6c0812 100644 --- a/frontend/src/pages/Support/Support.tsx +++ b/frontend/src/pages/Support/Support.tsx @@ -6,9 +6,11 @@ import updateCreditCardApi from 'api/v1/checkout/create'; import { FeatureKeys } from 'constants/features'; import { useNotifications } from 'hooks/useNotifications'; import { + ArrowUpRight, Book, CreditCard, Github, + LifeBuoy, MessageSquare, Slack, X, @@ -45,34 +47,38 @@ const supportChannels = [ { key: 'documentation', name: 'Documentation', - icon: , + icon: , title: 'Find answers in the documentation.', url: 'https://signoz.io/docs/', btnText: 'Visit docs', + isExternal: true, 
	},
	{
		key: 'github',
		name: 'Github',
-		icon: <Github />,
+		icon: <Github size={14} />,
		title: 'Create an issue on GitHub to report bugs or request new features.',
		url: 'https://github.com/SigNoz/signoz/issues',
		btnText: 'Create issue',
+		isExternal: true,
	},
	{
		key: 'slack_community',
		name: 'Slack Community',
-		icon: <Slack />,
+		icon: <Slack size={14} />,
		title: 'Get support from the SigNoz community on Slack.',
		url: 'https://signoz.io/slack',
		btnText: 'Join Slack',
+		isExternal: true,
	},
	{
		key: 'chat',
		name: 'Chat',
-		icon: <MessageSquare />,
+		icon: <MessageSquare size={14} />,
		title: 'Get quick support directly from the team.',
		url: '',
		btnText: 'Launch chat',
+		isExternal: false,
	},
];

@@ -182,38 +188,45 @@ export default function Support(): JSX.Element {
 	return (
 		<div className="support-page-container">
-			<div className="support-page-title-container">
-				<Typography.Title level={4} className="support-page-title">
-					Help & Support
-				</Typography.Title>
-				<Typography.Text className="support-page-sub-title">
-					We are here to help in case of questions or issues. Pick the channel
-					that is most convenient for you.
-				</Typography.Text>
-			</div>
+			<div className="support-page-header">
+				<div className="support-page-header-title">
+					<LifeBuoy size={14} />
+					Support
+				</div>
+			</div>
+
+			<div className="support-page-content">
+				<div className="support-page-content-description">
+					We are here to help in case of questions or issues. Pick the channel
+					that is most convenient for you.
+				</div>
 
-			<div className="support-channels">
-				{supportChannels.map(
-					(channel): JSX.Element => (
-						<Card className="support-channel" key={channel.key}>
-							<div className="support-channel-title">
-								{channel.icon}
-								{channel.name}{' '}
-							</div>
-							{channel.title}
-							<div className="support-channel-action">
-								<Button onClick={(): void => handleChannelWithRedirects(channel)}>
-									<Typography.Text>{channel.btnText}</Typography.Text>
-								</Button>
-							</div>
-						</Card>
-					),
-				)}
-			</div>
+				<div className="support-channels">
+					{supportChannels.map(
+						(channel): JSX.Element => (
+							<Card className="support-channel" key={channel.key}>
+								<div className="support-channel-title">
+									{channel.icon}
+									{channel.name}{' '}
+								</div>
+								{channel.title}
+								<div className="support-channel-action">
+									<Button
+										className="support-channel-btn"
+										onClick={(): void => handleChannelWithRedirects(channel)}
+									>
+										<Typography.Text>{channel.btnText}</Typography.Text>
+										{channel.isExternal && <ArrowUpRight size={14} />}
+									</Button>
+								</div>
+							</Card>
+						),
+					)}
+				</div>
{/* Add Credit Card Modal */} From 735b90722de23762b1c3fcdfff7bd27d51dde648 Mon Sep 17 00:00:00 2001 From: aniketio-ctrl Date: Fri, 26 Sep 2025 18:54:58 +0530 Subject: [PATCH 2/4] chore(notification grouping): added custom grouping in signoz dispatcher (#8812) --- ee/query-service/app/server.go | 12 +- ee/query-service/rules/anomaly.go | 1 + pkg/alertmanager/alertmanager.go | 4 + .../alertmanagerserver/dispatcher.go | 563 +++++++++ .../alertmanagerserver/distpatcher_test.go | 1013 +++++++++++++++++ pkg/alertmanager/alertmanagerserver/server.go | 50 +- .../alertmanagerserver/server_test.go | 10 +- .../alertmanagerserver/telemetry.go | 43 + pkg/alertmanager/nfmanager/config.go | 18 + .../nfmanager/nfmanagertest/provider.go | 75 ++ .../nfmanager/notificationmanager.go | 13 + .../rulebasednotification/provider.go | 103 ++ .../rulebasednotification/provider_test.go | 270 +++++ pkg/alertmanager/service.go | 21 +- .../signozalertmanager/provider.go | 48 +- .../app/cloudintegrations/controller_test.go | 17 +- .../app/integrations/manager_test.go | 4 +- pkg/query-service/rules/manager.go | 49 +- pkg/query-service/rules/manager_test.go | 4 +- pkg/query-service/rules/threshold_rule.go | 1 + .../integration/filter_suggestions_test.go | 5 +- .../integration/logparsingpipeline_test.go | 5 +- .../signoz_cloud_integrations_test.go | 5 +- .../integration/signoz_integrations_test.go | 7 +- pkg/query-service/utils/labels/labels.go | 1 + pkg/signoz/handler_test.go | 5 +- pkg/signoz/module_test.go | 5 +- pkg/signoz/provider.go | 12 +- pkg/signoz/provider_test.go | 4 +- pkg/signoz/signoz.go | 17 +- pkg/types/alertmanagertypes/alert.go | 25 +- pkg/types/alertmanagertypes/config.go | 53 + pkg/types/ruletypes/api_params.go | 44 + 33 files changed, 2396 insertions(+), 111 deletions(-) create mode 100644 pkg/alertmanager/alertmanagerserver/dispatcher.go create mode 100644 pkg/alertmanager/alertmanagerserver/distpatcher_test.go create mode 100644 pkg/alertmanager/alertmanagerserver/telemetry.go create mode 100644 pkg/alertmanager/nfmanager/config.go create mode 100644 pkg/alertmanager/nfmanager/nfmanagertest/provider.go create mode 100644 pkg/alertmanager/nfmanager/notificationmanager.go create mode 100644 pkg/alertmanager/nfmanager/rulebasednotification/provider.go create mode 100644 pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go diff --git a/ee/query-service/app/server.go b/ee/query-service/app/server.go index a963cf4e33f2..dab2b7f51630 100644 --- a/ee/query-service/app/server.go +++ b/ee/query-service/app/server.go @@ -325,17 +325,7 @@ func (s *Server) Stop(ctx context.Context) error { return nil } -func makeRulesManager( - ch baseint.Reader, - cache cache.Cache, - alertmanager alertmanager.Alertmanager, - sqlstore sqlstore.SQLStore, - telemetryStore telemetrystore.TelemetryStore, - prometheus prometheus.Prometheus, - orgGetter organization.Getter, - querier querier.Querier, - logger *slog.Logger, -) (*baserules.Manager, error) { +func makeRulesManager(ch baseint.Reader, cache cache.Cache, alertmanager alertmanager.Alertmanager, sqlstore sqlstore.SQLStore, telemetryStore telemetrystore.TelemetryStore, prometheus prometheus.Prometheus, orgGetter organization.Getter, querier querier.Querier, logger *slog.Logger) (*baserules.Manager, error) { ruleStore := sqlrulestore.NewRuleStore(sqlstore) maintenanceStore := sqlrulestore.NewMaintenanceStore(sqlstore) // create manager opts diff --git a/ee/query-service/rules/anomaly.go b/ee/query-service/rules/anomaly.go index 2ac3b56cb949..53f205e8d004 100644 
--- a/ee/query-service/rules/anomaly.go +++ b/ee/query-service/rules/anomaly.go @@ -387,6 +387,7 @@ func (r *AnomalyRule) Eval(ctx context.Context, ts time.Time) (interface{}, erro } if smpl.IsMissing { lb.Set(labels.AlertNameLabel, "[No data] "+r.Name()) + lb.Set(labels.NoDataLabel, "true") } lbs := lb.Labels() diff --git a/pkg/alertmanager/alertmanager.go b/pkg/alertmanager/alertmanager.go index 18b790f4de76..e38ddbe633e9 100644 --- a/pkg/alertmanager/alertmanager.go +++ b/pkg/alertmanager/alertmanager.go @@ -55,6 +55,10 @@ type Alertmanager interface { // SetDefaultConfig sets the default config for the organization. SetDefaultConfig(context.Context, string) error + SetNotificationConfig(ctx context.Context, orgID valuer.UUID, ruleId string, config *alertmanagertypes.NotificationConfig) error + + DeleteNotificationConfig(ctx context.Context, orgID valuer.UUID, ruleId string) error + // Collects stats for the organization. statsreporter.StatsCollector } diff --git a/pkg/alertmanager/alertmanagerserver/dispatcher.go b/pkg/alertmanager/alertmanagerserver/dispatcher.go new file mode 100644 index 000000000000..f5063177534d --- /dev/null +++ b/pkg/alertmanager/alertmanagerserver/dispatcher.go @@ -0,0 +1,563 @@ +package alertmanagerserver + +import ( + "context" + "fmt" + "log/slog" + "sort" + "sync" + "time" + + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" + "github.com/SigNoz/signoz/pkg/errors" + + "github.com/prometheus/alertmanager/dispatch" + "github.com/prometheus/alertmanager/notify" + "github.com/prometheus/alertmanager/provider" + "github.com/prometheus/alertmanager/store" + "github.com/prometheus/alertmanager/types" + "github.com/prometheus/common/model" +) + +const ( + noDataLabel = model.LabelName("nodata") +) + +// Dispatcher sorts incoming alerts into aggregation groups and +// assigns the correct notifiers to each. +type Dispatcher struct { + route *dispatch.Route + alerts provider.Alerts + stage notify.Stage + marker types.GroupMarker + metrics *DispatcherMetrics + limits Limits + + timeout func(time.Duration) time.Duration + + mtx sync.RWMutex + aggrGroupsPerRoute map[*dispatch.Route]map[model.Fingerprint]*aggrGroup + aggrGroupsNum int + + done chan struct{} + ctx context.Context + cancel func() + + logger *slog.Logger + notificationManager nfmanager.NotificationManager + orgID string +} + +// We use the upstream Limits interface from Prometheus +type Limits = dispatch.Limits + +// NewDispatcher returns a new Dispatcher. +func NewDispatcher( + ap provider.Alerts, + r *dispatch.Route, + s notify.Stage, + mk types.GroupMarker, + to func(time.Duration) time.Duration, + lim Limits, + l *slog.Logger, + m *DispatcherMetrics, + n nfmanager.NotificationManager, + orgID string, +) *Dispatcher { + if lim == nil { + // Use a simple implementation when no limits are provided + lim = &unlimitedLimits{} + } + + disp := &Dispatcher{ + alerts: ap, + stage: s, + route: r, + marker: mk, + timeout: to, + logger: l.With("component", "signoz-dispatcher"), + metrics: m, + limits: lim, + notificationManager: n, + orgID: orgID, + } + return disp +} + +// Run starts dispatching alerts incoming via the updates channel. 
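// Run blocks until the alert iterator is exhausted or Stop is called, so it
// is expected to be started in its own goroutine.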
+func (d *Dispatcher) Run() { + d.done = make(chan struct{}) + + d.mtx.Lock() + d.aggrGroupsPerRoute = map[*dispatch.Route]map[model.Fingerprint]*aggrGroup{} + d.aggrGroupsNum = 0 + d.metrics.aggrGroups.Set(0) + d.ctx, d.cancel = context.WithCancel(context.Background()) + d.mtx.Unlock() + + d.run(d.alerts.Subscribe()) + close(d.done) +} + +func (d *Dispatcher) run(it provider.AlertIterator) { + maintenance := time.NewTicker(30 * time.Second) + defer maintenance.Stop() + + defer it.Close() + + for { + select { + case alert, ok := <-it.Next(): + if !ok { + // Iterator exhausted for some reason. + if err := it.Err(); err != nil { + d.logger.ErrorContext(d.ctx, "Error on alert update", "err", err) + } + return + } + + d.logger.DebugContext(d.ctx, "SigNoz Custom Dispatcher: Received alert", "alert", alert) + + // Log errors but keep trying. + if err := it.Err(); err != nil { + d.logger.ErrorContext(d.ctx, "Error on alert update", "err", err) + continue + } + + now := time.Now() + for _, r := range d.route.Match(alert.Labels) { + d.processAlert(alert, r) + } + d.metrics.processingDuration.Observe(time.Since(now).Seconds()) + + case <-maintenance.C: + d.doMaintenance() + case <-d.ctx.Done(): + return + } + } +} + +func (d *Dispatcher) doMaintenance() { + d.mtx.Lock() + defer d.mtx.Unlock() + for _, groups := range d.aggrGroupsPerRoute { + for _, ag := range groups { + if ag.empty() { + ag.stop() + d.marker.DeleteByGroupKey(ag.routeID, ag.GroupKey()) + delete(groups, ag.fingerprint()) + d.aggrGroupsNum-- + d.metrics.aggrGroups.Dec() + } + } + } +} + +// AlertGroup represents how alerts exist within an aggrGroup. +type AlertGroup struct { + Alerts types.AlertSlice + Labels model.LabelSet + Receiver string + GroupKey string + RouteID string + Renotify time.Duration +} + +type AlertGroups []*AlertGroup + +func (ag AlertGroups) Swap(i, j int) { ag[i], ag[j] = ag[j], ag[i] } +func (ag AlertGroups) Less(i, j int) bool { + if ag[i].Labels.Equal(ag[j].Labels) { + return ag[i].Receiver < ag[j].Receiver + } + return ag[i].Labels.Before(ag[j].Labels) +} +func (ag AlertGroups) Len() int { return len(ag) } + +// Groups returns a slice of AlertGroups from the dispatcher's internal state. +func (d *Dispatcher) Groups(routeFilter func(*dispatch.Route) bool, alertFilter func(*types.Alert, time.Time) bool) (AlertGroups, map[model.Fingerprint][]string) { + groups := AlertGroups{} + + d.mtx.RLock() + defer d.mtx.RUnlock() + + // Keep a list of receivers for an alert to prevent checking each alert + // again against all routes. The alert has already matched against this + // route on ingestion. + receivers := map[model.Fingerprint][]string{} + + now := time.Now() + for route, ags := range d.aggrGroupsPerRoute { + if !routeFilter(route) { + continue + } + + for _, ag := range ags { + receiver := route.RouteOpts.Receiver + alertGroup := &AlertGroup{ + Labels: ag.labels, + Receiver: receiver, + GroupKey: ag.GroupKey(), + RouteID: ag.routeID, + Renotify: ag.opts.RepeatInterval, + } + + alerts := ag.alerts.List() + filteredAlerts := make([]*types.Alert, 0, len(alerts)) + for _, a := range alerts { + if !alertFilter(a, now) { + continue + } + + fp := a.Fingerprint() + if r, ok := receivers[fp]; ok { + // Receivers slice already exists. Add + // the current receiver to the slice. + receivers[fp] = append(r, receiver) + } else { + // First time we've seen this alert fingerprint. + // Initialize a new receivers slice. 
+ receivers[fp] = []string{receiver} + } + + filteredAlerts = append(filteredAlerts, a) + } + if len(filteredAlerts) == 0 { + continue + } + alertGroup.Alerts = filteredAlerts + + groups = append(groups, alertGroup) + } + } + sort.Sort(groups) + for i := range groups { + sort.Sort(groups[i].Alerts) + } + for i := range receivers { + sort.Strings(receivers[i]) + } + + return groups, receivers +} + +// Stop the dispatcher. +func (d *Dispatcher) Stop() { + if d == nil { + return + } + d.mtx.Lock() + if d.cancel == nil { + d.mtx.Unlock() + return + } + d.cancel() + d.cancel = nil + d.mtx.Unlock() + + <-d.done +} + +// notifyFunc is a function that performs notification for the alert +// with the given fingerprint. It aborts on context cancelation. +// Returns false iff notifying failed. +type notifyFunc func(context.Context, ...*types.Alert) bool + +// processAlert determines in which aggregation group the alert falls +// and inserts it. +func (d *Dispatcher) processAlert(alert *types.Alert, route *dispatch.Route) { + ruleId := getRuleIDFromAlert(alert) + config, err := d.notificationManager.GetNotificationConfig(d.orgID, ruleId) + if err != nil { + d.logger.ErrorContext(d.ctx, "error getting alert notification config", "rule_id", ruleId, "error", err) + return + } + + groupLabels := getGroupLabels(alert, config.NotificationGroup) + + fp := groupLabels.Fingerprint() + + d.mtx.Lock() + defer d.mtx.Unlock() + + routeGroups, ok := d.aggrGroupsPerRoute[route] + if !ok { + routeGroups = map[model.Fingerprint]*aggrGroup{} + d.aggrGroupsPerRoute[route] = routeGroups + } + + ag, ok := routeGroups[fp] + if ok { + ag.insert(alert) + return + } + + // If the group does not exist, create it. But check the limit first. + if limit := d.limits.MaxNumberOfAggregationGroups(); limit > 0 && d.aggrGroupsNum >= limit { + d.metrics.aggrGroupLimitReached.Inc() + d.logger.ErrorContext(d.ctx, "Too many aggregation groups, cannot create new group for alert", "groups", d.aggrGroupsNum, "limit", limit, "alert", alert.Name()) + return + } + renotifyInterval := config.Renotify.RenotifyInterval + + if noDataAlert(alert) { + renotifyInterval = config.Renotify.NoDataInterval + groupLabels[noDataLabel] = alert.Labels[noDataLabel] + } + + ag = newAggrGroup(d.ctx, groupLabels, route, d.timeout, d.logger, renotifyInterval) + + routeGroups[fp] = ag + d.aggrGroupsNum++ + d.metrics.aggrGroups.Inc() + + // Insert the 1st alert in the group before starting the group's run() + // function, to make sure that when the run() will be executed the 1st + // alert is already there. + ag.insert(alert) + + go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool { + _, _, err := d.stage.Exec(ctx, d.logger, alerts...) + if err != nil { + logger := d.logger.With("num_alerts", len(alerts), "err", err) + if errors.Is(ctx.Err(), context.Canceled) { + // It is expected for the context to be canceled on + // configuration reload or shutdown. In this case, the + // message should only be logged at the debug level. + logger.DebugContext(ctx, "Notify for alerts failed") + } else { + logger.ErrorContext(ctx, "Notify for alerts failed") + } + } + return err == nil + }) +} + +// aggrGroup aggregates alert fingerprints into groups to which a +// common set of routing options applies. +// It emits notifications in the specified intervals. 
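// The group's repeat (renotify) interval is resolved per rule via the
// NotificationManager in processAlert and copied into the route options,
// rather than taken from the static route configuration.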
+type aggrGroup struct { + labels model.LabelSet + opts *dispatch.RouteOpts + logger *slog.Logger + routeID string + routeKey string + + alerts *store.Alerts + ctx context.Context + cancel func() + done chan struct{} + next *time.Timer + timeout func(time.Duration) time.Duration + + mtx sync.RWMutex + hasFlushed bool +} + +// newAggrGroup returns a new aggregation group. +func newAggrGroup(ctx context.Context, labels model.LabelSet, r *dispatch.Route, to func(time.Duration) time.Duration, logger *slog.Logger, renotify time.Duration) *aggrGroup { + if to == nil { + to = func(d time.Duration) time.Duration { return d } + } + + opts := deepCopyRouteOpts(r.RouteOpts, renotify) + + ag := &aggrGroup{ + labels: labels, + routeID: r.ID(), + routeKey: r.Key(), + opts: &opts, + timeout: to, + alerts: store.NewAlerts(), + done: make(chan struct{}), + } + ag.ctx, ag.cancel = context.WithCancel(ctx) + + ag.logger = logger.With("aggr_group", ag) + + // Set an initial one-time wait before flushing + // the first batch of notifications. + ag.next = time.NewTimer(ag.opts.GroupWait) + + return ag +} + +func (ag *aggrGroup) fingerprint() model.Fingerprint { + return ag.labels.Fingerprint() +} + +func (ag *aggrGroup) GroupKey() string { + return fmt.Sprintf("%s:%s", ag.routeKey, ag.labels) +} + +func (ag *aggrGroup) String() string { + return ag.GroupKey() +} + +func (ag *aggrGroup) run(nf notifyFunc) { + defer close(ag.done) + defer ag.next.Stop() + + for { + select { + case now := <-ag.next.C: + // Give the notifications time until the next flush to + // finish before terminating them. + ctx, cancel := context.WithTimeout(ag.ctx, ag.timeout(ag.opts.GroupInterval)) + + // The now time we retrieve from the ticker is the only reliable + // point of time reference for the subsequent notification pipeline. + // Calculating the current time directly is prone to flaky behavior, + // which usually only becomes apparent in tests. + ctx = notify.WithNow(ctx, now) + + // Populate context with information needed along the pipeline. + ctx = notify.WithGroupKey(ctx, ag.GroupKey()) + ctx = notify.WithGroupLabels(ctx, ag.labels) + ctx = notify.WithReceiverName(ctx, ag.opts.Receiver) + ctx = notify.WithRepeatInterval(ctx, ag.opts.RepeatInterval) + ctx = notify.WithMuteTimeIntervals(ctx, ag.opts.MuteTimeIntervals) + ctx = notify.WithActiveTimeIntervals(ctx, ag.opts.ActiveTimeIntervals) + ctx = notify.WithRouteID(ctx, ag.routeID) + + // Wait the configured interval before calling flush again. + ag.mtx.Lock() + ag.next.Reset(ag.opts.GroupInterval) + ag.hasFlushed = true + ag.mtx.Unlock() + + ag.flush(func(alerts ...*types.Alert) bool { + return nf(ctx, alerts...) + }) + + cancel() + + case <-ag.ctx.Done(): + return + } + } +} + +func (ag *aggrGroup) stop() { + // Calling cancel will terminate all in-process notifications + // and the run() loop. + ag.cancel() + <-ag.done +} + +// insert inserts the alert into the aggregation group. +func (ag *aggrGroup) insert(alert *types.Alert) { + if err := ag.alerts.Set(alert); err != nil { + ag.logger.ErrorContext(ag.ctx, "error on set alert", "err", err) + } + + // Immediately trigger a flush if the wait duration for this + // alert is already over. + ag.mtx.Lock() + defer ag.mtx.Unlock() + if !ag.hasFlushed && alert.StartsAt.Add(ag.opts.GroupWait).Before(time.Now()) { + ag.next.Reset(0) + } +} + +func (ag *aggrGroup) empty() bool { + return ag.alerts.Empty() +} + +// flush sends notifications for all new alerts. 
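// Resolved alerts are included in one final notification and then deleted,
// unless they fired again during the flush (guarded by DeleteIfNotModified).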
+func (ag *aggrGroup) flush(notify func(...*types.Alert) bool) { + if ag.empty() { + return + } + + var ( + alerts = ag.alerts.List() + alertsSlice = make(types.AlertSlice, 0, len(alerts)) + resolvedSlice = make(types.AlertSlice, 0, len(alerts)) + now = time.Now() + ) + for _, alert := range alerts { + a := *alert + // Ensure that alerts don't resolve as time move forwards. + if a.ResolvedAt(now) { + resolvedSlice = append(resolvedSlice, &a) + } else { + a.EndsAt = time.Time{} + } + alertsSlice = append(alertsSlice, &a) + } + sort.Stable(alertsSlice) + + ag.logger.DebugContext(ag.ctx, "flushing", "alerts", fmt.Sprintf("%v", alertsSlice)) + + if notify(alertsSlice...) { + // Delete all resolved alerts as we just sent a notification for them, + // and we don't want to send another one. However, we need to make sure + // that each resolved alert has not fired again during the flush as then + // we would delete an active alert thinking it was resolved. + if err := ag.alerts.DeleteIfNotModified(resolvedSlice); err != nil { + ag.logger.ErrorContext(ag.ctx, "error on delete alerts", "err", err) + } + } +} + +// unlimitedLimits provides unlimited aggregation groups for SigNoz +type unlimitedLimits struct{} + +func (u *unlimitedLimits) MaxNumberOfAggregationGroups() int { return 0 } + +func getRuleIDFromAlert(alert *types.Alert) string { + for name, value := range alert.Labels { + if string(name) == "ruleId" { + return string(value) + } + } + return "" +} + +func deepCopyRouteOpts(opts dispatch.RouteOpts, renotify time.Duration) dispatch.RouteOpts { + newOpts := opts + + if opts.GroupBy != nil { + newOpts.GroupBy = make(map[model.LabelName]struct{}, len(opts.GroupBy)) + for k, v := range opts.GroupBy { + newOpts.GroupBy[k] = v + } + } + + if opts.MuteTimeIntervals != nil { + newOpts.MuteTimeIntervals = make([]string, len(opts.MuteTimeIntervals)) + copy(newOpts.MuteTimeIntervals, opts.MuteTimeIntervals) + } + + if opts.ActiveTimeIntervals != nil { + newOpts.ActiveTimeIntervals = make([]string, len(opts.ActiveTimeIntervals)) + copy(newOpts.ActiveTimeIntervals, opts.ActiveTimeIntervals) + } + + if renotify > 0 { + newOpts.RepeatInterval = renotify + } + + return newOpts +} + +func getGroupLabels(alert *types.Alert, groups map[model.LabelName]struct{}) model.LabelSet { + groupLabels := model.LabelSet{} + for ln, lv := range alert.Labels { + if _, ok := groups[ln]; ok { + groupLabels[ln] = lv + } + } + + return groupLabels +} + +func noDataAlert(alert *types.Alert) bool { + if _, ok := alert.Labels[noDataLabel]; ok { + return true + } else { + return false + } +} diff --git a/pkg/alertmanager/alertmanagerserver/distpatcher_test.go b/pkg/alertmanager/alertmanagerserver/distpatcher_test.go new file mode 100644 index 000000000000..36369a35049d --- /dev/null +++ b/pkg/alertmanager/alertmanagerserver/distpatcher_test.go @@ -0,0 +1,1013 @@ +package alertmanagerserver + +import ( + "context" + "fmt" + "log/slog" + "reflect" + "sort" + "sync" + "testing" + "time" + + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" + + "github.com/prometheus/alertmanager/dispatch" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" + "github.com/prometheus/common/promslog" + "github.com/stretchr/testify/require" + + "github.com/prometheus/alertmanager/config" + "github.com/prometheus/alertmanager/notify" + "github.com/prometheus/alertmanager/provider/mem" + "github.com/prometheus/alertmanager/types" +) + +func 
TestAggrGroup(t *testing.T) { + lset := model.LabelSet{ + "a": "v1", + "b": "v2", + } + opts := &dispatch.RouteOpts{ + Receiver: "n1", + GroupBy: map[model.LabelName]struct{}{ + "a": {}, + "b": {}, + }, + GroupWait: 1 * time.Second, + GroupInterval: 300 * time.Millisecond, + RepeatInterval: 1 * time.Hour, + } + route := &dispatch.Route{ + RouteOpts: *opts, + } + orgId := "test-org-id" + ruleId := "test-rule-id" + notificationConfig := alertmanagertypes.NotificationConfig{ + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 2 * time.Hour, + }, + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("a"): {}, + model.LabelName("b"): {}, + }, + } + // Setup notification manager using nfmanagertest + nfManager := nfmanagertest.NewMock() + nfManager.SetMockConfig(orgId, ruleId, ¬ificationConfig) + + var ( + a1 = &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v3", + "ruleId": "test-rule-1", + }, + StartsAt: time.Now().Add(time.Minute), + EndsAt: time.Now().Add(time.Hour), + }, + UpdatedAt: time.Now(), + } + a2 = &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v4", + "ruleId": "test-rule-1", + }, + StartsAt: time.Now().Add(-time.Hour), + EndsAt: time.Now().Add(2 * time.Hour), + }, + UpdatedAt: time.Now(), + } + a3 = &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v5", + "ruleId": "test-rule-1", + }, + StartsAt: time.Now().Add(time.Minute), + EndsAt: time.Now().Add(5 * time.Minute), + }, + UpdatedAt: time.Now(), + } + ) + + var ( + last = time.Now() + current = time.Now() + lastCurMtx = &sync.Mutex{} + alertsCh = make(chan types.AlertSlice) + ) + + ntfy := func(ctx context.Context, alerts ...*types.Alert) bool { + // Validate that the context is properly populated. + if _, ok := notify.Now(ctx); !ok { + t.Errorf("now missing") + } + if _, ok := notify.GroupKey(ctx); !ok { + t.Errorf("group key missing") + } + if lbls, ok := notify.GroupLabels(ctx); !ok || !reflect.DeepEqual(lbls, lset) { + t.Errorf("wrong group labels: %q", lbls) + } + if rcv, ok := notify.ReceiverName(ctx); !ok || rcv != opts.Receiver { + t.Errorf("wrong receiver: %q", rcv) + } + if ri, ok := notify.RepeatInterval(ctx); !ok || ri != notificationConfig.Renotify.RenotifyInterval { + t.Errorf("wrong repeat interval: %q", ri) + } + + lastCurMtx.Lock() + last = current + // Subtract a millisecond to allow for races. + current = time.Now().Add(-time.Millisecond) + lastCurMtx.Unlock() + + alertsCh <- types.AlertSlice(alerts) + + return true + } + + removeEndsAt := func(as types.AlertSlice) types.AlertSlice { + for i, a := range as { + ac := *a + ac.EndsAt = time.Time{} + as[i] = &ac + } + return as + } + + // Test regular situation where we wait for group_wait to send out alerts. 
+ ag := newAggrGroup(context.Background(), lset, route, nil, promslog.NewNopLogger(), notificationConfig.Renotify.RenotifyInterval) + + go ag.run(ntfy) + + ag.insert(a1) + + select { + case <-time.After(2 * opts.GroupWait): + t.Fatalf("expected initial batch after group_wait") + + case batch := <-alertsCh: + lastCurMtx.Lock() + s := time.Since(last) + lastCurMtx.Unlock() + if s < opts.GroupWait { + t.Fatalf("received batch too early after %v", s) + } + exp := removeEndsAt(types.AlertSlice{a1}) + sort.Sort(batch) + + if !reflect.DeepEqual(batch, exp) { + t.Fatalf("expected alerts %v but got %v", exp, batch) + } + } + + for i := 0; i < 3; i++ { + // NewMock alert should come in after group interval. + ag.insert(a3) + + select { + case <-time.After(2 * opts.GroupInterval): + t.Fatalf("expected new batch after group interval but received none") + + case batch := <-alertsCh: + lastCurMtx.Lock() + s := time.Since(last) + lastCurMtx.Unlock() + if s < opts.GroupInterval { + t.Fatalf("received batch too early after %v", s) + } + exp := removeEndsAt(types.AlertSlice{a1, a3}) + sort.Sort(batch) + + if !reflect.DeepEqual(batch, exp) { + t.Fatalf("expected alerts %v but got %v", exp, batch) + } + } + } + + ag.stop() + + // Add an alert that started more than group_interval in the past. We expect + // immediate flushing. + // Finally, set all alerts to be resolved. After successful notify the aggregation group + // should empty itself. + ag = newAggrGroup(context.Background(), lset, route, nil, promslog.NewNopLogger(), notificationConfig.Renotify.RenotifyInterval) + go ag.run(ntfy) + + ag.insert(a1) + ag.insert(a2) + + // a2 lies way in the past so the initial group_wait should be skipped. + select { + case <-time.After(opts.GroupWait / 2): + t.Fatalf("expected immediate alert but received none") + + case batch := <-alertsCh: + exp := removeEndsAt(types.AlertSlice{a1, a2}) + sort.Sort(batch) + + if !reflect.DeepEqual(batch, exp) { + t.Fatalf("expected alerts %v but got %v", exp, batch) + } + } + + for i := 0; i < 3; i++ { + // NewMock alert should come in after group interval. + ag.insert(a3) + + select { + case <-time.After(2 * opts.GroupInterval): + t.Fatalf("expected new batch after group interval but received none") + + case batch := <-alertsCh: + lastCurMtx.Lock() + s := time.Since(last) + lastCurMtx.Unlock() + if s < opts.GroupInterval { + t.Fatalf("received batch too early after %v", s) + } + exp := removeEndsAt(types.AlertSlice{a1, a2, a3}) + sort.Sort(batch) + + if !reflect.DeepEqual(batch, exp) { + t.Fatalf("expected alerts %v but got %v", exp, batch) + } + } + } + + // Resolve an alert, and it should be removed after the next batch was sent. + a1r := *a1 + a1r.EndsAt = time.Now() + ag.insert(&a1r) + exp := append(types.AlertSlice{&a1r}, removeEndsAt(types.AlertSlice{a2, a3})...) + + select { + case <-time.After(2 * opts.GroupInterval): + t.Fatalf("expected new batch after group interval but received none") + case batch := <-alertsCh: + lastCurMtx.Lock() + s := time.Since(last) + lastCurMtx.Unlock() + if s < opts.GroupInterval { + t.Fatalf("received batch too early after %v", s) + } + sort.Sort(batch) + + if !reflect.DeepEqual(batch, exp) { + t.Fatalf("expected alerts %v but got %v", exp, batch) + } + } + + // Resolve all remaining alerts, they should be removed after the next batch was sent. + // Do not add a1r as it should have been deleted following the previous batch. 
+ a2r, a3r := *a2, *a3 + resolved := types.AlertSlice{&a2r, &a3r} + for _, a := range resolved { + a.EndsAt = time.Now() + ag.insert(a) + } + + select { + case <-time.After(2 * opts.GroupInterval): + t.Fatalf("expected new batch after group interval but received none") + + case batch := <-alertsCh: + lastCurMtx.Lock() + s := time.Since(last) + lastCurMtx.Unlock() + if s < opts.GroupInterval { + t.Fatalf("received batch too early after %v", s) + } + sort.Sort(batch) + + if !reflect.DeepEqual(batch, resolved) { + t.Fatalf("expected alerts %v but got %v", resolved, batch) + } + + if !ag.empty() { + t.Fatalf("Expected aggregation group to be empty after resolving alerts: %v", ag) + } + } + + ag.stop() +} + +func TestGroupLabels(t *testing.T) { + a := &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v3", + }, + }, + } + + route := &dispatch.Route{ + RouteOpts: dispatch.RouteOpts{ + GroupBy: map[model.LabelName]struct{}{ + "a": {}, + "b": {}, + }, + GroupByAll: false, + }, + } + + expLs := model.LabelSet{ + "a": "v1", + "b": "v2", + } + + ls := getGroupLabels(a, route.RouteOpts.GroupBy) + + if !reflect.DeepEqual(ls, expLs) { + t.Fatalf("expected labels are %v, but got %v", expLs, ls) + } +} + +func TestAggrRouteMap(t *testing.T) { + confData := `receivers: +- name: 'slack' +- name: 'email' +- name: 'pagerduty' + +route: + group_by: ['alertname'] + group_wait: 10ms + group_interval: 10ms + receiver: 'slack' + routes: + - matchers: + - 'ruleId=~"ruleId-OtherAlert|ruleId-TestingAlert"' + receiver: 'slack' + - matchers: + - 'ruleId=~"ruleId-HighLatency|ruleId-HighErrorRate"' + receiver: 'email' + continue: true + - matchers: + - 'ruleId="ruleId-HighLatency"' + receiver: 'pagerduty'` + conf, err := config.Load(confData) + if err != nil { + t.Fatal(err) + } + + logger := promslog.NewNopLogger() + route := dispatch.NewRoute(conf.Route, nil) + marker := types.NewMarker(prometheus.NewRegistry()) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) + if err != nil { + t.Fatal(err) + } + defer alerts.Close() + + timeout := func(d time.Duration) time.Duration { return time.Duration(0) } + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) + nfManager := nfmanagertest.NewMock() + orgId := "test-org" + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId) + go dispatcher.Run() + defer dispatcher.Stop() + inputAlerts := []*types.Alert{ + newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd"}), + newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}), + newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4"}), + } + notiConfigs := 
map[string]alertmanagertypes.NotificationConfig{ + "ruleId-OtherAlert": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("cluster"): {}, + model.LabelName("service"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 10, + }, + }, + "ruleId-TestingAlert": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("service"): {}, + model.LabelName("instance"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 11, + }, + }, + "ruleId-HighErrorRate": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("cluster"): {}, + model.LabelName("instance"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 12, + }, + }, + "ruleId-HighLatency": { + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("ruleId"): {}, + model.LabelName("service"): {}, + model.LabelName("kafka"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 13, + }, + }, + } + + for ruleID, config := range notiConfigs { + nfManager.SetMockConfig(orgId, ruleID, &config) + } + err = alerts.Put(inputAlerts...) + if err != nil { + t.Fatal(err) + } + + // Let alerts get processed. + for i := 0; len(recorder.Alerts()) != 9 && i < 10; i++ { + time.Sleep(200 * time.Millisecond) + } + require.Len(t, recorder.Alerts(), 9) + + alertGroups, receivers := dispatcher.Groups( + func(*dispatch.Route) bool { + return true + }, func(*types.Alert, time.Time) bool { + return true + }, + ) + + dispatcher.mtx.RLock() + aggrGroupsPerRoute := dispatcher.aggrGroupsPerRoute + dispatcher.mtx.RUnlock() + + require.NotEmpty(t, aggrGroupsPerRoute, "Should have aggregation groups per route") + + routeIDsFound := make(map[string]bool) + totalAggrGroups := 0 + + //first lets check for valid route id + for route, groups := range aggrGroupsPerRoute { + routeID := route.ID() + routeIDsFound[routeID] = true + expectedReceiver := "" + switch routeID { + case "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0": + expectedReceiver = "slack" + case "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1": + expectedReceiver = "email" + case "{}/{ruleId=\"ruleId-HighLatency\"}/2": + expectedReceiver = "pagerduty" + } + if expectedReceiver != "" { + require.Equal(t, expectedReceiver, route.RouteOpts.Receiver, + "Route %s should have receiver %s", routeID, expectedReceiver) + } + totalAggrGroups += len(groups) + } + + require.Equal(t, 7, totalAggrGroups, "Should have exactly 7 aggregation groups") + + // Verify specific route group counts + expectedGroupCounts := map[string]int{ + "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0": 2, // OtherAlert + TestingAlert + "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1": 4, // 3 HighErrorRate + 1 HighLatency + "{}/{ruleId=\"ruleId-HighLatency\"}/2": 1, // 1 HighLatency group + } + + for route, groups := range aggrGroupsPerRoute { + routeID := route.ID() + if expectedCount, exists := expectedGroupCounts[routeID]; exists { + require.Equal(t, expectedCount, len(groups), + "Route %s should have %d groups, got %d", routeID, expectedCount, len(groups)) + } + } + + require.Equal(t, AlertGroups{ + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, + Labels: model.LabelSet{ + "kafka": "yes", + "ruleId": "ruleId-HighLatency", + "service": "db", + }, + Receiver: "email", + GroupKey: 
"{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + Renotify: 13, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, + Labels: model.LabelSet{ + "kafka": "yes", + "ruleId": "ruleId-HighLatency", + "service": "db", + }, + Receiver: "pagerduty", + GroupKey: "{}/{ruleId=\"ruleId-HighLatency\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", + RouteID: "{}/{ruleId=\"ruleId-HighLatency\"}/2", + Renotify: 13, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[1]}, + Labels: model.LabelSet{ + "instance": "inst1", + "ruleId": "ruleId-TestingAlert", + "service": "api", + }, + Renotify: 11, + Receiver: "slack", + GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{instance=\"inst1\", ruleId=\"ruleId-TestingAlert\", service=\"api\"}", + RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[2]}, + Labels: model.LabelSet{ + "cluster": "aa", + "instance": "inst1", + "ruleId": "ruleId-HighErrorRate", + }, + Renotify: 12, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[3]}, + Labels: model.LabelSet{ + "cluster": "aa", + "instance": "inst2", + "ruleId": "ruleId-HighErrorRate", + }, + Renotify: 12, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst2\", ruleId=\"ruleId-HighErrorRate\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[4]}, + Labels: model.LabelSet{ + "cluster": "bb", + "instance": "inst1", + "ruleId": "ruleId-HighErrorRate", + }, + Renotify: 12, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"bb\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[0]}, + Labels: model.LabelSet{ + "cluster": "cc", + "ruleId": "ruleId-OtherAlert", + "service": "dd", + }, + Renotify: 10, + Receiver: "slack", + GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{cluster=\"cc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}", + RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", + }, + }, alertGroups) + require.Equal(t, map[model.Fingerprint][]string{ + inputAlerts[0].Fingerprint(): {"slack"}, + inputAlerts[1].Fingerprint(): {"slack"}, + inputAlerts[2].Fingerprint(): {"email"}, + inputAlerts[3].Fingerprint(): {"email"}, + inputAlerts[4].Fingerprint(): {"email"}, + inputAlerts[5].Fingerprint(): {"email", "pagerduty"}, + inputAlerts[6].Fingerprint(): {"email", "pagerduty"}, + }, receivers) +} + +func TestGroupsWithNodata(t *testing.T) { + confData := `receivers: +- name: 'slack' +- name: 'email' +- name: 'pagerduty' + +route: + group_by: ['alertname'] + group_wait: 10ms + group_interval: 10ms + receiver: 'slack' + routes: + - matchers: + - 'ruleId=~"ruleId-OtherAlert|ruleId-TestingAlert"' + receiver: 'slack' + - matchers: + - 'ruleId=~"ruleId-HighLatency|ruleId-HighErrorRate"' + receiver: 'email' + continue: true + - matchers: + 
    - 'ruleId="ruleId-HighLatency"'
      receiver: 'pagerduty'`
	conf, err := config.Load(confData)
	if err != nil {
		t.Fatal(err)
	}

	logger := promslog.NewNopLogger()
	route := dispatch.NewRoute(conf.Route, nil)
	marker := types.NewMarker(prometheus.NewRegistry())
	alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer alerts.Close()

	timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
	recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
	metrics := NewDispatcherMetrics(false, prometheus.NewRegistry())
	nfManager := nfmanagertest.NewMock()
	orgId := "test-org"
	dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, orgId)
	go dispatcher.Run()
	defer dispatcher.Stop()

	// Create alerts. The dispatcher will automatically create the groups.
	inputAlerts := []*types.Alert{
		// Matches the parent route.
		newAlert(model.LabelSet{"ruleId": "ruleId-OtherAlert", "cluster": "cc", "service": "dd"}),
		// Matches the first sub-route.
		newAlert(model.LabelSet{"env": "testing", "ruleId": "ruleId-TestingAlert", "service": "api", "instance": "inst1"}),
		// Matches the second sub-route.
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}),
		// Matches the second sub-route.
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}),
		// Matches the second and third sub-route.
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
		newAlert(model.LabelSet{"env": "prod", "ruleId": "ruleId-HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst4"}),
		// No-data alert; also matches the second and third sub-route.
		newAlert(model.LabelSet{"ruleId": "ruleId-HighLatency", "nodata": "true"}),
	}
	notiConfigs := map[string]alertmanagertypes.NotificationConfig{
		"ruleId-OtherAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("cluster"): {},
				model.LabelName("service"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 10,
			},
		},
		"ruleId-TestingAlert": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("service"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 11,
			},
		},
		"ruleId-HighErrorRate": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):   {},
				model.LabelName("cluster"):  {},
				model.LabelName("instance"): {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 12,
			},
		},
		"ruleId-HighLatency": {
			NotificationGroup: map[model.LabelName]struct{}{
				model.LabelName("ruleId"):  {},
				model.LabelName("service"): {},
				model.LabelName("kafka"):   {},
			},
			Renotify: alertmanagertypes.ReNotificationConfig{
				RenotifyInterval: 13,
				NoDataInterval:   14,
			},
		},
	}

	for ruleID, config := range notiConfigs {
		nfManager.SetMockConfig(orgId, ruleID, &config)
	}
	err = alerts.Put(inputAlerts...)
	if err != nil {
		t.Fatal(err)
	}

	// Let alerts get processed.
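	// Expected delivery count: alerts 0-1 reach slack via the first sub-route (2),
	// alerts 2-4 reach email via the second sub-route (3), and the three
	// HighLatency alerts (including the no-data alert) match both the second and
	// third sub-routes because of `continue: true`, reaching email and pagerduty
	// (3 x 2 = 6), for a total of 11 notifications.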
+ for i := 0; len(recorder.Alerts()) != 11 && i < 15; i++ { + time.Sleep(200 * time.Millisecond) + } + require.Len(t, recorder.Alerts(), 11) + + alertGroups, receivers := dispatcher.Groups( + func(*dispatch.Route) bool { + return true + }, func(*types.Alert, time.Time) bool { + return true + }, + ) + + require.Equal(t, AlertGroups{ + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[7]}, + Labels: model.LabelSet{ + "ruleId": "ruleId-HighLatency", + "nodata": "true", + }, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{nodata=\"true\", ruleId=\"ruleId-HighLatency\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + Renotify: 14, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[7]}, + Labels: model.LabelSet{ + "ruleId": "ruleId-HighLatency", + "nodata": "true", + }, + Receiver: "pagerduty", + GroupKey: "{}/{ruleId=\"ruleId-HighLatency\"}:{nodata=\"true\", ruleId=\"ruleId-HighLatency\"}", + RouteID: "{}/{ruleId=\"ruleId-HighLatency\"}/2", + Renotify: 14, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, + Labels: model.LabelSet{ + "kafka": "yes", + "ruleId": "ruleId-HighLatency", + "service": "db", + }, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + Renotify: 13, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[5], inputAlerts[6]}, + Labels: model.LabelSet{ + "kafka": "yes", + "ruleId": "ruleId-HighLatency", + "service": "db", + }, + Receiver: "pagerduty", + GroupKey: "{}/{ruleId=\"ruleId-HighLatency\"}:{kafka=\"yes\", ruleId=\"ruleId-HighLatency\", service=\"db\"}", + RouteID: "{}/{ruleId=\"ruleId-HighLatency\"}/2", + Renotify: 13, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[1]}, + Labels: model.LabelSet{ + "instance": "inst1", + "ruleId": "ruleId-TestingAlert", + "service": "api", + }, + Receiver: "slack", + GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{instance=\"inst1\", ruleId=\"ruleId-TestingAlert\", service=\"api\"}", + RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", + Renotify: 11, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[2]}, + Labels: model.LabelSet{ + "cluster": "aa", + "instance": "inst1", + "ruleId": "ruleId-HighErrorRate", + }, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + Renotify: 12, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[3]}, + Labels: model.LabelSet{ + "cluster": "aa", + "instance": "inst2", + "ruleId": "ruleId-HighErrorRate", + }, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"aa\", instance=\"inst2\", ruleId=\"ruleId-HighErrorRate\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + Renotify: 12, + }, + &AlertGroup{ + Alerts: []*types.Alert{inputAlerts[4]}, + Labels: model.LabelSet{ + "cluster": "bb", + "instance": "inst1", + "ruleId": "ruleId-HighErrorRate", + }, + Receiver: "email", + GroupKey: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}:{cluster=\"bb\", instance=\"inst1\", ruleId=\"ruleId-HighErrorRate\"}", + RouteID: "{}/{ruleId=~\"ruleId-HighLatency|ruleId-HighErrorRate\"}/1", + Renotify: 12, + }, + 
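		// Final group: the parent-route OtherAlert alert, grouped by
		// ruleId/cluster/service and delivered via slack.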
&AlertGroup{ + Alerts: []*types.Alert{inputAlerts[0]}, + Labels: model.LabelSet{ + "cluster": "cc", + "ruleId": "ruleId-OtherAlert", + "service": "dd", + }, + Receiver: "slack", + GroupKey: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}:{cluster=\"cc\", ruleId=\"ruleId-OtherAlert\", service=\"dd\"}", + RouteID: "{}/{ruleId=~\"ruleId-OtherAlert|ruleId-TestingAlert\"}/0", + Renotify: 10, + }, + }, alertGroups) + require.Equal(t, map[model.Fingerprint][]string{ + inputAlerts[0].Fingerprint(): {"slack"}, + inputAlerts[1].Fingerprint(): {"slack"}, + inputAlerts[2].Fingerprint(): {"email"}, + inputAlerts[3].Fingerprint(): {"email"}, + inputAlerts[4].Fingerprint(): {"email"}, + inputAlerts[5].Fingerprint(): {"email", "pagerduty"}, + inputAlerts[6].Fingerprint(): {"email", "pagerduty"}, + inputAlerts[7].Fingerprint(): {"email", "pagerduty"}, + }, receivers) +} + +type recordStage struct { + mtx sync.RWMutex + alerts map[string]map[model.Fingerprint]*types.Alert +} + +func (r *recordStage) Alerts() []*types.Alert { + r.mtx.RLock() + defer r.mtx.RUnlock() + alerts := make([]*types.Alert, 0) + for k := range r.alerts { + for _, a := range r.alerts[k] { + alerts = append(alerts, a) + } + } + return alerts +} + +func (r *recordStage) Exec(ctx context.Context, l *slog.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) { + r.mtx.Lock() + defer r.mtx.Unlock() + gk, ok := notify.GroupKey(ctx) + if !ok { + panic("GroupKey not present!") + } + if _, ok := r.alerts[gk]; !ok { + r.alerts[gk] = make(map[model.Fingerprint]*types.Alert) + } + for _, a := range alerts { + r.alerts[gk][a.Fingerprint()] = a + } + return ctx, nil, nil +} + +var ( + // Set the start time in the past to trigger a flush immediately. + t0 = time.Now().Add(-time.Minute) + // Set the end time in the future to avoid deleting the alert. + t1 = t0.Add(2 * time.Minute) +) + +func newAlert(labels model.LabelSet) *types.Alert { + return &types.Alert{ + Alert: model.Alert{ + Labels: labels, + Annotations: model.LabelSet{"foo": "bar"}, + StartsAt: t0, + EndsAt: t1, + GeneratorURL: "http://example.com/prometheus", + }, + UpdatedAt: t0, + Timeout: false, + } +} + +func TestDispatcherRace(t *testing.T) { + logger := promslog.NewNopLogger() + marker := types.NewMarker(prometheus.NewRegistry()) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) + if err != nil { + t.Fatal(err) + } + defer alerts.Close() + + timeout := func(d time.Duration) time.Duration { return time.Duration(0) } + metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) + nfManager := nfmanagertest.NewMock() + // Set up default expectation that won't be called in this race test + dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, nil, logger, metrics, nfManager, "test-org") + go dispatcher.Run() + dispatcher.Stop() +} + +func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T) { + const numAlerts = 5000 + + logger := promslog.NewNopLogger() + marker := types.NewMarker(prometheus.NewRegistry()) + alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, nil, logger, nil) + if err != nil { + t.Fatal(err) + } + defer alerts.Close() + + route := &dispatch.Route{ + RouteOpts: dispatch.RouteOpts{ + Receiver: "default", + GroupBy: map[model.LabelName]struct{}{"ruleId": {}}, + GroupWait: 0, + GroupInterval: 1 * time.Hour, // Should never hit in this test. + RepeatInterval: 1 * time.Hour, // Should never hit in this test. 
+ }, + } + + timeout := func(d time.Duration) time.Duration { return d } + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + metrics := NewDispatcherMetrics(false, prometheus.NewRegistry()) + nfManager := nfmanagertest.NewMock() + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, metrics, nfManager, "test-org") + go dispatcher.Run() + defer dispatcher.Stop() + + // Push all alerts. + for i := 0; i < numAlerts; i++ { + alert := newAlert(model.LabelSet{"ruleId": model.LabelValue(fmt.Sprintf("Alert_%d", i))}) + require.NoError(t, alerts.Put(alert)) + } + + // Wait until the alerts have been notified or the waiting timeout expires. + for deadline := time.Now().Add(5 * time.Second); time.Now().Before(deadline); { + if len(recorder.Alerts()) >= numAlerts { + break + } + + // Throttle. + time.Sleep(10 * time.Millisecond) + } + + // We expect all alerts to be notified immediately, since they all belong to different groups. + require.Len(t, recorder.Alerts(), numAlerts) +} + +func TestDispatcher_DoMaintenance(t *testing.T) { + r := prometheus.NewRegistry() + marker := types.NewMarker(r) + + alerts, err := mem.NewAlerts(context.Background(), marker, time.Minute, nil, promslog.NewNopLogger(), nil) + if err != nil { + t.Fatal(err) + } + + route := &dispatch.Route{ + RouteOpts: dispatch.RouteOpts{ + GroupBy: map[model.LabelName]struct{}{"alertname": {}}, + GroupWait: 0, + GroupInterval: 5 * time.Minute, // Should never hit in this test. + }, + } + timeout := func(d time.Duration) time.Duration { return d } + recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)} + + ctx := context.Background() + metrics := NewDispatcherMetrics(false, r) + nfManager := nfmanagertest.NewMock() + // Set up default expectation that may be called during maintenance + dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, promslog.NewNopLogger(), metrics, nfManager, "test-org") + aggrGroups := make(map[*dispatch.Route]map[model.Fingerprint]*aggrGroup) + aggrGroups[route] = make(map[model.Fingerprint]*aggrGroup) + + // Insert an aggregation group with no alerts. + labels := model.LabelSet{"alertname": "1"} + aggrGroup1 := newAggrGroup(ctx, labels, route, timeout, promslog.NewNopLogger(), time.Hour) + aggrGroups[route][aggrGroup1.fingerprint()] = aggrGroup1 + dispatcher.aggrGroupsPerRoute = aggrGroups + // Must run otherwise doMaintenance blocks on aggrGroup1.stop(). + go aggrGroup1.run(func(context.Context, ...*types.Alert) bool { return true }) + + // Insert a marker for the aggregation group's group key. + marker.SetMuted(route.ID(), aggrGroup1.GroupKey(), []string{"weekends"}) + mutedBy, isMuted := marker.Muted(route.ID(), aggrGroup1.GroupKey()) + require.True(t, isMuted) + require.Equal(t, []string{"weekends"}, mutedBy) + + // Run the maintenance and the marker should be removed. 
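	// doMaintenance is expected to garbage-collect aggregation groups that hold
	// no alerts and to clear any mute markers recorded for them; the assertions
	// below check the marker side of that cleanup for aggrGroup1.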
+ dispatcher.doMaintenance() + mutedBy, isMuted = marker.Muted(route.ID(), aggrGroup1.GroupKey()) + require.False(t, isMuted) + require.Empty(t, mutedBy) +} diff --git a/pkg/alertmanager/alertmanagerserver/server.go b/pkg/alertmanager/alertmanagerserver/server.go index 86ea94570be0..d4c0ddad7215 100644 --- a/pkg/alertmanager/alertmanagerserver/server.go +++ b/pkg/alertmanager/alertmanagerserver/server.go @@ -8,6 +8,7 @@ import ( "time" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagernotify" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/errors" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" "github.com/prometheus/alertmanager/dispatch" @@ -50,29 +51,31 @@ type Server struct { stateStore alertmanagertypes.StateStore // alertmanager primitives from upstream alertmanager - alerts *mem.Alerts - nflog *nflog.Log - dispatcher *dispatch.Dispatcher - dispatcherMetrics *dispatch.DispatcherMetrics - inhibitor *inhibit.Inhibitor - silencer *silence.Silencer - silences *silence.Silences - timeIntervals map[string][]timeinterval.TimeInterval - pipelineBuilder *notify.PipelineBuilder - marker *alertmanagertypes.MemMarker - tmpl *template.Template - wg sync.WaitGroup - stopc chan struct{} + alerts *mem.Alerts + nflog *nflog.Log + dispatcher *Dispatcher + dispatcherMetrics *DispatcherMetrics + inhibitor *inhibit.Inhibitor + silencer *silence.Silencer + silences *silence.Silences + timeIntervals map[string][]timeinterval.TimeInterval + pipelineBuilder *notify.PipelineBuilder + marker *alertmanagertypes.MemMarker + tmpl *template.Template + wg sync.WaitGroup + stopc chan struct{} + notificationManager nfmanager.NotificationManager } -func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registerer, srvConfig Config, orgID string, stateStore alertmanagertypes.StateStore) (*Server, error) { +func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registerer, srvConfig Config, orgID string, stateStore alertmanagertypes.StateStore, nfManager nfmanager.NotificationManager) (*Server, error) { server := &Server{ - logger: logger.With("pkg", "go.signoz.io/pkg/alertmanager/alertmanagerserver"), - registry: registry, - srvConfig: srvConfig, - orgID: orgID, - stateStore: stateStore, - stopc: make(chan struct{}), + logger: logger.With("pkg", "go.signoz.io/pkg/alertmanager/alertmanagerserver"), + registry: registry, + srvConfig: srvConfig, + orgID: orgID, + stateStore: stateStore, + stopc: make(chan struct{}), + notificationManager: nfManager, } signozRegisterer := prometheus.WrapRegistererWithPrefix("signoz_", registry) signozRegisterer = prometheus.WrapRegistererWith(prometheus.Labels{"org_id": server.orgID}, signozRegisterer) @@ -190,7 +193,7 @@ func New(ctx context.Context, logger *slog.Logger, registry prometheus.Registere } server.pipelineBuilder = notify.NewPipelineBuilder(signozRegisterer, featurecontrol.NoopFlags{}) - server.dispatcherMetrics = dispatch.NewDispatcherMetrics(false, signozRegisterer) + server.dispatcherMetrics = NewDispatcherMetrics(false, signozRegisterer) return server, nil } @@ -204,7 +207,6 @@ func (server *Server) GetAlerts(ctx context.Context, params alertmanagertypes.Ge func (server *Server) PutAlerts(ctx context.Context, postableAlerts alertmanagertypes.PostableAlerts) error { alerts, err := alertmanagertypes.NewAlertsFromPostableAlerts(postableAlerts, time.Duration(server.srvConfig.Global.ResolveTimeout), time.Now()) - // Notification sending alert takes precedence over validation errors. 
if err := server.alerts.Put(alerts...); err != nil { return err @@ -295,7 +297,7 @@ func (server *Server) SetConfig(ctx context.Context, alertmanagerConfig *alertma return d } - server.dispatcher = dispatch.NewDispatcher( + server.dispatcher = NewDispatcher( server.alerts, routes, pipeline, @@ -304,6 +306,8 @@ func (server *Server) SetConfig(ctx context.Context, alertmanagerConfig *alertma nil, server.logger, server.dispatcherMetrics, + server.notificationManager, + server.orgID, ) // Do not try to add these to server.wg as there seems to be a race condition if diff --git a/pkg/alertmanager/alertmanagerserver/server_test.go b/pkg/alertmanager/alertmanagerserver/server_test.go index f5cc5fe67004..8aad88b2ff4a 100644 --- a/pkg/alertmanager/alertmanagerserver/server_test.go +++ b/pkg/alertmanager/alertmanagerserver/server_test.go @@ -11,6 +11,7 @@ import ( "testing" "time" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" "github.com/SigNoz/signoz/pkg/types/alertmanagertypes/alertmanagertypestest" "github.com/go-openapi/strfmt" @@ -23,7 +24,8 @@ import ( ) func TestServerSetConfigAndStop(t *testing.T) { - server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore()) + notificationManager := nfmanagertest.NewMock() + server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager) require.NoError(t, err) amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1") @@ -34,7 +36,8 @@ func TestServerSetConfigAndStop(t *testing.T) { } func TestServerTestReceiverTypeWebhook(t *testing.T) { - server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore()) + notificationManager := nfmanagertest.NewMock() + server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), NewConfig(), "1", alertmanagertypestest.NewStateStore(), notificationManager) require.NoError(t, err) amConfig, err := alertmanagertypes.NewDefaultConfig(alertmanagertypes.GlobalConfig{}, alertmanagertypes.RouteConfig{GroupInterval: 1 * time.Minute, RepeatInterval: 1 * time.Minute, GroupWait: 1 * time.Minute}, "1") @@ -81,7 +84,8 @@ func TestServerPutAlerts(t *testing.T) { stateStore := alertmanagertypestest.NewStateStore() srvCfg := NewConfig() srvCfg.Route.GroupInterval = 1 * time.Second - server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore) + notificationManager := nfmanagertest.NewMock() + server, err := New(context.Background(), slog.New(slog.NewTextHandler(io.Discard, nil)), prometheus.NewRegistry(), srvCfg, "1", stateStore, notificationManager) require.NoError(t, err) amConfig, err := alertmanagertypes.NewDefaultConfig(srvCfg.Global, srvCfg.Route, "1") diff --git a/pkg/alertmanager/alertmanagerserver/telemetry.go b/pkg/alertmanager/alertmanagerserver/telemetry.go new file mode 100644 index 000000000000..1ad1d0f55590 --- /dev/null +++ b/pkg/alertmanager/alertmanagerserver/telemetry.go @@ -0,0 +1,43 @@ +package alertmanagerserver + 
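+
+// telemetry.go carries a local copy of upstream alertmanager's dispatcher
+// metrics, renamed under the signoz_alertmanager_ prefix, so the forked
+// Dispatcher in this package no longer depends on dispatch.NewDispatcherMetrics.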
+import "github.com/prometheus/client_golang/prometheus"
+
+type DispatcherMetrics struct {
+	aggrGroups            prometheus.Gauge
+	processingDuration    prometheus.Summary
+	aggrGroupLimitReached prometheus.Counter
+}
+
+// NewDispatcherMetrics returns a new registered DispatcherMetrics.
+// todo(aniketio-ctrl): change prom metrics to otel metrics
+func NewDispatcherMetrics(registerLimitMetrics bool, r prometheus.Registerer) *DispatcherMetrics {
+	m := DispatcherMetrics{
+		aggrGroups: prometheus.NewGauge(
+			prometheus.GaugeOpts{
+				Name: "signoz_alertmanager_dispatcher_aggregation_groups",
+				Help: "Number of active aggregation groups",
+			},
+		),
+		processingDuration: prometheus.NewSummary(
+			prometheus.SummaryOpts{
+				Name: "signoz_alertmanager_dispatcher_alert_processing_duration_seconds",
+				Help: "Summary of latencies for the processing of alerts.",
+			},
+		),
+		aggrGroupLimitReached: prometheus.NewCounter(
+			prometheus.CounterOpts{
+				Name: "signoz_alertmanager_dispatcher_aggregation_group_limit_reached_total",
+				Help: "Number of times when dispatcher failed to create new aggregation group due to limit.",
+			},
+		),
+	}
+
+	if r != nil {
+		r.MustRegister(m.aggrGroups, m.processingDuration)
+		if registerLimitMetrics {
+			r.MustRegister(m.aggrGroupLimitReached)
+		}
+	}
+
+	return &m
+}
diff --git a/pkg/alertmanager/nfmanager/config.go b/pkg/alertmanager/nfmanager/config.go
new file mode 100644
index 000000000000..641cdf670132
--- /dev/null
+++ b/pkg/alertmanager/nfmanager/config.go
@@ -0,0 +1,18 @@
+package nfmanager
+
+import "github.com/SigNoz/signoz/pkg/factory"
+
+type Config struct {
+}
+
+func NewConfigFactory() factory.ConfigFactory {
+	return factory.NewConfigFactory(factory.MustNewName("nfmanager"), newConfig)
+}
+
+func newConfig() factory.Config {
+	return Config{}
+}
+
+func (c Config) Validate() error {
+	return nil
+}
diff --git a/pkg/alertmanager/nfmanager/nfmanagertest/provider.go b/pkg/alertmanager/nfmanager/nfmanagertest/provider.go
new file mode 100644
index 000000000000..2a321ce80712
--- /dev/null
+++ b/pkg/alertmanager/nfmanager/nfmanagertest/provider.go
@@ -0,0 +1,75 @@
+package nfmanagertest
+
+import (
+	"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
+)
+
+// MockNotificationManager is a simple mock implementation of NotificationManager
+type MockNotificationManager struct {
+	configs map[string]*alertmanagertypes.NotificationConfig
+	errors  map[string]error
+}
+
+// NewMock creates a new mock notification manager
+func NewMock() *MockNotificationManager {
+	return &MockNotificationManager{
+		configs: make(map[string]*alertmanagertypes.NotificationConfig),
+		errors:  make(map[string]error),
+	}
+}
+
+func getKey(orgId string, ruleId string) string {
+	return orgId + ":" + ruleId
+}
+
+func (m *MockNotificationManager) GetNotificationConfig(orgID string, ruleID string) (*alertmanagertypes.NotificationConfig, error) {
+	key := getKey(orgID, ruleID)
+	if err := m.errors[key]; err != nil {
+		return nil, err
+	}
+	if config := m.configs[key]; config != nil {
+		return config, nil
+	}
+
+	notificationConfig := alertmanagertypes.GetDefaultNotificationConfig()
+	return &notificationConfig, nil
+}
+
+func (m *MockNotificationManager) SetNotificationConfig(orgID string, ruleID string, config *alertmanagertypes.NotificationConfig) error {
+	key := getKey(orgID, ruleID)
+	if err := m.errors[key]; err != nil {
+		return err
+	}
+	m.configs[key] = config
+	return nil
+}
+
+func (m *MockNotificationManager) DeleteNotificationConfig(orgID string, ruleID string) error {
+	key := getKey(orgID, ruleID)
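+	// Keys are "orgID:ruleID" composites (see getKey above), giving the mock the
+	// same per-org, per-rule scoping as the real rule-based provider.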
+	if err := m.errors[key]; err != nil {
+		return err
+	}
+	delete(m.configs, key)
+	return nil
+}
+
+func (m *MockNotificationManager) SetMockConfig(orgID, ruleID string, config *alertmanagertypes.NotificationConfig) {
+	key := getKey(orgID, ruleID)
+	m.configs[key] = config
+}
+
+func (m *MockNotificationManager) SetMockError(orgID, ruleID string, err error) {
+	key := getKey(orgID, ruleID)
+	m.errors[key] = err
+}
+
+func (m *MockNotificationManager) ClearMockData() {
+	m.configs = make(map[string]*alertmanagertypes.NotificationConfig)
+	m.errors = make(map[string]error)
+}
+
+func (m *MockNotificationManager) HasConfig(orgID, ruleID string) bool {
+	key := getKey(orgID, ruleID)
+	_, exists := m.configs[key]
+	return exists
+}
diff --git a/pkg/alertmanager/nfmanager/notificationmanager.go b/pkg/alertmanager/nfmanager/notificationmanager.go
new file mode 100644
index 000000000000..531c2baae725
--- /dev/null
+++ b/pkg/alertmanager/nfmanager/notificationmanager.go
@@ -0,0 +1,13 @@
+// Package nfmanager provides interfaces and implementations for alert notification grouping strategies.
+package nfmanager
+
+import (
+	"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
+)
+
+// NotificationManager defines how alerts should be grouped and configured for notification with multi-tenancy support.
+type NotificationManager interface {
+	GetNotificationConfig(orgID string, ruleID string) (*alertmanagertypes.NotificationConfig, error)
+	SetNotificationConfig(orgID string, ruleID string, config *alertmanagertypes.NotificationConfig) error
+	DeleteNotificationConfig(orgID string, ruleID string) error
+}
diff --git a/pkg/alertmanager/nfmanager/rulebasednotification/provider.go b/pkg/alertmanager/nfmanager/rulebasednotification/provider.go
new file mode 100644
index 000000000000..0ce4141ad1d9
--- /dev/null
+++ b/pkg/alertmanager/nfmanager/rulebasednotification/provider.go
@@ -0,0 +1,103 @@
+package rulebasednotification
+
+import (
+	"context"
+	"sync"
+
+	"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager"
+	"github.com/SigNoz/signoz/pkg/errors"
+	"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
+
+	"github.com/SigNoz/signoz/pkg/factory"
+)
+
+type provider struct {
+	settings                             factory.ScopedProviderSettings
+	orgToFingerprintToNotificationConfig map[string]map[string]alertmanagertypes.NotificationConfig
+	mutex                                sync.RWMutex
+}
+
+// NewFactory creates a new factory for the rule-based grouping strategy.
+func NewFactory() factory.ProviderFactory[nfmanager.NotificationManager, nfmanager.Config] {
+	return factory.NewProviderFactory(
+		factory.MustNewName("rulebased"),
+		func(ctx context.Context, settings factory.ProviderSettings, config nfmanager.Config) (nfmanager.NotificationManager, error) {
+			return New(ctx, settings, config)
+		},
+	)
+}
+
+// New creates a new rule-based grouping strategy provider.
+func New(ctx context.Context, providerSettings factory.ProviderSettings, config nfmanager.Config) (nfmanager.NotificationManager, error) {
+	settings := factory.NewScopedProviderSettings(providerSettings, "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification")
+
+	return &provider{
+		settings:                             settings,
+		orgToFingerprintToNotificationConfig: make(map[string]map[string]alertmanagertypes.NotificationConfig),
+	}, nil
+}
+
+// GetNotificationConfig retrieves the notification configuration for the specified rule and organization.
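+// The result starts from alertmanagertypes.GetDefaultNotificationConfig(); when
+// a config has been stored for the rule, its non-zero renotify and no-data
+// intervals override the defaults and its group-by labels are merged in on top.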
+func (r *provider) GetNotificationConfig(orgID string, ruleID string) (*alertmanagertypes.NotificationConfig, error) {
+	notificationConfig := alertmanagertypes.GetDefaultNotificationConfig()
+	if orgID == "" || ruleID == "" {
+		return &notificationConfig, nil
+	}
+
+	r.mutex.RLock()
+	defer r.mutex.RUnlock()
+
+	if orgConfigs, exists := r.orgToFingerprintToNotificationConfig[orgID]; exists {
+		if config, configExists := orgConfigs[ruleID]; configExists {
+			if config.Renotify.RenotifyInterval != 0 {
+				notificationConfig.Renotify.RenotifyInterval = config.Renotify.RenotifyInterval
+			}
+			if config.Renotify.NoDataInterval != 0 {
+				notificationConfig.Renotify.NoDataInterval = config.Renotify.NoDataInterval
+			}
+			for k, v := range config.NotificationGroup {
+				notificationConfig.NotificationGroup[k] = v
+			}
+		}
+	}
+
+	return &notificationConfig, nil
+}
+
+// SetNotificationConfig updates the notification configuration for the specified rule and organization.
+func (r *provider) SetNotificationConfig(orgID string, ruleID string, config *alertmanagertypes.NotificationConfig) error {
+	if orgID == "" || ruleID == "" {
+		return errors.NewInvalidInputf(errors.CodeInvalidInput, "no org or rule id provided")
+	}
+
+	if config == nil {
+		return errors.NewInvalidInputf(errors.CodeInvalidInput, "notification config cannot be nil")
+	}
+
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+
+	// Initialize org map if it doesn't exist
+	if r.orgToFingerprintToNotificationConfig[orgID] == nil {
+		r.orgToFingerprintToNotificationConfig[orgID] = make(map[string]alertmanagertypes.NotificationConfig)
+	}
+
+	r.orgToFingerprintToNotificationConfig[orgID][ruleID] = config.DeepCopy()
+
+	return nil
+}
+
+// DeleteNotificationConfig removes the stored notification configuration for the specified rule and organization.
+func (r *provider) DeleteNotificationConfig(orgID string, ruleID string) error {
+	if orgID == "" || ruleID == "" {
+		return errors.NewInvalidInputf(errors.CodeInvalidInput, "no org or rule id provided")
+	}
+
+	r.mutex.Lock()
+	defer r.mutex.Unlock()
+
+	if _, exists := r.orgToFingerprintToNotificationConfig[orgID]; exists {
+		delete(r.orgToFingerprintToNotificationConfig[orgID], ruleID)
+	}
+
+	return nil
+}
diff --git a/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go b/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go
new file mode 100644
index 000000000000..b380cc1cee89
--- /dev/null
+++ b/pkg/alertmanager/nfmanager/rulebasednotification/provider_test.go
@@ -0,0 +1,270 @@
+package rulebasednotification
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager"
+	"github.com/SigNoz/signoz/pkg/factory"
+	"github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest"
+	"github.com/SigNoz/signoz/pkg/types/alertmanagertypes"
+	"github.com/prometheus/alertmanager/types"
+	"github.com/prometheus/common/model"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func createTestProviderSettings() factory.ProviderSettings {
+	return instrumentationtest.New().ToProviderSettings()
+}
+
+func TestNewFactory(t *testing.T) {
+	providerFactory := NewFactory()
+	assert.NotNil(t, providerFactory)
+	assert.Equal(t, "rulebased", providerFactory.Name().String())
+}
+
+func TestNew(t *testing.T) {
+	ctx := context.Background()
+	providerSettings := createTestProviderSettings()
+	config := nfmanager.Config{}
+
+	provider, err := New(ctx, providerSettings, config)
+	require.NoError(t, err)
+	assert.NotNil(t, provider)
+
+	// Verify provider implements the interface correctly
+	assert.Implements(t,
(*nfmanager.NotificationManager)(nil), provider)
+}
+
+func TestProvider_SetNotificationConfig(t *testing.T) {
+	ctx := context.Background()
+	providerSettings := createTestProviderSettings()
+	config := nfmanager.Config{}
+
+	provider, err := New(ctx, providerSettings, config)
+	require.NoError(t, err)
+
+	tests := []struct {
+		name    string
+		orgID   string
+		ruleID  string
+		config  *alertmanagertypes.NotificationConfig
+		wantErr bool
+	}{
+		{
+			name:   "valid parameters",
+			orgID:  "org1",
+			ruleID: "rule1",
+			config: &alertmanagertypes.NotificationConfig{
+				Renotify: alertmanagertypes.ReNotificationConfig{
+					RenotifyInterval: 2 * time.Hour,
+					NoDataInterval:   2 * time.Hour,
+				},
+			},
+			wantErr: false,
+		},
+		{
+			name:   "empty orgID",
+			orgID:  "",
+			ruleID: "rule1",
+			config: &alertmanagertypes.NotificationConfig{
+				Renotify: alertmanagertypes.ReNotificationConfig{
+					RenotifyInterval: time.Hour,
+					NoDataInterval:   time.Hour,
+				},
+			},
+			wantErr: true, // Should error due to validation
+		},
+		{
+			name:   "empty ruleID",
+			orgID:  "org1",
+			ruleID: "",
+			config: &alertmanagertypes.NotificationConfig{
+				Renotify: alertmanagertypes.ReNotificationConfig{
+					RenotifyInterval: time.Hour,
+					NoDataInterval:   time.Hour,
+				},
+			},
+			wantErr: true, // Should error due to validation
+		},
+		{
+			name:    "nil config",
+			orgID:   "org1",
+			ruleID:  "rule1",
+			config:  nil,
+			wantErr: true, // Should error due to nil config
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := provider.SetNotificationConfig(tt.orgID, tt.ruleID, tt.config)
+			if tt.wantErr {
+				assert.Error(t, err)
+			} else {
+				assert.NoError(t, err)
+
+				// If we set a config successfully, we should be able to retrieve it
+				if tt.orgID != "" && tt.ruleID != "" && tt.config != nil {
+					retrievedConfig, retrieveErr := provider.GetNotificationConfig(tt.orgID, tt.ruleID)
+					assert.NoError(t, retrieveErr)
+					assert.NotNil(t, retrievedConfig)
+					assert.Equal(t, tt.config.Renotify, retrievedConfig.Renotify)
+				}
+			}
+		})
+	}
+}
+
+func TestProvider_GetNotificationConfig(t *testing.T) {
+	ctx := context.Background()
+	providerSettings := createTestProviderSettings()
+	config := nfmanager.Config{}
+
+	provider, err := New(ctx, providerSettings, config)
+	require.NoError(t, err)
+
+	orgID := "test-org"
+	ruleID := "rule1"
+	customConfig := &alertmanagertypes.NotificationConfig{
+		Renotify: alertmanagertypes.ReNotificationConfig{
+			RenotifyInterval: 30 * time.Minute,
+			NoDataInterval:   30 * time.Minute,
+		},
+	}
+
+	ruleId1 := "rule-1"
+	customConfig1 := &alertmanagertypes.NotificationConfig{
+		NotificationGroup: map[model.LabelName]struct{}{
+			model.LabelName("group1"): {},
+			model.LabelName("group2"): {},
+		},
+	}
+
+	// Set custom configs for both rules
+	err = provider.SetNotificationConfig(orgID, ruleID, customConfig)
+	require.NoError(t, err)
+
+	err = provider.SetNotificationConfig(orgID, ruleId1, customConfig1)
+	require.NoError(t, err)
+
+	tests := []struct {
+		name           string
+		orgID          string
+		ruleID         string
+		alert          *types.Alert
+		expectedConfig *alertmanagertypes.NotificationConfig
+		shouldFallback bool
+	}{
+		{
+			name:   "existing config",
+			orgID:  orgID,
+			ruleID: ruleID,
+			expectedConfig: &alertmanagertypes.NotificationConfig{
+				NotificationGroup: map[model.LabelName]struct{}{
+					model.LabelName("ruleId"): {},
+				},
+				Renotify: alertmanagertypes.ReNotificationConfig{
+					RenotifyInterval: 30 * time.Minute,
+					NoDataInterval:   30 * time.Minute,
+				},
+			},
+			shouldFallback: false,
+		},
+		{
+			name:   "stored group labels - intervals fall back to defaults",
+			orgID:  orgID,
+			ruleID:
ruleId1, + expectedConfig: &alertmanagertypes.NotificationConfig{ + NotificationGroup: map[model.LabelName]struct{}{ + model.LabelName("group1"): {}, + model.LabelName("group2"): {}, + model.LabelName("ruleId"): {}, + }, + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: 4 * time.Hour, + NoDataInterval: 4 * time.Hour, + }, + }, // Will get fallback from standardnotification + shouldFallback: false, + }, + { + name: "empty orgID - fallback", + orgID: "", + ruleID: ruleID, + expectedConfig: nil, // Will get fallback + shouldFallback: true, + }, + { + name: "nil alert - fallback", + orgID: orgID, + ruleID: "rule3", // Different ruleID to get fallback + alert: nil, + expectedConfig: nil, // Will get fallback + shouldFallback: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + config, err := provider.GetNotificationConfig(tt.orgID, tt.ruleID) + assert.NoError(t, err) + + if tt.shouldFallback { + // Should get fallback config (4 hour default) + assert.NotNil(t, config) + assert.Equal(t, 4*time.Hour, config.Renotify.RenotifyInterval) + } else { + // Should get our custom config + assert.NotNil(t, config) + assert.Equal(t, tt.expectedConfig, config) + } + }) + } +} + +func TestProvider_ConcurrentAccess(t *testing.T) { + ctx := context.Background() + providerSettings := createTestProviderSettings() + config := nfmanager.Config{} + + provider, err := New(ctx, providerSettings, config) + require.NoError(t, err) + + orgID := "test-org" + + var wg sync.WaitGroup + + // Writer goroutine + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + config := &alertmanagertypes.NotificationConfig{ + Renotify: alertmanagertypes.ReNotificationConfig{ + RenotifyInterval: time.Duration(i+1) * time.Minute, + NoDataInterval: time.Duration(i+1) * time.Minute, + }, + } + err := provider.SetNotificationConfig(orgID, "rule1", config) + assert.NoError(t, err) + } + }() + + // Reader goroutine + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + config, err := provider.GetNotificationConfig(orgID, "rule1") + assert.NoError(t, err) + assert.NotNil(t, config) + } + }() + + // Wait for both goroutines to complete + wg.Wait() +} diff --git a/pkg/alertmanager/service.go b/pkg/alertmanager/service.go index c67c9e9edc40..163c673b7622 100644 --- a/pkg/alertmanager/service.go +++ b/pkg/alertmanager/service.go @@ -5,6 +5,7 @@ import ( "sync" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/errors" "github.com/SigNoz/signoz/pkg/factory" "github.com/SigNoz/signoz/pkg/modules/organization" @@ -32,6 +33,8 @@ type Service struct { // Mutex to protect the servers map serversMtx sync.RWMutex + + notificationManager nfmanager.NotificationManager } func New( @@ -41,15 +44,17 @@ func New( stateStore alertmanagertypes.StateStore, configStore alertmanagertypes.ConfigStore, orgGetter organization.Getter, + nfManager nfmanager.NotificationManager, ) *Service { service := &Service{ - config: config, - stateStore: stateStore, - configStore: configStore, - orgGetter: orgGetter, - settings: settings, - servers: make(map[string]*alertmanagerserver.Server), - serversMtx: sync.RWMutex{}, + config: config, + stateStore: stateStore, + configStore: configStore, + orgGetter: orgGetter, + settings: settings, + servers: make(map[string]*alertmanagerserver.Server), + serversMtx: sync.RWMutex{}, + notificationManager: nfManager, } return service @@ -167,7 +172,7 @@ func 
(service *Service) newServer(ctx context.Context, orgID string) (*alertmana return nil, err } - server, err := alertmanagerserver.New(ctx, service.settings.Logger(), service.settings.PrometheusRegisterer(), service.config, orgID, service.stateStore) + server, err := alertmanagerserver.New(ctx, service.settings.Logger(), service.settings.PrometheusRegisterer(), service.config, orgID, service.stateStore, service.notificationManager) if err != nil { return nil, err } diff --git a/pkg/alertmanager/signozalertmanager/provider.go b/pkg/alertmanager/signozalertmanager/provider.go index 3f2b274585c7..a92c5ab4e89f 100644 --- a/pkg/alertmanager/signozalertmanager/provider.go +++ b/pkg/alertmanager/signozalertmanager/provider.go @@ -6,6 +6,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerstore/sqlalertmanagerstore" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/errors" "github.com/SigNoz/signoz/pkg/factory" "github.com/SigNoz/signoz/pkg/modules/organization" @@ -15,21 +16,22 @@ import ( ) type provider struct { - service *alertmanager.Service - config alertmanager.Config - settings factory.ScopedProviderSettings - configStore alertmanagertypes.ConfigStore - stateStore alertmanagertypes.StateStore - stopC chan struct{} + service *alertmanager.Service + config alertmanager.Config + settings factory.ScopedProviderSettings + configStore alertmanagertypes.ConfigStore + stateStore alertmanagertypes.StateStore + notificationManager nfmanager.NotificationManager + stopC chan struct{} } -func NewFactory(sqlstore sqlstore.SQLStore, orgGetter organization.Getter) factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config] { +func NewFactory(sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config] { return factory.NewProviderFactory(factory.MustNewName("signoz"), func(ctx context.Context, settings factory.ProviderSettings, config alertmanager.Config) (alertmanager.Alertmanager, error) { - return New(ctx, settings, config, sqlstore, orgGetter) + return New(ctx, settings, config, sqlstore, orgGetter, notificationManager) }) } -func New(ctx context.Context, providerSettings factory.ProviderSettings, config alertmanager.Config, sqlstore sqlstore.SQLStore, orgGetter organization.Getter) (*provider, error) { +func New(ctx context.Context, providerSettings factory.ProviderSettings, config alertmanager.Config, sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) (*provider, error) { settings := factory.NewScopedProviderSettings(providerSettings, "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager") configStore := sqlalertmanagerstore.NewConfigStore(sqlstore) stateStore := sqlalertmanagerstore.NewStateStore(sqlstore) @@ -42,12 +44,14 @@ func New(ctx context.Context, providerSettings factory.ProviderSettings, config stateStore, configStore, orgGetter, + notificationManager, ), - settings: settings, - config: config, - configStore: configStore, - stateStore: stateStore, - stopC: make(chan struct{}), + settings: settings, + config: config, + configStore: configStore, + stateStore: stateStore, + notificationManager: notificationManager, + stopC: make(chan struct{}), } return p, nil @@ -191,3 +195,19 @@ func (provider *provider) Collect(ctx context.Context, orgID valuer.UUID) (map[s return 
alertmanagertypes.NewStatsFromChannels(channels), nil } + +func (provider *provider) SetNotificationConfig(ctx context.Context, orgID valuer.UUID, ruleId string, config *alertmanagertypes.NotificationConfig) error { + err := provider.notificationManager.SetNotificationConfig(orgID.StringValue(), ruleId, config) + if err != nil { + return err + } + return nil +} + +func (provider *provider) DeleteNotificationConfig(ctx context.Context, orgID valuer.UUID, ruleId string) error { + err := provider.notificationManager.DeleteNotificationConfig(orgID.StringValue(), ruleId) + if err != nil { + return err + } + return nil +} diff --git a/pkg/query-service/app/cloudintegrations/controller_test.go b/pkg/query-service/app/cloudintegrations/controller_test.go index 2bbcd200d0b5..345a2599b37c 100644 --- a/pkg/query-service/app/cloudintegrations/controller_test.go +++ b/pkg/query-service/app/cloudintegrations/controller_test.go @@ -7,6 +7,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics/analyticstest" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" @@ -35,7 +36,9 @@ func TestRegenerateConnectionUrlWithUpdatedConfig(t *testing.T) { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(err) orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlStore), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter, notificationManager) require.NoError(err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() @@ -92,7 +95,9 @@ func TestAgentCheckIns(t *testing.T) { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(err) orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlStore), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter, notificationManager) require.NoError(err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() @@ -188,7 +193,9 @@ func TestCantDisconnectNonExistentAccount(t *testing.T) { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(err) orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlStore), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, 
alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter, notificationManager) require.NoError(err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() @@ -216,7 +223,9 @@ func TestConfigureService(t *testing.T) { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(err) orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlStore), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter, notificationManager) require.NoError(err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/query-service/app/integrations/manager_test.go b/pkg/query-service/app/integrations/manager_test.go index aec4b0ac38c9..43f6706061e0 100644 --- a/pkg/query-service/app/integrations/manager_test.go +++ b/pkg/query-service/app/integrations/manager_test.go @@ -7,6 +7,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics/analyticstest" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" @@ -29,7 +30,8 @@ func TestIntegrationLifecycle(t *testing.T) { providerSettings := instrumentationtest.New().ToProviderSettings() sharder, _ := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) orgGetter := implorganization.NewGetter(implorganization.NewStore(store), sharder) - alertmanager, _ := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, store, orgGetter) + notificationManager := nfmanagertest.NewMock() + alertmanager, _ := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, store, orgGetter, notificationManager) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() analytics := analyticstest.New() diff --git a/pkg/query-service/rules/manager.go b/pkg/query-service/rules/manager.go index 1a57414f13e2..8263b03d5bbd 100644 --- a/pkg/query-service/rules/manager.go +++ b/pkg/query-service/rules/manager.go @@ -226,6 +226,7 @@ func NewManager(o *ManagerOptions) (*Manager, error) { sqlstore: o.SqlStore, } + zap.L().Debug("Manager created successfully with NotificationGroup") return m, nil } @@ -278,7 
+279,14 @@ func (m *Manager) initiate(ctx context.Context) error { loadErrors = append(loadErrors, err) continue } - + if parsedRule.NotificationSettings != nil { + config := parsedRule.NotificationSettings.GetAlertManagerNotificationConfig() + err = m.alertmanager.SetNotificationConfig(ctx, org.ID, rec.ID.StringValue(), &config) + if err != nil { + loadErrors = append(loadErrors, err) + zap.L().Info("failed to set rule notification config", zap.String("ruleId", rec.ID.StringValue())) + } + } if !parsedRule.Disabled { err := m.addTask(ctx, org.ID, &parsedRule, taskName) if err != nil { @@ -360,17 +368,22 @@ func (m *Manager) EditRule(ctx context.Context, ruleStr string, id valuer.UUID) } else { preferredChannels = parsedRule.PreferredChannels } - err = cfg.UpdateRuleIDMatcher(id.StringValue(), preferredChannels) if err != nil { return err } + if parsedRule.NotificationSettings != nil { + config := parsedRule.NotificationSettings.GetAlertManagerNotificationConfig() + err = m.alertmanager.SetNotificationConfig(ctx, orgID, existingRule.ID.StringValue(), &config) + if err != nil { + return err + } + } err = m.alertmanager.SetConfig(ctx, cfg) if err != nil { return err } - err = m.syncRuleStateWithTask(ctx, orgID, prepareTaskName(existingRule.ID.StringValue()), &parsedRule) if err != nil { return err @@ -453,6 +466,11 @@ func (m *Manager) DeleteRule(ctx context.Context, idStr string) error { return err } + orgID, err := valuer.NewUUID(claims.OrgID) + if err != nil { + return err + } + return m.ruleStore.DeleteRule(ctx, id, func(ctx context.Context) error { cfg, err := m.alertmanager.GetConfig(ctx, claims.OrgID) if err != nil { @@ -469,6 +487,8 @@ func (m *Manager) DeleteRule(ctx context.Context, idStr string) error { return err } + err = m.alertmanager.DeleteNotificationConfig(ctx, orgID, id.String()) + taskName := prepareTaskName(id.StringValue()) m.deleteTask(taskName) @@ -547,6 +567,14 @@ func (m *Manager) CreateRule(ctx context.Context, ruleStr string) (*ruletypes.Ge preferredChannels = parsedRule.PreferredChannels } + if parsedRule.NotificationSettings != nil { + config := parsedRule.NotificationSettings.GetAlertManagerNotificationConfig() + err = m.alertmanager.SetNotificationConfig(ctx, orgID, storedRule.ID.StringValue(), &config) + if err != nil { + return err + } + } + err = cfg.CreateRuleIDMatcher(id.StringValue(), preferredChannels) if err != nil { return err @@ -558,7 +586,7 @@ func (m *Manager) CreateRule(ctx context.Context, ruleStr string) (*ruletypes.Ge } taskName := prepareTaskName(id.StringValue()) - if err := m.addTask(ctx, orgID, &parsedRule, taskName); err != nil { + if err = m.addTask(ctx, orgID, &parsedRule, taskName); err != nil { return err } @@ -732,13 +760,12 @@ func (m *Manager) prepareTestNotifyFunc() NotifyFunc { alert := alerts[0] generatorURL := alert.GeneratorURL - a := &alertmanagertypes.PostableAlert{ - Annotations: alert.Annotations.Map(), - StartsAt: strfmt.DateTime(alert.FiredAt), - Alert: alertmanagertypes.AlertModel{ - Labels: alert.Labels.Map(), - GeneratorURL: strfmt.URI(generatorURL), - }, + a := &alertmanagertypes.PostableAlert{} + a.Annotations = alert.Annotations.Map() + a.StartsAt = strfmt.DateTime(alert.FiredAt) + a.Alert = alertmanagertypes.AlertModel{ + Labels: alert.Labels.Map(), + GeneratorURL: strfmt.URI(generatorURL), } if !alert.ResolvedAt.IsZero() { a.EndsAt = strfmt.DateTime(alert.ResolvedAt) diff --git a/pkg/query-service/rules/manager_test.go b/pkg/query-service/rules/manager_test.go index 795a918345b1..2702a93153ea 100644 --- 
a/pkg/query-service/rules/manager_test.go +++ b/pkg/query-service/rules/manager_test.go @@ -5,6 +5,7 @@ import ( "testing" "time" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/stretchr/testify/assert" "go.uber.org/zap" @@ -265,7 +266,8 @@ func setupTestManager(t *testing.T) (*Manager, *rulestoretest.MockSQLRuleStore, t.Fatalf("Failed to create noop sharder: %v", err) } orgGetter := implorganization.NewGetter(implorganization.NewStore(testDB), noopSharder) - alertManager, err := signozalertmanager.New(context.TODO(), settings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter) + notificationManager := nfmanagertest.NewMock() + alertManager, err := signozalertmanager.New(context.TODO(), settings, alertmanager.Config{Provider: "signoz", Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter, notificationManager) if err != nil { t.Fatalf("Failed to create alert manager: %v", err) } diff --git a/pkg/query-service/rules/threshold_rule.go b/pkg/query-service/rules/threshold_rule.go index b1ebb069056b..536ee1cf2f0f 100644 --- a/pkg/query-service/rules/threshold_rule.go +++ b/pkg/query-service/rules/threshold_rule.go @@ -654,6 +654,7 @@ func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time) (interface{}, er } if smpl.IsMissing { lb.Set(labels.AlertNameLabel, "[No data] "+r.Name()) + lb.Set(labels.NoDataLabel, "true") } // Links with timestamps should go in annotations since labels diff --git a/pkg/query-service/tests/integration/filter_suggestions_test.go b/pkg/query-service/tests/integration/filter_suggestions_test.go index 77b3e1b87c3d..65764a9bebfe 100644 --- a/pkg/query-service/tests/integration/filter_suggestions_test.go +++ b/pkg/query-service/tests/integration/filter_suggestions_test.go @@ -13,6 +13,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics/analyticstest" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" @@ -312,7 +313,9 @@ func NewFilterSuggestionsTestBed(t *testing.T) *FilterSuggestionsTestBed { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(t, err) orgGetter := implorganization.NewGetter(implorganization.NewStore(testDB), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(t, err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter, notificationManager) require.NoError(t, err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/query-service/tests/integration/logparsingpipeline_test.go b/pkg/query-service/tests/integration/logparsingpipeline_test.go index 6be197bacdad..25046bcd5e71 100644 --- a/pkg/query-service/tests/integration/logparsingpipeline_test.go +++ 
b/pkg/query-service/tests/integration/logparsingpipeline_test.go @@ -14,6 +14,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics/analyticstest" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" @@ -492,7 +493,9 @@ func NewTestbedWithoutOpamp(t *testing.T, sqlStore sqlstore.SQLStore) *LogPipeli sharder, err := noopsharder.New(context.Background(), providerSettings, sharder.Config{}) require.NoError(t, err) orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlStore), sharder) - alertmanager, err := signozalertmanager.New(context.Background(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(t, err) + alertmanager, err := signozalertmanager.New(context.Background(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, sqlStore, orgGetter, notificationManager) require.NoError(t, err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/query-service/tests/integration/signoz_cloud_integrations_test.go b/pkg/query-service/tests/integration/signoz_cloud_integrations_test.go index 35faf33d1628..7d78b5a0b87f 100644 --- a/pkg/query-service/tests/integration/signoz_cloud_integrations_test.go +++ b/pkg/query-service/tests/integration/signoz_cloud_integrations_test.go @@ -11,6 +11,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics/analyticstest" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" @@ -373,7 +374,9 @@ func NewCloudIntegrationsTestBed(t *testing.T, testDB sqlstore.SQLStore) *CloudI sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(t, err) orgGetter := implorganization.NewGetter(implorganization.NewStore(testDB), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter) + nfmanager := nfmanagertest.NewMock() + require.NoError(t, err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter, nfmanager) require.NoError(t, err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/query-service/tests/integration/signoz_integrations_test.go b/pkg/query-service/tests/integration/signoz_integrations_test.go index c8177b3bc703..f42e42e19889 100644 --- a/pkg/query-service/tests/integration/signoz_integrations_test.go +++ b/pkg/query-service/tests/integration/signoz_integrations_test.go @@ -11,6 +11,7 @@ import ( "github.com/SigNoz/signoz/pkg/alertmanager" "github.com/SigNoz/signoz/pkg/alertmanager/alertmanagerserver" + 
"github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics/analyticstest" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" @@ -588,7 +589,11 @@ func NewIntegrationsTestBed(t *testing.T, testDB sqlstore.SQLStore) *Integration sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(t, err) orgGetter := implorganization.NewGetter(implorganization.NewStore(testDB), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter) + nfManager := nfmanagertest.NewMock() + if err != nil { + t.Fatal(err) + } + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{Signoz: alertmanager.Signoz{PollInterval: 10 * time.Second, Config: alertmanagerserver.NewConfig()}}, testDB, orgGetter, nfManager) require.NoError(t, err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/query-service/utils/labels/labels.go b/pkg/query-service/utils/labels/labels.go index a04d3ce58c08..8398d6e0afed 100644 --- a/pkg/query-service/utils/labels/labels.go +++ b/pkg/query-service/utils/labels/labels.go @@ -17,6 +17,7 @@ const ( MetricNameLabel = "__name__" TemporalityLabel = "__temporality__" AlertNameLabel = "alertname" + NoDataLabel = "nodata" // AlertStateLabel is the label name indicating the state of an alert. AlertStateLabel = "alertstate" diff --git a/pkg/signoz/handler_test.go b/pkg/signoz/handler_test.go index 204093c62c21..93a6ca960e61 100644 --- a/pkg/signoz/handler_test.go +++ b/pkg/signoz/handler_test.go @@ -8,6 +8,7 @@ import ( "github.com/DATA-DOG/go-sqlmock" "github.com/SigNoz/signoz/pkg/alertmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" "github.com/SigNoz/signoz/pkg/factory/factorytest" @@ -29,7 +30,9 @@ func TestNewHandlers(t *testing.T) { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(t, err) orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstore), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(t, err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager) require.NoError(t, err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/signoz/module_test.go b/pkg/signoz/module_test.go index 676b4b880c33..a833861f144f 100644 --- a/pkg/signoz/module_test.go +++ b/pkg/signoz/module_test.go @@ -8,6 +8,7 @@ import ( "github.com/DATA-DOG/go-sqlmock" "github.com/SigNoz/signoz/pkg/alertmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/emailing/emailingtest" "github.com/SigNoz/signoz/pkg/factory/factorytest" @@ -29,7 +30,9 @@ func TestNewModules(t *testing.T) { sharder, err := noopsharder.New(context.TODO(), providerSettings, sharder.Config{}) require.NoError(t, err) orgGetter := 
implorganization.NewGetter(implorganization.NewStore(sqlstore), sharder) - alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter) + notificationManager := nfmanagertest.NewMock() + require.NoError(t, err) + alertmanager, err := signozalertmanager.New(context.TODO(), providerSettings, alertmanager.Config{}, sqlstore, orgGetter, notificationManager) require.NoError(t, err) jwt := authtypes.NewJWT("", 1*time.Hour, 1*time.Hour) emailing := emailingtest.New() diff --git a/pkg/signoz/provider.go b/pkg/signoz/provider.go index 878ec34a6c90..72b037ad31f5 100644 --- a/pkg/signoz/provider.go +++ b/pkg/signoz/provider.go @@ -2,6 +2,8 @@ package signoz import ( "github.com/SigNoz/signoz/pkg/alertmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/rulebasednotification" "github.com/SigNoz/signoz/pkg/alertmanager/signozalertmanager" "github.com/SigNoz/signoz/pkg/analytics" "github.com/SigNoz/signoz/pkg/analytics/noopanalytics" @@ -153,9 +155,15 @@ func NewPrometheusProviderFactories(telemetryStore telemetrystore.TelemetryStore ) } -func NewAlertmanagerProviderFactories(sqlstore sqlstore.SQLStore, orgGetter organization.Getter) factory.NamedMap[factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config]] { +func NewNotificationManagerProviderFactories() factory.NamedMap[factory.ProviderFactory[nfmanager.NotificationManager, nfmanager.Config]] { return factory.MustNewNamedMap( - signozalertmanager.NewFactory(sqlstore, orgGetter), + rulebasednotification.NewFactory(), + ) +} + +func NewAlertmanagerProviderFactories(sqlstore sqlstore.SQLStore, orgGetter organization.Getter, notificationManager nfmanager.NotificationManager) factory.NamedMap[factory.ProviderFactory[alertmanager.Alertmanager, alertmanager.Config]] { + return factory.MustNewNamedMap( + signozalertmanager.NewFactory(sqlstore, orgGetter, notificationManager), ) } diff --git a/pkg/signoz/provider_test.go b/pkg/signoz/provider_test.go index a693687790bc..921d47bf13cd 100644 --- a/pkg/signoz/provider_test.go +++ b/pkg/signoz/provider_test.go @@ -4,6 +4,7 @@ import ( "testing" "github.com/DATA-DOG/go-sqlmock" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager/nfmanagertest" "github.com/SigNoz/signoz/pkg/analytics" "github.com/SigNoz/signoz/pkg/instrumentation/instrumentationtest" "github.com/SigNoz/signoz/pkg/modules/organization/implorganization" @@ -54,7 +55,8 @@ func TestNewProviderFactories(t *testing.T) { assert.NotPanics(t, func() { orgGetter := implorganization.NewGetter(implorganization.NewStore(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual)), nil) - NewAlertmanagerProviderFactories(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual), orgGetter) + notificationManager := nfmanagertest.NewMock() + NewAlertmanagerProviderFactories(sqlstoretest.New(sqlstore.Config{Provider: "sqlite"}, sqlmock.QueryMatcherEqual), orgGetter, notificationManager) }) assert.NotPanics(t, func() { diff --git a/pkg/signoz/signoz.go b/pkg/signoz/signoz.go index 050526772fad..08fd5cb60c4b 100644 --- a/pkg/signoz/signoz.go +++ b/pkg/signoz/signoz.go @@ -2,8 +2,8 @@ package signoz import ( "context" - "github.com/SigNoz/signoz/pkg/alertmanager" + "github.com/SigNoz/signoz/pkg/alertmanager/nfmanager" "github.com/SigNoz/signoz/pkg/analytics" "github.com/SigNoz/signoz/pkg/cache" "github.com/SigNoz/signoz/pkg/emailing" @@ -230,12 +230,24 @@ func New( // Initialize 
user getter userGetter := impluser.NewGetter(impluser.NewStore(sqlstore, providerSettings)) + // shared NotificationManager instance for both alertmanager and rules + notificationManager, err := factory.NewProviderFromNamedMap( + ctx, + providerSettings, + nfmanager.Config{}, + NewNotificationManagerProviderFactories(), + "rulebased", + ) + if err != nil { + return nil, err + } + // Initialize alertmanager from the available alertmanager provider factories alertmanager, err := factory.NewProviderFromNamedMap( ctx, providerSettings, config.Alertmanager, - NewAlertmanagerProviderFactories(sqlstore, orgGetter), + NewAlertmanagerProviderFactories(sqlstore, orgGetter, notificationManager), config.Alertmanager.Provider, ) if err != nil { @@ -315,6 +327,7 @@ func New( Prometheus: prometheus, Alertmanager: alertmanager, Querier: querier, + Rules: ruler, Zeus: zeus, Licensing: licensing, Emailing: emailing, diff --git a/pkg/types/alertmanagertypes/alert.go b/pkg/types/alertmanagertypes/alert.go index 78a0e59f95eb..971ec23b1ccd 100644 --- a/pkg/types/alertmanagertypes/alert.go +++ b/pkg/types/alertmanagertypes/alert.go @@ -9,7 +9,6 @@ import ( "github.com/SigNoz/signoz/pkg/errors" "github.com/go-openapi/runtime/middleware" - "github.com/go-openapi/strfmt" v2 "github.com/prometheus/alertmanager/api/v2" "github.com/prometheus/alertmanager/api/v2/models" "github.com/prometheus/alertmanager/api/v2/restapi/operations/alert" @@ -28,11 +27,9 @@ type ( // An alias for the Alert type from the alertmanager package. Alert = types.Alert - // An alias for the PostableAlert type from the alertmanager package. PostableAlert = models.PostableAlert - // A slice of PostableAlert. - PostableAlerts = []*PostableAlert + PostableAlerts = models.PostableAlerts // An alias for the GettableAlert type from the alertmanager package. GettableAlert = models.GettableAlert @@ -86,26 +83,6 @@ func NewDeprecatedGettableAlertsFromGettableAlerts(gettableAlerts GettableAlerts return deprecatedGettableAlerts } -// Converts a slice of Alert to a slice of PostableAlert. -func NewPostableAlertsFromAlerts(alerts []*types.Alert) PostableAlerts { - postableAlerts := make(PostableAlerts, 0, len(alerts)) - for _, alert := range alerts { - start := strfmt.DateTime(alert.StartsAt) - end := strfmt.DateTime(alert.EndsAt) - postableAlerts = append(postableAlerts, &models.PostableAlert{ - Annotations: v2.ModelLabelSetToAPILabelSet(alert.Annotations), - EndsAt: end, - StartsAt: start, - Alert: models.Alert{ - GeneratorURL: strfmt.URI(alert.GeneratorURL), - Labels: v2.ModelLabelSetToAPILabelSet(alert.Labels), - }, - }) - } - - return postableAlerts -} - // Converts a slice of PostableAlert to a slice of Alert. 
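In signoz.go above, the notification manager is built through the same named-factory indirection as every other provider, and the single `rulebased` instance is then shared by the alertmanager and the rules engine. A toy version of that lookup, which assumes none of the real factory package's signatures:

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// Toy named-factory lookup; the real factory.NewProviderFromNamedMap also
// threads provider settings and a typed config through to the factory.
type providerFactory func(ctx context.Context) (string, error)

func newProviderFromNamedMap(ctx context.Context, factories map[string]providerFactory, name string) (string, error) {
	f, ok := factories[name]
	if !ok {
		return "", errors.New("unknown provider: " + name)
	}
	return f(ctx)
}

func main() {
	factories := map[string]providerFactory{
		// "rulebased" stands in for rulebasednotification.NewFactory() above.
		"rulebased": func(context.Context) (string, error) {
			return "rule-based notification manager", nil
		},
	}

	nm, err := newProviderFromNamedMap(context.Background(), factories, "rulebased")
	if err != nil {
		panic(err)
	}
	// The same instance is then handed to both the alertmanager factories
	// and (per this patch) the rules manager.
	fmt.Println(nm)
}
```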
func NewAlertsFromPostableAlerts(postableAlerts PostableAlerts, resolveTimeout time.Duration, now time.Time) ([]*types.Alert, []error) { alerts := v2.OpenAPIAlertsToAlerts(postableAlerts) diff --git a/pkg/types/alertmanagertypes/config.go b/pkg/types/alertmanagertypes/config.go index 35f3fc3ac105..a438afc7ee12 100644 --- a/pkg/types/alertmanagertypes/config.go +++ b/pkg/types/alertmanagertypes/config.go @@ -14,11 +14,13 @@ import ( "github.com/SigNoz/signoz/pkg/valuer" "github.com/prometheus/alertmanager/config" commoncfg "github.com/prometheus/common/config" + "github.com/prometheus/common/model" "github.com/uptrace/bun" ) const ( DefaultReceiverName string = "default-receiver" + DefaultGroupBy string = "ruleId" ) var ( @@ -398,3 +400,52 @@ func init() { commoncfg.MarshalSecretValue = true config.MarshalSecretValue = true } + +// NotificationConfig holds the notification grouping and renotification timing for a rule. +type NotificationConfig struct { + NotificationGroup map[model.LabelName]struct{} + Renotify ReNotificationConfig +} + +func (nc *NotificationConfig) DeepCopy() NotificationConfig { + deepCopy := *nc + deepCopy.NotificationGroup = make(map[model.LabelName]struct{}, len(nc.NotificationGroup)) + for k, v := range nc.NotificationGroup { + deepCopy.NotificationGroup[k] = v + } + return deepCopy +} + +type ReNotificationConfig struct { + NoDataInterval time.Duration + RenotifyInterval time.Duration +} + +func NewNotificationConfig(groups []string, renotifyInterval time.Duration, noDataRenotifyInterval time.Duration) NotificationConfig { + notificationConfig := GetDefaultNotificationConfig() + + if renotifyInterval != 0 { + notificationConfig.Renotify.RenotifyInterval = renotifyInterval + } + + if noDataRenotifyInterval != 0 { + notificationConfig.Renotify.NoDataInterval = noDataRenotifyInterval + } + for _, group := range groups { + notificationConfig.NotificationGroup[model.LabelName(group)] = struct{}{} + } + + return notificationConfig +} + +func GetDefaultNotificationConfig() NotificationConfig { + defaultGroups := make(map[model.LabelName]struct{}) + defaultGroups[model.LabelName(DefaultGroupBy)] = struct{}{} + return NotificationConfig{ + NotificationGroup: defaultGroups, + Renotify: ReNotificationConfig{ + RenotifyInterval: 4 * time.Hour, + NoDataInterval: 4 * time.Hour, + }, // defaults used when a rule does not configure renotification + } +} diff --git a/pkg/types/ruletypes/api_params.go b/pkg/types/ruletypes/api_params.go index ff5e8917b6d1..4cc44ed90230 100644 --- a/pkg/types/ruletypes/api_params.go +++ b/pkg/types/ruletypes/api_params.go @@ -3,6 +3,8 @@ package ruletypes import ( "context" "encoding/json" + "fmt" + "slices" "time" "unicode/utf8" @@ -12,6 +14,7 @@ import ( "github.com/SigNoz/signoz/pkg/query-service/utils/times" "github.com/SigNoz/signoz/pkg/query-service/utils/timestamp" + "github.com/SigNoz/signoz/pkg/types/alertmanagertypes" ) type AlertType string @@ -57,6 +60,47 @@ type PostableRule struct { Evaluation *EvaluationEnvelope `yaml:"evaluation,omitempty" json:"evaluation,omitempty"` SchemaVersion string `json:"schemaVersion,omitempty"` + + NotificationSettings *NotificationSettings `json:"notificationSettings,omitempty"` +} + +type NotificationSettings struct { + NotificationGroupBy []string `json:"notificationGroupBy,omitempty"` + ReNotifyInterval Duration `json:"renotify,omitempty"` + AlertStates []model.AlertState `json:"alertStates,omitempty"` +} + +func (ns *NotificationSettings) 
GetAlertManagerNotificationConfig() alertmanagertypes.NotificationConfig { + var renotifyInterval Duration + var noDataRenotifyInterval Duration + if slices.Contains(ns.AlertStates, model.StateNoData) { + noDataRenotifyInterval = ns.ReNotifyInterval + } + if slices.Contains(ns.AlertStates, model.StateFiring) { + renotifyInterval = ns.ReNotifyInterval + } + return alertmanagertypes.NewNotificationConfig(ns.NotificationGroupBy, time.Duration(renotifyInterval), time.Duration(noDataRenotifyInterval)) +} + +func (ns *NotificationSettings) UnmarshalJSON(data []byte) error { + type Alias NotificationSettings + aux := &struct { + *Alias + }{ + Alias: (*Alias)(ns), + } + + if err := json.Unmarshal(data, &aux); err != nil { + return err + } + + // Validate states after unmarshaling + for _, state := range ns.AlertStates { + if state != model.StateFiring && state != model.StateNoData { + return fmt.Errorf("invalid alert state: %s", state) + } + } + return nil } func (r *PostableRule) processRuleDefaults() error { From 411414fa45a9907598dc299e3a1815cac285a9d8 Mon Sep 17 00:00:00 2001 From: Amlan Kumar Nandy <45410599+amlannandy@users.noreply.github.com> Date: Sat, 27 Sep 2025 22:32:14 +0700 Subject: [PATCH 3/4] chore: add routing polices page (#9198) --- .../routingPolicies/createRoutingPolicy.ts | 36 ++ .../routingPolicies/deleteRoutingPolicy.ts | 30 ++ .../api/routingPolicies/getRoutingPolicies.ts | 40 ++ .../routingPolicies/updateRoutingPolicy.ts | 40 ++ frontend/src/constants/reactQueryKeys.ts | 3 + .../RoutingPolicies/DeleteRoutingPolicy.tsx | 47 ++ .../RoutingPolicies/RoutingPolicies.tsx | 118 +++++ .../RoutingPolicies/RoutingPolicyDetails.tsx | 208 ++++++++ .../RoutingPolicies/RoutingPolicyList.tsx | 73 +++ .../RoutingPolicies/RoutingPolicyListItem.tsx | 137 ++++++ .../__tests__/DeleteRoutingPolicy.test.tsx | 81 ++++ .../__tests__/RoutingPolicies.test.tsx | 126 +++++ .../__tests__/RoutingPoliciesList.test.tsx | 89 ++++ .../__tests__/RoutingPolicyDetails.test.tsx | 423 ++++++++++++++++ .../__tests__/RoutingPolicyListItem.test.tsx | 126 +++++ .../RoutingPolicies/__tests__/testUtils.ts | 121 +++++ .../container/RoutingPolicies/constants.ts | 8 + .../src/container/RoutingPolicies/index.ts | 3 + .../src/container/RoutingPolicies/styles.scss | 452 ++++++++++++++++++ .../src/container/RoutingPolicies/types.ts | 115 +++++ .../RoutingPolicies/useRoutingPolicies.ts | 240 ++++++++++ .../src/container/RoutingPolicies/utils.tsx | 61 +++ .../routingPolicies/useCreateRoutingPolicy.ts | 24 + .../routingPolicies/useDeleteRoutingPolicy.ts | 19 + .../routingPolicies/useGetRoutingPolicies.ts | 39 ++ .../routingPolicies/useUpdateRoutingPolicy.ts | 25 + .../src/pages/AlertList/AlertList.styles.scss | 10 + frontend/src/pages/AlertList/index.tsx | 40 +- 28 files changed, 2729 insertions(+), 5 deletions(-) create mode 100644 frontend/src/api/routingPolicies/createRoutingPolicy.ts create mode 100644 frontend/src/api/routingPolicies/deleteRoutingPolicy.ts create mode 100644 frontend/src/api/routingPolicies/getRoutingPolicies.ts create mode 100644 frontend/src/api/routingPolicies/updateRoutingPolicy.ts create mode 100644 frontend/src/container/RoutingPolicies/DeleteRoutingPolicy.tsx create mode 100644 frontend/src/container/RoutingPolicies/RoutingPolicies.tsx create mode 100644 frontend/src/container/RoutingPolicies/RoutingPolicyDetails.tsx create mode 100644 frontend/src/container/RoutingPolicies/RoutingPolicyList.tsx create mode 100644 frontend/src/container/RoutingPolicies/RoutingPolicyListItem.tsx create mode 100644 
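Closing out the previous patch's rule-side contract: `GetAlertManagerNotificationConfig` above applies the one `renotify` interval per listed alert state and leaves the 4-hour defaults in place for states that are not listed. A runnable mirror with local stand-in types; `"firing"` and `"nodata"` are assumed to be the string forms of `model.StateFiring` and `model.StateNoData`.

```go
package main

import (
	"fmt"
	"slices"
	"time"
)

// Local stand-ins; the real types live in ruletypes and alertmanagertypes.
type notificationSettings struct {
	GroupBy     []string
	Renotify    time.Duration
	AlertStates []string // "firing" and/or "nodata"
}

type notificationConfig struct {
	RenotifyInterval time.Duration
	NoDataInterval   time.Duration
}

// toConfig mirrors GetAlertManagerNotificationConfig: the single renotify
// interval is applied per listed alert state, and zero keeps the default.
func toConfig(ns notificationSettings, defaults notificationConfig) notificationConfig {
	cfg := defaults
	if slices.Contains(ns.AlertStates, "firing") && ns.Renotify != 0 {
		cfg.RenotifyInterval = ns.Renotify
	}
	if slices.Contains(ns.AlertStates, "nodata") && ns.Renotify != 0 {
		cfg.NoDataInterval = ns.Renotify
	}
	return cfg
}

func main() {
	defaults := notificationConfig{RenotifyInterval: 4 * time.Hour, NoDataInterval: 4 * time.Hour}
	ns := notificationSettings{
		GroupBy:     []string{"service.name"},
		Renotify:    30 * time.Minute,
		AlertStates: []string{"firing"}, // renotify firing alerts only
	}
	fmt.Printf("%+v\n", toConfig(ns, defaults))
	// Output: {RenotifyInterval:30m0s NoDataInterval:4h0m0s}
}
```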
frontend/src/container/RoutingPolicies/__tests__/DeleteRoutingPolicy.test.tsx create mode 100644 frontend/src/container/RoutingPolicies/__tests__/RoutingPolicies.test.tsx create mode 100644 frontend/src/container/RoutingPolicies/__tests__/RoutingPoliciesList.test.tsx create mode 100644 frontend/src/container/RoutingPolicies/__tests__/RoutingPolicyDetails.test.tsx create mode 100644 frontend/src/container/RoutingPolicies/__tests__/RoutingPolicyListItem.test.tsx create mode 100644 frontend/src/container/RoutingPolicies/__tests__/testUtils.ts create mode 100644 frontend/src/container/RoutingPolicies/constants.ts create mode 100644 frontend/src/container/RoutingPolicies/index.ts create mode 100644 frontend/src/container/RoutingPolicies/styles.scss create mode 100644 frontend/src/container/RoutingPolicies/types.ts create mode 100644 frontend/src/container/RoutingPolicies/useRoutingPolicies.ts create mode 100644 frontend/src/container/RoutingPolicies/utils.tsx create mode 100644 frontend/src/hooks/routingPolicies/useCreateRoutingPolicy.ts create mode 100644 frontend/src/hooks/routingPolicies/useDeleteRoutingPolicy.ts create mode 100644 frontend/src/hooks/routingPolicies/useGetRoutingPolicies.ts create mode 100644 frontend/src/hooks/routingPolicies/useUpdateRoutingPolicy.ts diff --git a/frontend/src/api/routingPolicies/createRoutingPolicy.ts b/frontend/src/api/routingPolicies/createRoutingPolicy.ts new file mode 100644 index 000000000000..5ec9847b69b7 --- /dev/null +++ b/frontend/src/api/routingPolicies/createRoutingPolicy.ts @@ -0,0 +1,36 @@ +import axios from 'api'; +import { ErrorResponseHandlerV2 } from 'api/ErrorResponseHandlerV2'; +import { AxiosError } from 'axios'; +import { ErrorResponseV2, ErrorV2Resp, SuccessResponseV2 } from 'types/api'; + +export interface CreateRoutingPolicyBody { + name: string; + expression: string; + actions: { + channels: string[]; + }; + description?: string; +} + +export interface CreateRoutingPolicyResponse { + success: boolean; + message: string; +} + +const createRoutingPolicy = async ( + props: CreateRoutingPolicyBody, +): Promise< + SuccessResponseV2 | ErrorResponseV2 +> => { + try { + const response = await axios.post(`/notification-policy`, props); + return { + httpStatusCode: response.status, + data: response.data, + }; + } catch (error) { + return ErrorResponseHandlerV2(error as AxiosError); + } +}; + +export default createRoutingPolicy; diff --git a/frontend/src/api/routingPolicies/deleteRoutingPolicy.ts b/frontend/src/api/routingPolicies/deleteRoutingPolicy.ts new file mode 100644 index 000000000000..5b0d3df14d97 --- /dev/null +++ b/frontend/src/api/routingPolicies/deleteRoutingPolicy.ts @@ -0,0 +1,30 @@ +import axios from 'api'; +import { ErrorResponseHandlerV2 } from 'api/ErrorResponseHandlerV2'; +import { AxiosError } from 'axios'; +import { ErrorResponseV2, ErrorV2Resp, SuccessResponseV2 } from 'types/api'; + +export interface DeleteRoutingPolicyResponse { + success: boolean; + message: string; +} + +const deleteRoutingPolicy = async ( + routingPolicyId: string, +): Promise< + SuccessResponseV2 | ErrorResponseV2 +> => { + try { + const response = await axios.delete( + `/notification-policy/${routingPolicyId}`, + ); + + return { + httpStatusCode: response.status, + data: response.data, + }; + } catch (error) { + return ErrorResponseHandlerV2(error as AxiosError); + } +}; + +export default deleteRoutingPolicy; diff --git a/frontend/src/api/routingPolicies/getRoutingPolicies.ts b/frontend/src/api/routingPolicies/getRoutingPolicies.ts new file mode 
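The API clients above follow the project's V2 response convention: a `SuccessResponseV2` carrying the HTTP status on success, and whatever `ErrorResponseHandlerV2` produces otherwise. A hedged usage sketch; the expression grammar and channel names are purely illustrative, and the success check assumes only that the success branch carries a `data` field.

```ts
import createRoutingPolicy from 'api/routingPolicies/createRoutingPolicy';
import deleteRoutingPolicy from 'api/routingPolicies/deleteRoutingPolicy';

async function replacePaymentsPolicy(oldPolicyId: string): Promise<void> {
	// Payload shape comes from CreateRoutingPolicyBody; description is optional.
	const created = await createRoutingPolicy({
		name: 'payments-critical',
		expression: 'service = "payments"', // illustrative expression only
		actions: { channels: ['payments-pagerduty'] },
		description: 'Route payment alerts to the payments on-call rotation',
	});

	if ('data' in created) {
		console.log('created:', created.data.message);
	}

	const deleted = await deleteRoutingPolicy(oldPolicyId);
	if ('data' in deleted) {
		console.log('deleted:', deleted.data.message);
	}
}

export default replacePaymentsPolicy;
```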
100644 index 000000000000..43191aebd77f --- /dev/null +++ b/frontend/src/api/routingPolicies/getRoutingPolicies.ts @@ -0,0 +1,40 @@ +import axios from 'api'; +import { ErrorResponseHandlerV2 } from 'api/ErrorResponseHandlerV2'; +import { AxiosError } from 'axios'; +import { ErrorResponseV2, ErrorV2Resp, SuccessResponseV2 } from 'types/api'; + +export interface ApiRoutingPolicy { + id: string; + name: string; + expression: string; + description: string; + channels: string[]; + createdAt: string; + updatedAt: string; + createdBy: string; + updatedBy: string; +} + +export interface GetRoutingPoliciesResponse { + status: string; + data?: ApiRoutingPolicy[]; +} + +export const getRoutingPolicies = async ( + signal?: AbortSignal, + headers?: Record, +): Promise | ErrorResponseV2> => { + try { + const response = await axios.get('/notification-policy', { + signal, + headers, + }); + + return { + httpStatusCode: response.status, + data: response.data, + }; + } catch (error) { + return ErrorResponseHandlerV2(error as AxiosError); + } +}; diff --git a/frontend/src/api/routingPolicies/updateRoutingPolicy.ts b/frontend/src/api/routingPolicies/updateRoutingPolicy.ts new file mode 100644 index 000000000000..08448562cdd0 --- /dev/null +++ b/frontend/src/api/routingPolicies/updateRoutingPolicy.ts @@ -0,0 +1,40 @@ +import axios from 'api'; +import { ErrorResponseHandlerV2 } from 'api/ErrorResponseHandlerV2'; +import { AxiosError } from 'axios'; +import { ErrorResponseV2, ErrorV2Resp, SuccessResponseV2 } from 'types/api'; + +export interface UpdateRoutingPolicyBody { + name: string; + expression: string; + actions: { + channels: string[]; + }; + description: string; +} + +export interface UpdateRoutingPolicyResponse { + success: boolean; + message: string; +} + +const updateRoutingPolicy = async ( + id: string, + props: UpdateRoutingPolicyBody, +): Promise< + SuccessResponseV2 | ErrorResponseV2 +> => { + try { + const response = await axios.put(`/notification-policy/${id}`, { + ...props, + }); + + return { + httpStatusCode: response.status, + data: response.data, + }; + } catch (error) { + return ErrorResponseHandlerV2(error as AxiosError); + } +}; + +export default updateRoutingPolicy; diff --git a/frontend/src/constants/reactQueryKeys.ts b/frontend/src/constants/reactQueryKeys.ts index 6d34aa4b29c0..f59f2c21124b 100644 --- a/frontend/src/constants/reactQueryKeys.ts +++ b/frontend/src/constants/reactQueryKeys.ts @@ -86,4 +86,7 @@ export const REACT_QUERY_KEY = { SPAN_LOGS: 'SPAN_LOGS', SPAN_BEFORE_LOGS: 'SPAN_BEFORE_LOGS', SPAN_AFTER_LOGS: 'SPAN_AFTER_LOGS', + + // Routing Policies Query Keys + GET_ROUTING_POLICIES: 'GET_ROUTING_POLICIES', } as const; diff --git a/frontend/src/container/RoutingPolicies/DeleteRoutingPolicy.tsx b/frontend/src/container/RoutingPolicies/DeleteRoutingPolicy.tsx new file mode 100644 index 000000000000..da2d78935231 --- /dev/null +++ b/frontend/src/container/RoutingPolicies/DeleteRoutingPolicy.tsx @@ -0,0 +1,47 @@ +import { Button, Modal, Typography } from 'antd'; +import { Trash2, X } from 'lucide-react'; + +import { DeleteRoutingPolicyProps } from './types'; + +function DeleteRoutingPolicy({ + handleClose, + handleDelete, + routingPolicy, + isDeletingRoutingPolicy, +}: DeleteRoutingPolicyProps): JSX.Element { + return ( + Delete Routing Policy} + open + closable={false} + onCancel={handleClose} + footer={[ + , + , + ]} + > + + {`Are you sure you want to delete ${routingPolicy?.name} routing policy? 
Deleting a routing policy is irreversible.`} + + ); } + +export default DeleteRoutingPolicy; diff --git a/frontend/src/container/RoutingPolicies/RoutingPolicies.tsx b/frontend/src/container/RoutingPolicies/RoutingPolicies.tsx new file mode 100644 index 000000000000..b5cb3f08d4f5 --- /dev/null +++ b/frontend/src/container/RoutingPolicies/RoutingPolicies.tsx @@ -0,0 +1,118 @@ +import './styles.scss'; + +import { PlusOutlined } from '@ant-design/icons'; +import { Color } from '@signozhq/design-tokens'; +import { Button, Flex, Input, Tooltip, Typography } from 'antd'; +import { Search } from 'lucide-react'; +import { useAppContext } from 'providers/App/App'; +import { ChangeEvent, useMemo } from 'react'; +import { USER_ROLES } from 'types/roles'; + +import DeleteRoutingPolicy from './DeleteRoutingPolicy'; +import RoutingPolicyDetails from './RoutingPolicyDetails'; +import RoutingPolicyList from './RoutingPolicyList'; +import useRoutingPolicies from './useRoutingPolicies'; + +function RoutingPolicies(): JSX.Element { + const { user } = useAppContext(); + const { + // Routing Policies + selectedRoutingPolicy, + routingPoliciesData, + isLoadingRoutingPolicies, + isErrorRoutingPolicies, + // Channels + channels, + isLoadingChannels, + isErrorChannels, + refreshChannels, + // Search + searchTerm, + setSearchTerm, + // Delete Modal + isDeleteModalOpen, + handleDeleteModalOpen, + handleDeleteModalClose, + handleDeleteRoutingPolicy, + isDeletingRoutingPolicy, + // Policy Details Modal + policyDetailsModalState, + handlePolicyDetailsModalClose, + handlePolicyDetailsModalOpen, + handlePolicyDetailsModalAction, + isPolicyDetailsModalActionLoading, + } = useRoutingPolicies(); + + const disableCreateButton = user?.role === USER_ROLES.VIEWER; + + const tooltipTitle = useMemo(() => { + if (user?.role === USER_ROLES.VIEWER) { + return 'You need edit permissions to create a routing policy'; + } + return ''; + }, [user?.role]); + + const handleSearch = (e: ChangeEvent<HTMLInputElement>): void => { + setSearchTerm(e.target.value || ''); + }; + + return ( +
+
+ Routing Policies + + Create and manage routing policies. + + + } + value={searchTerm} + onChange={handleSearch} + /> + + + + +
+ + {policyDetailsModalState.isOpen && ( + + )} + {isDeleteModalOpen && ( + + )} +
+
+ ); +} + +export default RoutingPolicies; diff --git a/frontend/src/container/RoutingPolicies/RoutingPolicyDetails.tsx b/frontend/src/container/RoutingPolicies/RoutingPolicyDetails.tsx new file mode 100644 index 000000000000..0a0726fe0714 --- /dev/null +++ b/frontend/src/container/RoutingPolicies/RoutingPolicyDetails.tsx @@ -0,0 +1,208 @@ +import { + Button, + Divider, + Flex, + Form, + Input, + Modal, + Select, + Typography, +} from 'antd'; +import { useForm } from 'antd/lib/form/Form'; +import ROUTES from 'constants/routes'; +import { ModalTitle } from 'container/PipelinePage/PipelineListsView/styles'; +import { useAppContext } from 'providers/App/App'; +import { useMemo } from 'react'; +import { USER_ROLES } from 'types/roles'; + +import { INITIAL_ROUTING_POLICY_DETAILS_FORM_STATE } from './constants'; +import { + RoutingPolicyDetailsFormState, + RoutingPolicyDetailsProps, +} from './types'; + +function RoutingPolicyDetails({ + closeModal, + mode, + channels, + isErrorChannels, + isLoadingChannels, + routingPolicy, + handlePolicyDetailsModalAction, + isPolicyDetailsModalActionLoading, + refreshChannels, +}: RoutingPolicyDetailsProps): JSX.Element { + const [form] = useForm(); + const { user } = useAppContext(); + + const initialFormState = useMemo(() => { + if (mode === 'edit') { + return { + name: routingPolicy?.name || '', + expression: routingPolicy?.expression || '', + channels: routingPolicy?.channels || [], + description: routingPolicy?.description || '', + }; + } + return INITIAL_ROUTING_POLICY_DETAILS_FORM_STATE; + }, [routingPolicy, mode]); + + const modalTitle = + mode === 'edit' ? 'Edit routing policy' : 'Create routing policy'; + + const handleSave = (): void => { + handlePolicyDetailsModalAction(mode, { + name: form.getFieldValue('name'), + expression: form.getFieldValue('expression'), + channels: form.getFieldValue('channels'), + description: form.getFieldValue('description'), + }); + }; + + const notificationChannelsNotFoundContent = ( + + + No channels yet. + {user?.role === USER_ROLES.ADMIN ? ( + + Create one + + + ) : ( + Please ask your admin to create one. + )} + + + + ); + + return ( + {modalTitle}} + centered + open + className="create-policy-modal" + width={600} + onCancel={closeModal} + footer={null} + maskClosable={false} + > + + + form={form} + initialValues={initialFormState} + onFinish={handleSave} + > +
+
+ Routing Policy Name + + + +
+
+ Description + + + +
+
+ Expression + + + +
+
+ Notification Channels + +
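The details modal reads four form fields and defers to `handlePolicyDetailsModalAction` with the current mode, so the container decides between the create and update mutations. A sketch of that contract; `RoutingPolicyDetailsFormState` is reconstructed here from the form fields above since types.ts is not part of this excerpt, and `PolicyMode` is an assumed name.

```ts
type PolicyMode = 'create' | 'edit';

// Reconstructed from the form fields above; the real type lives in ./types.
interface RoutingPolicyDetailsFormState {
	name: string;
	expression: string;
	channels: string[];
	description: string;
}

// Mirrors handleSave: collect the four fields and let the container pick
// between the create and update mutations based on `mode`.
function save(
	mode: PolicyMode,
	getFieldValue: (field: keyof RoutingPolicyDetailsFormState) => unknown,
	onAction: (mode: PolicyMode, state: RoutingPolicyDetailsFormState) => void,
): void {
	onAction(mode, {
		name: String(getFieldValue('name') ?? ''),
		expression: String(getFieldValue('expression') ?? ''),
		channels: (getFieldValue('channels') as string[]) ?? [],
		description: String(getFieldValue('description') ?? ''),
	});
}

export { save };
export type { PolicyMode, RoutingPolicyDetailsFormState };
```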