Selaa lähdekoodia

Alerting: Makes timeouts and retries configurable (#16259)

Adds new alert settings for configuring timeouts and retries named 
evaluation_timeout_seconds, notification_timeout_seconds 
and max_attempts.

Closes #16240
Zzy 6 vuotta sitten
vanhempi
commit
1b84a924a3

+ 10 - 0
conf/defaults.ini

@@ -521,6 +521,16 @@ nodata_or_nullvalues = no_data
 # This limit will protect the server from render overloading and make sure notifications are sent out quickly
 concurrent_render_limit = 5
 
+# Default setting for alert calculation timeout. Default value is 30
+evaluation_timeout_seconds = 30
+
+# Default setting for alert notification timeout. Default value is 30
+notification_timeout_seconds = 30
+
+# Default setting for max attempts for sending alert notifications. Default value is 3
+max_attempts = 3
+
+
 #################################### Explore #############################
 [explore]
 # Enable the Explore section

+ 10 - 0
conf/sample.ini

@@ -446,6 +446,16 @@ log_queries =
 # This limit will protect the server from render overloading and make sure notifications are sent out quickly
 ;concurrent_render_limit = 5
 
+
+# Default setting for alert calculation timeout. Default value is 30
+;evaluation_timeout_seconds = 30
+
+# Default setting for alert notification timeout. Default value is 30
+;notification_timeout_seconds = 30
+
+# Default setting for max attempts for sending alert notifications. Default value is 3
+;max_attempts = 3
+
 #################################### Explore #############################
 [explore]
 # Enable the Explore section

+ 14 - 0
docs/sources/installation/configuration.md

@@ -650,6 +650,20 @@ Alert notifications can include images, but rendering many images at the same ti
 This limit will protect the server from render overloading and make sure notifications are sent out quickly. Default
 value is `5`.
 
+
+### evaluation_timeout_seconds 
+
+Default setting for alert calculation timeout. Default value is `30`.
+
+### notification_timeout_seconds
+
+Default setting for alert notification timeout. Default value is `30`.
+
+### max_attempts
+
+Default setting for max attempts for sending alert notifications. Default value is `3`.
+
+
 ## [panels]
 
 ### enable_alpha

+ 4 - 8
pkg/services/alerting/engine.go

@@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
 
 var (
 	unfinishedWorkTimeout = time.Second * 5
-	// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
-	alertTimeout        = time.Second * 30
-	resultHandleTimeout = time.Second * 30
-	alertMaxAttempts    = 3
 )
 
 func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
@@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *J
 		}
 	}()
 
-	cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
+	cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
 	attemptChan := make(chan int, 1)
 
 	// Initialize with first attemptID=1
@@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
 		}
 	}()
 
-	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
+	alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
 	cancelChan <- cancelFn
 	span := opentracing.StartSpan("alert execution")
 	alertCtx = opentracing.ContextWithSpan(alertCtx, span)
@@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
 				tlog.Error(evalContext.Error),
 				tlog.String("message", "alerting execution attempt failed"),
 			)
-			if attemptID < alertMaxAttempts {
+			if attemptID < setting.AlertingMaxAttempts {
 				span.Finish()
 				e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
 				attemptChan <- (attemptID + 1)
@@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
 		}
 
 		// create new context with timeout for notifications
-		resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
+		resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
 		cancelChan <- resultHandleCancelFn
 
 		// override the context used for evaluation with a new context for notifications.

+ 5 - 3
pkg/services/alerting/engine_integration_test.go

@@ -11,20 +11,22 @@ import (
 	"testing"
 	"time"
 
+	"github.com/grafana/grafana/pkg/setting"
 	. "github.com/smartystreets/goconvey/convey"
 )
 
 func TestEngineTimeouts(t *testing.T) {
 	Convey("Alerting engine timeout tests", t, func() {
 		engine := NewEngine()
+		setting.AlertingNotificationTimeout = 30 * time.Second
+		setting.AlertingMaxAttempts = 3
 		engine.resultHandler = &FakeResultHandler{}
 		job := &Job{Running: true, Rule: &Rule{}}
 
 		Convey("Should trigger as many retries as needed", func() {
 			Convey("pended alert for datasource -> result handler should be worked", func() {
 				// reduce alert timeout to test quickly
-				originAlertTimeout := alertTimeout
-				alertTimeout = 2 * time.Second
+				setting.AlertingEvaluationTimeout = 30 * time.Second
 				transportTimeoutInterval := 2 * time.Second
 				serverBusySleepDuration := 1 * time.Second
 
@@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
 				So(resultHandler.ResultHandleSucceed, ShouldEqual, true)
 
 				// initialize for other tests.
-				alertTimeout = originAlertTimeout
+				setting.AlertingEvaluationTimeout = 2 * time.Second
 				engine.resultHandler = &FakeResultHandler{}
 			})
 		})

+ 12 - 7
pkg/services/alerting/engine_test.go

@@ -6,7 +6,9 @@ import (
 	"math"
 	"testing"
 
+	"github.com/grafana/grafana/pkg/setting"
 	. "github.com/smartystreets/goconvey/convey"
+	"time"
 )
 
 type FakeEvalHandler struct {
@@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
 func TestEngineProcessJob(t *testing.T) {
 	Convey("Alerting engine job processing", t, func() {
 		engine := NewEngine()
+		setting.AlertingEvaluationTimeout = 30 * time.Second
+		setting.AlertingNotificationTimeout = 30 * time.Second
+		setting.AlertingMaxAttempts = 3
 		engine.resultHandler = &FakeResultHandler{}
 		job := &Job{Running: true, Rule: &Rule{}}
 
@@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
 			Convey("error + not last attempt -> retry", func() {
 				engine.evalHandler = NewFakeEvalHandler(0)
 
-				for i := 1; i < alertMaxAttempts; i++ {
+				for i := 1; i < setting.AlertingMaxAttempts; i++ {
 					attemptChan := make(chan int, 1)
-					cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+					cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 
 					engine.processJob(i, attemptChan, cancelChan, job)
 					nextAttemptID, more := <-attemptChan
@@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
 			Convey("error + last attempt -> no retry", func() {
 				engine.evalHandler = NewFakeEvalHandler(0)
 				attemptChan := make(chan int, 1)
-				cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+				cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 
-				engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
+				engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
 				nextAttemptID, more := <-attemptChan
 
 				So(nextAttemptID, ShouldEqual, 0)
@@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
 			Convey("no error -> no retry", func() {
 				engine.evalHandler = NewFakeEvalHandler(1)
 				attemptChan := make(chan int, 1)
-				cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+				cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 
 				engine.processJob(1, attemptChan, cancelChan, job)
 				nextAttemptID, more := <-attemptChan
@@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("Should trigger as many retries as needed", func() {
 
 			Convey("never success -> max retries number", func() {
-				expectedAttempts := alertMaxAttempts
+				expectedAttempts := setting.AlertingMaxAttempts
 				evalHandler := NewFakeEvalHandler(0)
 				engine.evalHandler = evalHandler
 
@@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
 			})
 
 			Convey("some errors before success -> some retries", func() {
-				expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
+				expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
 				evalHandler := NewFakeEvalHandler(expectedAttempts)
 				engine.evalHandler = evalHandler
 

+ 1 - 1
pkg/services/alerting/notifier.go

@@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
 	renderOpts := rendering.Opts{
 		Width:           1000,
 		Height:          500,
-		Timeout:         time.Duration(float64(alertTimeout) * 0.9),
+		Timeout:         time.Duration(setting.AlertingEvaluationTimeout.Seconds() * 0.9),
 		OrgId:           context.Rule.OrgId,
 		OrgRole:         m.ROLE_ADMIN,
 		ConcurrentLimit: setting.AlertingRenderLimit,

+ 8 - 0
pkg/setting/setting.go

@@ -179,6 +179,10 @@ var (
 	AlertingErrorOrTimeout     string
 	AlertingNoDataOrNullValues string
 
+	AlertingEvaluationTimeout   time.Duration
+	AlertingNotificationTimeout time.Duration
+	AlertingMaxAttempts         int
+
 	// Explore UI
 	ExploreEnabled bool
 
@@ -760,6 +764,10 @@ func (cfg *Cfg) Load(args *CommandLineArgs) error {
 	AlertingErrorOrTimeout = alerting.Key("error_or_timeout").MustString("alerting")
 	AlertingNoDataOrNullValues = alerting.Key("nodata_or_nullvalues").MustString("no_data")
 
+	AlertingEvaluationTimeout = alerting.Key("evaluation_timeout_seconds").MustDuration(time.Second * 30)
+	AlertingNotificationTimeout = alerting.Key("notification_timeout_seconds").MustDuration(time.Second * 30)
+	AlertingMaxAttempts = alerting.Key("max_attempts").MustInt(3)
+
 	explore := iniFile.Section("explore")
 	ExploreEnabled = explore.Key("enabled").MustBool(true)