engine.go

package alerting

import (
	"context"
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/opentracing/opentracing-go"
	"github.com/opentracing/opentracing-go/ext"
	tlog "github.com/opentracing/opentracing-go/log"
	"golang.org/x/sync/errgroup"

	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/registry"
	"github.com/grafana/grafana/pkg/services/rendering"
	"github.com/grafana/grafana/pkg/setting"
)

// AlertEngine is the background process that
// schedules alert evaluations and makes sure notifications
// are sent.
type AlertEngine struct {
	RenderService rendering.Service `inject:""`

	execQueue     chan *Job
	ticker        *Ticker
	scheduler     scheduler
	evalHandler   evalHandler
	ruleReader    ruleReader
	log           log.Logger
	resultHandler resultHandler
}

func init() {
	registry.RegisterService(&AlertEngine{})
}

// IsDisabled returns true if the alerting service is disabled for this instance.
func (e *AlertEngine) IsDisabled() bool {
	return !setting.AlertingEnabled || !setting.ExecuteAlerts
}

// Init initializes the AlertEngine.
func (e *AlertEngine) Init() error {
	e.ticker = NewTicker(time.Now(), time.Second*0, clock.New())
	e.execQueue = make(chan *Job, 1000)
	e.scheduler = newScheduler()
	e.evalHandler = NewEvalHandler()
	e.ruleReader = newRuleReader()
	e.log = log.New("alerting.engine")
	e.resultHandler = newResultHandler(e.RenderService)
	return nil
}
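
// Illustrative sketch (not part of the original file): the engine is normally
// created, initialized, and run by Grafana's service registry, but conceptually
// the lifecycle looks roughly like this, assuming a rendering.Service
// implementation and a cancellable parent context:
//
//	e := &AlertEngine{RenderService: renderService}
//	if err := e.Init(); err != nil {
//		// handle initialization error
//	}
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	err := e.Run(ctx) // blocks until ctx is canceled or a component fails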

// Run starts the alerting service background process.
func (e *AlertEngine) Run(ctx context.Context) error {
	alertGroup, ctx := errgroup.WithContext(ctx)
	alertGroup.Go(func() error { return e.alertingTicker(ctx) })
	alertGroup.Go(func() error { return e.runJobDispatcher(ctx) })

	err := alertGroup.Wait()
	return err
}
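
// Run ties two long-running goroutines together with errgroup: the ticker loop
// that feeds Jobs onto execQueue, and the dispatcher that drains it. If either
// goroutine returns an error, the errgroup cancels the shared context so the
// other shuts down as well, and Run returns the first error.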

// alertingTicker refreshes the alert rules on every tenth tick and lets the
// scheduler enqueue evaluation jobs on every tick, until the context is canceled.
func (e *AlertEngine) alertingTicker(grafanaCtx context.Context) error {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Scheduler Panic: stopping alertingTicker", "error", err, "stack", log.Stack(1))
		}
	}()

	tickIndex := 0

	for {
		select {
		case <-grafanaCtx.Done():
			return grafanaCtx.Err()
		case tick := <-e.ticker.C:
			// TEMP SOLUTION update rules every tenth tick
			if tickIndex%10 == 0 {
				e.scheduler.Update(e.ruleReader.fetch())
			}

			e.scheduler.Tick(tick, e.execQueue)
			tickIndex++
		}
	}
}
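
// Note on the loop above: on every tick, scheduler.Tick is handed the exec
// queue so it can enqueue the jobs that are due, and every tenth tick the rule
// set is first refreshed via ruleReader.fetch(). A recovered panic only logs
// and ends this goroutine, so the ticker stops firing while the rest of the
// engine keeps running.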

// runJobDispatcher pulls jobs off the exec queue and processes each one in its
// own goroutine until the context is canceled.
func (e *AlertEngine) runJobDispatcher(grafanaCtx context.Context) error {
	dispatcherGroup, alertCtx := errgroup.WithContext(grafanaCtx)

	for {
		select {
		case <-grafanaCtx.Done():
			return dispatcherGroup.Wait()
		case job := <-e.execQueue:
			dispatcherGroup.Go(func() error { return e.processJobWithRetry(alertCtx, job) })
		}
	}
}

var (
	unfinishedWorkTimeout = time.Second * 5
)

// processJobWithRetry evaluates a job and retries failed attempts (via
// attemptChan) until the job succeeds, the maximum number of attempts is
// reached, or the server context is canceled.
func (e *AlertEngine) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
		}
	}()

	cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
	attemptChan := make(chan int, 1)

	// Initialize with first attemptID=1
	attemptChan <- 1
	job.SetRunning(true)

	for {
		select {
		case <-grafanaCtx.Done():
			// If the grafana server context is canceled, give the job processing
			// a chance to finish gracefully - by waiting a timeout duration -
			// before forcing it to end.
			unfinishedWorkTimer := time.NewTimer(unfinishedWorkTimeout)
			select {
			case <-unfinishedWorkTimer.C:
				return e.endJob(grafanaCtx.Err(), cancelChan, job)
			case <-attemptChan:
				return e.endJob(nil, cancelChan, job)
			}
		case attemptID, more := <-attemptChan:
			if !more {
				return e.endJob(nil, cancelChan, job)
			}
			go e.processJob(attemptID, attemptChan, cancelChan, job)
		}
	}
}
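
// How the retry handshake above fits together: attemptChan is seeded with
// attempt 1; each processJob call either pushes the next attempt number back
// onto attemptChan (when an evaluation failed and attempts remain) or closes
// the channel once the job is finished, which tells the loop to call endJob.
// cancelChan collects every context.CancelFunc created along the way so endJob
// can release them; its capacity of setting.AlertingMaxAttempts*2 allows for
// one evaluation context plus one notification context per attempt.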

// endJob marks the job as no longer running, cancels any outstanding attempt
// contexts, and returns the given error.
func (e *AlertEngine) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
	job.SetRunning(false)
	close(cancelChan)
	for cancelFn := range cancelChan {
		cancelFn()
	}
	return err
}

// processJob runs a single evaluation attempt for the job. On a failed attempt
// it queues the next attempt on attemptChan; once the job is done (or out of
// attempts) it closes attemptChan to signal processJobWithRetry.
func (e *AlertEngine) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
		}
	}()

	alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
	cancelChan <- cancelFn
	span := opentracing.StartSpan("alert execution")
	alertCtx = opentracing.ContextWithSpan(alertCtx, span)

	evalContext := NewEvalContext(alertCtx, job.Rule)
	evalContext.Ctx = alertCtx

	go func() {
		defer func() {
			if err := recover(); err != nil {
				e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
				ext.Error.Set(span, true)
				span.LogFields(
					tlog.Error(fmt.Errorf("%v", err)),
					tlog.String("message", "failed to execute alert rule. panic was recovered."),
				)
				span.Finish()
				close(attemptChan)
			}
		}()

		e.evalHandler.Eval(evalContext)

		span.SetTag("alertId", evalContext.Rule.ID)
		span.SetTag("dashboardId", evalContext.Rule.DashboardID)
		span.SetTag("firing", evalContext.Firing)
		span.SetTag("nodatapoints", evalContext.NoDataFound)
		span.SetTag("attemptID", attemptID)

		if evalContext.Error != nil {
			ext.Error.Set(span, true)
			span.LogFields(
				tlog.Error(evalContext.Error),
				tlog.String("message", "alerting execution attempt failed"),
			)
			if attemptID < setting.AlertingMaxAttempts {
				span.Finish()
				e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.ID, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
				attemptChan <- (attemptID + 1)
				return
			}
		}

		// create a new context with a timeout for notifications
		resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
		cancelChan <- resultHandleCancelFn

		// override the context used for evaluation with a new context for notifications.
		// This makes it possible for notifiers to execute when datasources
		// don't respond within the timeout limit. We should rewrite this so notifications
		// don't reuse the evalContext and get their own context.
		evalContext.Ctx = resultHandleCtx
		evalContext.Rule.State = evalContext.GetNewState()
		e.resultHandler.handle(evalContext)

		span.Finish()
		e.log.Debug("Job Execution completed", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.ID, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
		close(attemptChan)
	}()
}