engine.go

package alerting

import (
	"context"
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/opentracing/opentracing-go"
	"github.com/opentracing/opentracing-go/ext"
	tlog "github.com/opentracing/opentracing-go/log"
	"golang.org/x/sync/errgroup"

	"github.com/grafana/grafana/pkg/log"
	"github.com/grafana/grafana/pkg/registry"
	"github.com/grafana/grafana/pkg/setting"
)
// Engine is the alert rule evaluation engine. On each tick it schedules due
// alert rules as jobs and dispatches them for evaluation.
type Engine struct {
	execQueue chan *Job
	//clock         clock.Clock
	ticker        *Ticker
	scheduler     Scheduler
	evalHandler   EvalHandler
	ruleReader    RuleReader
	log           log.Logger
	resultHandler ResultHandler
}
func init() {
	registry.RegisterService(&Engine{})
}

// NewEngine constructs an Engine and initializes its dependencies.
func NewEngine() *Engine {
	e := &Engine{}
	e.Init()
	return e
}
// IsDisabled reports whether the engine should not run, i.e. when alerting or
// alert execution is turned off in the settings.
func (e *Engine) IsDisabled() bool {
	return !setting.AlertingEnabled || !setting.ExecuteAlerts
}
// Init wires up the engine's dependencies: the tick source, the job queue,
// the scheduler, and the evaluation and result handlers.
func (e *Engine) Init() error {
	e.ticker = NewTicker(time.Now(), time.Second*0, clock.New())
	e.execQueue = make(chan *Job, 1000)
	e.scheduler = NewScheduler()
	e.evalHandler = NewEvalHandler()
	e.ruleReader = NewRuleReader()
	e.log = log.New("alerting.engine")
	e.resultHandler = NewResultHandler()
	return nil
}
// Run starts the ticker loop and the job dispatcher and blocks until the
// context is canceled or one of them returns an error.
func (e *Engine) Run(ctx context.Context) error {
	alertGroup, ctx := errgroup.WithContext(ctx)
	alertGroup.Go(func() error { return e.alertingTicker(ctx) })
	alertGroup.Go(func() error { return e.runJobDispatcher(ctx) })

	return alertGroup.Wait()
}
func (e *Engine) alertingTicker(grafanaCtx context.Context) error {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Scheduler Panic: stopping alertingTicker", "error", err, "stack", log.Stack(1))
		}
	}()

	tickIndex := 0

	for {
		select {
		case <-grafanaCtx.Done():
			return grafanaCtx.Err()
		case tick := <-e.ticker.C:
			// TEMP SOLUTION: refresh the rule definitions every tenth tick.
			if tickIndex%10 == 0 {
				e.scheduler.Update(e.ruleReader.Fetch())
			}

			e.scheduler.Tick(tick, e.execQueue)
			tickIndex++
		}
	}
}
// runJobDispatcher pulls jobs off the exec queue and evaluates each one in
// its own goroutine. On shutdown it waits for in-flight jobs to finish.
func (e *Engine) runJobDispatcher(grafanaCtx context.Context) error {
	dispatcherGroup, alertCtx := errgroup.WithContext(grafanaCtx)

	for {
		select {
		case <-grafanaCtx.Done():
			return dispatcherGroup.Wait()
		case job := <-e.execQueue:
			dispatcherGroup.Go(func() error { return e.processJobWithRetry(alertCtx, job) })
		}
	}
}
var (
	unfinishedWorkTimeout = time.Second * 5
	// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
	alertTimeout     = time.Second * 30
	alertMaxAttempts = 3
)
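
// Retry protocol shared by processJobWithRetry and processJob:
//
//   - attemptChan carries the next attempt ID; processJob sends attemptID+1
//     after a failed attempt (while attemptID < alertMaxAttempts) and closes
//     the channel once the job has finished, successfully or after the last
//     allowed attempt.
//   - cancelChan collects the context.CancelFunc created for each attempt so
//     that endJob can release every per-attempt context when the job ends.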
// processJobWithRetry evaluates a job, retrying failed attempts up to
// alertMaxAttempts times, and returns when the job finishes or the server
// context is canceled.
func (e *Engine) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
		}
	}()

	cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
	attemptChan := make(chan int, 1)

	// Initialize with the first attemptID=1.
	attemptChan <- 1
	job.Running = true

	for {
		select {
		case <-grafanaCtx.Done():
			// If the grafana server context is canceled, give the job a chance
			// to finish gracefully by waiting a timeout duration before forcing
			// it to end.
			unfinishedWorkTimer := time.NewTimer(unfinishedWorkTimeout)
			select {
			case <-unfinishedWorkTimer.C:
				return e.endJob(grafanaCtx.Err(), cancelChan, job)
			case <-attemptChan:
				return e.endJob(nil, cancelChan, job)
			}
		case attemptID, more := <-attemptChan:
			if !more {
				return e.endJob(nil, cancelChan, job)
			}
			go e.processJob(attemptID, attemptChan, cancelChan, job)
		}
	}
}
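
// endJob marks the job as no longer running, closes cancelChan, and invokes
// any per-attempt cancel functions still queued so their contexts are
// released.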
func (e *Engine) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
	job.Running = false
	close(cancelChan)
	for cancelFn := range cancelChan {
		cancelFn()
	}
	return err
}
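
// processJob runs a single evaluation attempt. It gives the attempt its own
// timeout context, registers that context's cancel function on cancelChan,
// wraps the work in an OpenTracing span, and evaluates the rule in a separate
// goroutine. A failed attempt below alertMaxAttempts requests a retry by
// sending attemptID+1 on attemptChan; otherwise the result is handed to the
// result handler and attemptChan is closed.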
func (e *Engine) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
		}
	}()

	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
	cancelChan <- cancelFn

	span := opentracing.StartSpan("alert execution")
	alertCtx = opentracing.ContextWithSpan(alertCtx, span)

	evalContext := NewEvalContext(alertCtx, job.Rule)
	evalContext.Ctx = alertCtx

	go func() {
		defer func() {
			if err := recover(); err != nil {
				e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
				ext.Error.Set(span, true)
				span.LogFields(
					tlog.Error(fmt.Errorf("%v", err)),
					tlog.String("message", "failed to execute alert rule. panic was recovered."),
				)
				span.Finish()
				close(attemptChan)
			}
		}()

		e.evalHandler.Eval(evalContext)

		span.SetTag("alertId", evalContext.Rule.Id)
		span.SetTag("dashboardId", evalContext.Rule.DashboardId)
		span.SetTag("firing", evalContext.Firing)
		span.SetTag("nodatapoints", evalContext.NoDataFound)
		span.SetTag("attemptID", attemptID)

		if evalContext.Error != nil {
			ext.Error.Set(span, true)
			span.LogFields(
				tlog.Error(evalContext.Error),
				tlog.String("message", "alerting execution attempt failed"),
			)
			if attemptID < alertMaxAttempts {
				span.Finish()
				e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
				attemptChan <- (attemptID + 1)
				return
			}
		}

		evalContext.Rule.State = evalContext.GetNewState()
		e.resultHandler.Handle(evalContext)
		span.Finish()
		e.log.Debug("Job Execution completed", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
		close(attemptChan)
	}()
}