engine.go

package alerting

import (
	"context"
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/grafana/grafana/pkg/log"
	"github.com/opentracing/opentracing-go"
	"github.com/opentracing/opentracing-go/ext"
	tlog "github.com/opentracing/opentracing-go/log"
	"golang.org/x/sync/errgroup"
)
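
// Engine drives alert rule evaluation: a ticker feeds the scheduler, due
// rules are pushed onto execQueue as jobs, and each job is evaluated and its
// result handed to the result handler.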
type Engine struct {
	execQueue chan *Job
	// clock         clock.Clock
	ticker        *Ticker
	scheduler     Scheduler
	evalHandler   EvalHandler
	ruleReader    RuleReader
	log           log.Logger
	resultHandler ResultHandler
}
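
// NewEngine returns an Engine wired with the default ticker, scheduler,
// evaluation handler, rule reader and result handler, and a buffered
// execution queue of 1000 jobs.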
func NewEngine() *Engine {
	e := &Engine{
		ticker:        NewTicker(time.Now(), time.Second*0, clock.New()),
		execQueue:     make(chan *Job, 1000),
		scheduler:     NewScheduler(),
		evalHandler:   NewEvalHandler(),
		ruleReader:    NewRuleReader(),
		log:           log.New("alerting.engine"),
		resultHandler: NewResultHandler(),
	}

	return e
}
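
// Run starts the alerting ticker and the job dispatcher in an errgroup and
// blocks until the given context is cancelled or one of them returns an error.
//
// Usage sketch (hypothetical caller, not part of this file):
//
//	e := NewEngine()
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	_ = e.Run(ctx) // returns once ctx is cancelled or a goroutine fails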
func (e *Engine) Run(ctx context.Context) error {
	e.log.Info("Initializing Alerting")

	alertGroup, ctx := errgroup.WithContext(ctx)
	alertGroup.Go(func() error { return e.alertingTicker(ctx) })
	alertGroup.Go(func() error { return e.runJobDispatcher(ctx) })

	err := alertGroup.Wait()

	e.log.Info("Stopped Alerting", "reason", err)
	return err
}
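
// alertingTicker consumes the ticker channel, refreshing the scheduled rules
// every tenth tick and letting the scheduler enqueue due jobs on every tick.
// A recovered panic stops the loop instead of crashing the process.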
func (e *Engine) alertingTicker(grafanaCtx context.Context) error {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Scheduler Panic: stopping alertingTicker", "error", err, "stack", log.Stack(1))
		}
	}()

	tickIndex := 0

	for {
		select {
		case <-grafanaCtx.Done():
			return grafanaCtx.Err()
		case tick := <-e.ticker.C:
			// TEMP SOLUTION: update rules every tenth tick.
			if tickIndex%10 == 0 {
				e.scheduler.Update(e.ruleReader.Fetch())
			}

			e.scheduler.Tick(tick, e.execQueue)
			tickIndex++
		}
	}
}
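
// runJobDispatcher pulls jobs off execQueue and runs each one with retry in
// its own goroutine; when the context is cancelled it waits for in-flight
// jobs to finish before returning.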
func (e *Engine) runJobDispatcher(grafanaCtx context.Context) error {
	dispatcherGroup, alertCtx := errgroup.WithContext(grafanaCtx)

	for {
		select {
		case <-grafanaCtx.Done():
			return dispatcherGroup.Wait()
		case job := <-e.execQueue:
			dispatcherGroup.Go(func() error { return e.processJobWithRetry(alertCtx, job) })
		}
	}
}
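
// Timeouts and retry limits for alert job execution; unfinishedWorkTimeout is
// the grace period given to a running job when the server context is cancelled.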
var (
	unfinishedWorkTimeout time.Duration = time.Second * 5
	// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
	alertTimeout     time.Duration = time.Second * 30
	alertMaxAttempts int           = 3
)
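
// processJobWithRetry runs a job and retries failed evaluation attempts up to
// alertMaxAttempts times. Attempt IDs flow through attemptChan; the cancel
// function of each attempt's timeout context is collected in cancelChan so
// endJob can release them all.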
func (e *Engine) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
		}
	}()

	cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
	attemptChan := make(chan int, 1)

	// Initialize with first attemptID=1
	attemptChan <- 1
	job.Running = true

	for {
		select {
		case <-grafanaCtx.Done():
			// If the grafana server context is cancelled, give the job a chance to
			// finish gracefully (by waiting a timeout duration) before forcing its end.
			unfinishedWorkTimer := time.NewTimer(unfinishedWorkTimeout)
			select {
			case <-unfinishedWorkTimer.C:
				return e.endJob(grafanaCtx.Err(), cancelChan, job)
			case <-attemptChan:
				return e.endJob(nil, cancelChan, job)
			}
		case attemptID, more := <-attemptChan:
			if !more {
				return e.endJob(nil, cancelChan, job)
			}
			go e.processJob(attemptID, attemptChan, cancelChan, job)
		}
	}
}
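
// endJob marks the job as no longer running and invokes every cancel function
// queued so far, releasing the per-attempt timeout contexts.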
func (e *Engine) endJob(err error, cancelChan chan context.CancelFunc, job *Job) error {
	job.Running = false
	close(cancelChan)
	for cancelFn := range cancelChan {
		cancelFn()
	}
	return err
}
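
// processJob performs a single evaluation attempt asynchronously. It wraps the
// attempt in a timeout context and an opentracing span, then either schedules
// the next attempt on failure or handles the result and closes attemptChan.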
func (e *Engine) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
		}
	}()

	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
	cancelChan <- cancelFn

	span := opentracing.StartSpan("alert execution")
	alertCtx = opentracing.ContextWithSpan(alertCtx, span)

	evalContext := NewEvalContext(alertCtx, job.Rule)
	evalContext.Ctx = alertCtx

	go func() {
		defer func() {
			if err := recover(); err != nil {
				e.log.Error("Alert Panic", "error", err, "stack", log.Stack(1))
				ext.Error.Set(span, true)
				span.LogFields(
					tlog.Error(fmt.Errorf("%v", err)),
					tlog.String("message", "failed to execute alert rule. panic was recovered."),
				)
				span.Finish()
				close(attemptChan)
			}
		}()

		e.evalHandler.Eval(evalContext)

		span.SetTag("alertId", evalContext.Rule.Id)
		span.SetTag("dashboardId", evalContext.Rule.DashboardId)
		span.SetTag("firing", evalContext.Firing)
		span.SetTag("nodatapoints", evalContext.NoDataFound)
		span.SetTag("attemptID", attemptID)

		if evalContext.Error != nil {
			ext.Error.Set(span, true)
			span.LogFields(
				tlog.Error(evalContext.Error),
				tlog.String("message", "alerting execution attempt failed"),
			)
			if attemptID < alertMaxAttempts {
				span.Finish()
				e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
				attemptChan <- (attemptID + 1)
				return
			}
		}

		evalContext.Rule.State = evalContext.GetNewState()
		e.resultHandler.Handle(evalContext)
		span.Finish()
		e.log.Debug("Job Execution completed", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
		close(attemptChan)
	}()
}