engine.go

package alerting

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/grafana/grafana/pkg/bus"
	"github.com/grafana/grafana/pkg/log"
	m "github.com/grafana/grafana/pkg/models"
	"github.com/grafana/grafana/pkg/services/alerting/alertstates"
)
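
// Engine ties together alert rule scheduling, execution, and result handling.
// It owns the exec and result queues and the goroutines that drain them.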
type Engine struct {
	execQueue   chan *AlertJob
	resultQueue chan *AlertResult
	clock       clock.Clock
	ticker      *Ticker
	scheduler   Scheduler
	handler     AlertingHandler
	ruleReader  RuleReader
	log         log.Logger
}
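
// NewEngine returns an Engine wired with the default scheduler, handler,
// rule reader, and buffered exec/result queues.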
func NewEngine() *Engine {
	e := &Engine{
		ticker:      NewTicker(time.Now(), time.Second*0, clock.New()),
		execQueue:   make(chan *AlertJob, 1000),
		resultQueue: make(chan *AlertResult, 1000),
		scheduler:   NewScheduler(),
		handler:     NewHandler(),
		ruleReader:  NewRuleReader(),
		log:         log.New("alerting.engine"),
	}

	return e
}
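
// Start launches the ticker, dispatch, and result handler goroutines.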
func (e *Engine) Start() {
	e.log.Info("Starting Alerting Engine")

	go e.alertingTicker()
	go e.execDispatch()
	go e.resultHandler()
}
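
// Stop closes the exec and result queues so the dispatch and result
// handler goroutines can drain their remaining work and exit.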
func (e *Engine) Stop() {
	close(e.execQueue)
	close(e.resultQueue)
}
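
// alertingTicker consumes ticks from the ticker, refreshes the rule set on
// every tenth tick, and asks the scheduler to enqueue jobs that are due.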
func (e *Engine) alertingTicker() {
	defer func() {
		if err := recover(); err != nil {
			e.log.Error("Scheduler Panic, stopping...", "error", err, "stack", log.Stack(1))
		}
	}()

	tickIndex := 0

	for {
		select {
		case tick := <-e.ticker.C:
			// TEMP SOLUTION update rules every tenth tick
			if tickIndex%10 == 0 {
				e.scheduler.Update(e.ruleReader.Fetch())
			}

			e.scheduler.Tick(tick, e.execQueue)
			tickIndex++
		}
	}
}
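
// execDispatch drains the exec queue, marking each job as running before
// executing it.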
func (e *Engine) execDispatch() {
	for job := range e.execQueue {
		log.Trace("Alerting: engine:execDispatch() starting job %s", job.Rule.Name)
		job.Running = true
		e.executeJob(job)
	}
}
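
// executeJob runs the handler for a job and forwards its result to the
// result queue. If the handler does not respond within five seconds, a
// Pending result carrying a timeout error is queued instead.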
func (e *Engine) executeJob(job *AlertJob) {
	now := time.Now()

	resultChan := make(chan *AlertResult, 1)
	go e.handler.Execute(job, resultChan)

	select {
	case <-time.After(time.Second * 5):
		e.resultQueue <- &AlertResult{
			State:    alertstates.Pending,
			Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
			Error:    fmt.Errorf("Timeout"),
			AlertJob: job,
		}
		e.log.Debug("Job Execution timeout", "alertRuleId", job.Rule.Id)
	case result := <-resultChan:
		result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
		e.log.Debug("Job Execution done", "timeTakenMs", result.Duration, "ruleId", job.Rule.Id)
		e.resultQueue <- result
	}
}
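
// resultHandler drains the result queue, re-queues failed jobs until their
// retries are exhausted, and persists the resulting state of each rule.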
func (e *Engine) resultHandler() {
	for result := range e.resultQueue {
		e.log.Debug("Alert Rule Result", "ruleId", result.AlertJob.Rule.Id, "state", result.State, "value", result.ActualValue, "retry", result.AlertJob.RetryCount)

		result.AlertJob.Running = false

		if result.Error != nil {
			result.AlertJob.IncRetry()

			if result.AlertJob.Retryable() {
				e.log.Error("Alert Rule Result Error", "ruleId", result.AlertJob.Rule.Id, "error", result.Error, "retry", result.AlertJob.RetryCount)
				e.execQueue <- result.AlertJob
			} else {
				e.log.Error("Alert Rule Result Error After Max Retries", "ruleId", result.AlertJob.Rule.Id, "error", result.Error, "retry", result.AlertJob.RetryCount)

				result.State = alertstates.Critical
				result.Description = fmt.Sprintf("Failed to run check after %d retries, Error: %v", maxAlertExecutionRetries, result.Error)
				e.saveState(result)
			}
		} else {
			result.AlertJob.ResetRetry()
			e.saveState(result)
		}
	}
}
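
// saveState persists the alert state for a result via the command bus.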
func (e *Engine) saveState(result *AlertResult) {
	cmd := &m.UpdateAlertStateCommand{
		AlertId:  result.AlertJob.Rule.Id,
		NewState: result.State,
		Info:     result.Description,
	}

	if err := bus.Dispatch(cmd); err != nil {
		e.log.Error("Failed to save state", "error", err)
	}
}