engine.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. package alerting
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/benbjohnson/clock"
  6. "github.com/grafana/grafana/pkg/bus"
  7. "github.com/grafana/grafana/pkg/log"
  8. m "github.com/grafana/grafana/pkg/models"
  9. "github.com/grafana/grafana/pkg/services/alerting/alertstates"
  10. )
  11. type Engine struct {
  12. execQueue chan *AlertJob
  13. resultQueue chan *AlertResult
  14. clock clock.Clock
  15. ticker *Ticker
  16. scheduler Scheduler
  17. executor Executor
  18. ruleReader RuleReader
  19. log log.Logger
  20. }
  21. func NewEngine() *Engine {
  22. e := &Engine{
  23. ticker: NewTicker(time.Now(), time.Second*0, clock.New()),
  24. execQueue: make(chan *AlertJob, 1000),
  25. resultQueue: make(chan *AlertResult, 1000),
  26. scheduler: NewScheduler(),
  27. executor: NewExecutor(),
  28. ruleReader: NewRuleReader(),
  29. log: log.New("alerting.engine"),
  30. }
  31. return e
  32. }
  33. func (e *Engine) Start() {
  34. e.log.Info("Starting Alerting Engine")
  35. go e.alertingTicker()
  36. go e.execDispatch()
  37. go e.resultHandler()
  38. }
  39. func (e *Engine) Stop() {
  40. close(e.execQueue)
  41. close(e.resultQueue)
  42. }
  43. func (e *Engine) alertingTicker() {
  44. tickIndex := 0
  45. for {
  46. select {
  47. case tick := <-e.ticker.C:
  48. // TEMP SOLUTION update rules ever tenth tick
  49. if tickIndex%10 == 0 {
  50. e.scheduler.Update(e.ruleReader.Fetch())
  51. }
  52. e.scheduler.Tick(tick, e.execQueue)
  53. tickIndex++
  54. }
  55. }
  56. }
  57. func (e *Engine) execDispatch() {
  58. for job := range e.execQueue {
  59. log.Trace("Alerting: engine:execDispatch() starting job %s", job.Rule.Name)
  60. job.Running = true
  61. e.executeJob(job)
  62. }
  63. }
  64. func (e *Engine) executeJob(job *AlertJob) {
  65. now := time.Now()
  66. resultChan := make(chan *AlertResult, 1)
  67. go e.executor.Execute(job, resultChan)
  68. select {
  69. case <-time.After(time.Second * 5):
  70. e.resultQueue <- &AlertResult{
  71. State: alertstates.Pending,
  72. Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
  73. Error: fmt.Errorf("Timeout"),
  74. AlertJob: job,
  75. }
  76. e.log.Debug("Job Execution timeout", "alertRuleId", job.Rule.Id)
  77. case result := <-resultChan:
  78. result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
  79. e.log.Debug("Job Execution done", "time_taken", result.Duration, "ruleId", job.Rule.Id)
  80. e.resultQueue <- result
  81. }
  82. }
  83. func (e *Engine) resultHandler() {
  84. for result := range e.resultQueue {
  85. e.log.Debug("Alert Rule Result", "ruleId", result.AlertJob.Rule.Id, "state", result.State, "value", result.ActualValue, "retry", result.AlertJob.RetryCount)
  86. result.AlertJob.Running = false
  87. // handle result error
  88. if result.Error != nil {
  89. result.AlertJob.RetryCount++
  90. if result.AlertJob.RetryCount < maxRetries {
  91. e.log.Error("Alert Rule Result Error", "ruleId", result.AlertJob.Rule.Id, "error", result.Error, "retry", result.AlertJob.RetryCount)
  92. e.execQueue <- result.AlertJob
  93. } else {
  94. e.log.Error("Alert Rule Result Error After Max Retries", "ruleId", result.AlertJob.Rule.Id, "error", result.Error, "retry", result.AlertJob.RetryCount)
  95. result.State = alertstates.Critical
  96. result.Description = fmt.Sprintf("Failed to run check after %d retires, Error: %v", maxRetries, result.Error)
  97. saveState(result)
  98. }
  99. } else {
  100. result.AlertJob.RetryCount = 0
  101. saveState(result)
  102. }
  103. }
  104. }
  105. func (e *Engine) saveState(result *AlertResult) {
  106. cmd := &m.UpdateAlertStateCommand{
  107. AlertId: result.AlertJob.Rule.Id,
  108. NewState: result.State,
  109. Info: result.Description,
  110. }
  111. if err := bus.Dispatch(cmd); err != nil {
  112. e.log.Error("Failed to save state", "error", err)
  113. }
  114. }