alerting.go 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. package alerting
  2. import (
  3. "fmt"
  4. "time"
  5. "github.com/grafana/grafana/pkg/bus"
  6. "github.com/grafana/grafana/pkg/log"
  7. m "github.com/grafana/grafana/pkg/models"
  8. "github.com/grafana/grafana/pkg/setting"
  9. )
  10. var (
  11. maxRetries = 3
  12. )
  13. func Init() {
  14. if !setting.AlertingEnabled {
  15. return
  16. }
  17. log.Info("Alerting: Initializing scheduler...")
  18. scheduler := NewScheduler()
  19. reader := NewRuleReader()
  20. go scheduler.dispatch(reader)
  21. go scheduler.executor(&ExecutorImpl{})
  22. go scheduler.handleResponses()
  23. }
  24. type Scheduler struct {
  25. jobs map[int64]*m.AlertJob
  26. runQueue chan *m.AlertJob
  27. responseQueue chan *m.AlertResult
  28. alertRuleFetcher RuleReader
  29. }
  30. func NewScheduler() *Scheduler {
  31. return &Scheduler{
  32. jobs: make(map[int64]*m.AlertJob, 0),
  33. runQueue: make(chan *m.AlertJob, 1000),
  34. responseQueue: make(chan *m.AlertResult, 1000),
  35. }
  36. }
  37. func (scheduler *Scheduler) dispatch(reader RuleReader) {
  38. reschedule := time.NewTicker(time.Second * 10)
  39. secondTicker := time.NewTicker(time.Second)
  40. scheduler.updateJobs(reader.Fetch)
  41. for {
  42. select {
  43. case <-secondTicker.C:
  44. scheduler.queueJobs()
  45. case <-reschedule.C:
  46. scheduler.updateJobs(reader.Fetch)
  47. }
  48. }
  49. }
  50. func (scheduler *Scheduler) updateJobs(alertRuleFn func() []m.AlertRule) {
  51. log.Debug("Scheduler: UpdateJobs()")
  52. jobs := make(map[int64]*m.AlertJob, 0)
  53. rules := alertRuleFn()
  54. for i, rule := range rules {
  55. var job *m.AlertJob
  56. if scheduler.jobs[rule.Id] != nil {
  57. job = scheduler.jobs[rule.Id]
  58. } else {
  59. job = &m.AlertJob{
  60. Running: false,
  61. RetryCount: 0,
  62. }
  63. }
  64. job.Rule = rule
  65. job.Offset = int64(i)
  66. jobs[rule.Id] = job
  67. }
  68. log.Debug("Scheduler: Selected %d jobs", len(jobs))
  69. scheduler.jobs = jobs
  70. }
  71. func (scheduler *Scheduler) queueJobs() {
  72. now := time.Now().Unix()
  73. for _, job := range scheduler.jobs {
  74. if now%job.Rule.Frequency == 0 && job.Running == false {
  75. log.Info("Scheduler: Putting job on to run queue: %s", job.Rule.Title)
  76. scheduler.runQueue <- job
  77. }
  78. }
  79. }
  80. func (scheduler *Scheduler) executor(executor Executor) {
  81. for job := range scheduler.runQueue {
  82. //log.Info("Executor: queue length %d", len(this.runQueue))
  83. log.Info("Executor: executing %s", job.Rule.Title)
  84. job.Running = true
  85. scheduler.measureAndExecute(executor, job)
  86. }
  87. }
  88. func (scheduler *Scheduler) handleResponses() {
  89. for response := range scheduler.responseQueue {
  90. log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d)", response.Id, response.State, response.ActualValue, response.AlertJob.RetryCount)
  91. response.AlertJob.Running = false
  92. if response.IsResultIncomplete() {
  93. response.AlertJob.RetryCount++
  94. if response.AlertJob.RetryCount < maxRetries {
  95. scheduler.runQueue <- response.AlertJob
  96. } else {
  97. saveState(&m.AlertResult{
  98. Id: response.Id,
  99. State: m.AlertStateCritical,
  100. Description: fmt.Sprintf("Failed to run check after %d retires", maxRetries),
  101. })
  102. }
  103. } else {
  104. response.AlertJob.RetryCount = 0
  105. saveState(response)
  106. }
  107. }
  108. }
  109. func saveState(response *m.AlertResult) {
  110. cmd := &m.UpdateAlertStateCommand{
  111. AlertId: response.Id,
  112. NewState: response.State,
  113. Info: response.Description,
  114. }
  115. if err := bus.Dispatch(cmd); err != nil {
  116. log.Error(2, "failed to save state %v", err)
  117. }
  118. }
  119. func (scheduler *Scheduler) measureAndExecute(exec Executor, job *m.AlertJob) {
  120. now := time.Now()
  121. responseChan := make(chan *m.AlertResult, 1)
  122. go exec.Execute(job, responseChan)
  123. select {
  124. case <-time.After(time.Second * 5):
  125. scheduler.responseQueue <- &m.AlertResult{
  126. Id: job.Rule.Id,
  127. State: m.AlertStatePending,
  128. Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
  129. AlertJob: job,
  130. }
  131. case result := <-responseChan:
  132. result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
  133. log.Info("Schedular: exeuction took %vms", result.Duration)
  134. scheduler.responseQueue <- result
  135. }
  136. }