| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175 |
- package alerting
- import (
- "math/rand"
- "strconv"
- "time"
- //"github.com/grafana/grafana/pkg/bus"
- "github.com/grafana/grafana/pkg/bus"
- "github.com/grafana/grafana/pkg/log"
- m "github.com/grafana/grafana/pkg/models"
- "github.com/grafana/grafana/pkg/setting"
- "sync"
- )
- func Init() {
- if !setting.AlertingEnabled {
- return
- }
- log.Info("Alerting: Initializing scheduler...")
- scheduler := NewScheduler()
- go scheduler.Dispatch(&AlertRuleReader{})
- go scheduler.Executor(&ExecutorImpl{})
- go scheduler.HandleResponses()
- }
- type Scheduler struct {
- jobs map[int64]*AlertJob
- runQueue chan *AlertJob
- responseQueue chan *AlertResult
- mtx sync.RWMutex
- alertRuleFetcher RuleReader
- serverId string
- serverPosition int
- clusterSize int
- }
- func NewScheduler() *Scheduler {
- return &Scheduler{
- jobs: make(map[int64]*AlertJob, 0),
- runQueue: make(chan *AlertJob, 1000),
- responseQueue: make(chan *AlertResult, 1000),
- serverId: strconv.Itoa(rand.Intn(1000)),
- }
- }
- func (this *Scheduler) heartBeat() {
- //Lets cheat on this until we focus on clustering
- //log.Info("Heartbeat: Sending heartbeat from " + this.serverId)
- this.clusterSize = 1
- this.serverPosition = 1
- /*
- cmd := &m.HeartBeatCommand{ServerId: this.serverId}
- err := bus.Dispatch(cmd)
- if err != nil {
- log.Error(1, "Failed to send heartbeat.")
- } else {
- this.clusterSize = cmd.Result.ClusterSize
- this.serverPosition = cmd.Result.UptimePosition
- }
- */
- }
- func (this *Scheduler) Dispatch(reader RuleReader) {
- reschedule := time.NewTicker(time.Second * 100)
- secondTicker := time.NewTicker(time.Second)
- heartbeat := time.NewTicker(time.Second * 5)
- this.heartBeat()
- this.updateJobs(reader.Fetch)
- for {
- select {
- case <-secondTicker.C:
- this.queueJobs()
- case <-reschedule.C:
- this.updateJobs(reader.Fetch)
- case <-heartbeat.C:
- this.heartBeat()
- }
- }
- }
- func (this *Scheduler) updateJobs(f func() []m.AlertRule) {
- log.Debug("Scheduler: UpdateJobs()")
- jobs := make(map[int64]*AlertJob, 0)
- rules := f()
- this.mtx.Lock()
- defer this.mtx.Unlock()
- for i := this.serverPosition - 1; i < len(rules); i += this.clusterSize {
- rule := rules[i]
- jobs[rule.Id] = &AlertJob{rule: rule, offset: int64(len(jobs))}
- }
- log.Debug("Scheduler: Selected %d jobs", len(jobs))
- this.jobs = jobs
- }
- func (this *Scheduler) queueJobs() {
- now := time.Now().Unix()
- for _, job := range this.jobs {
- if now%job.rule.Frequency == 0 && job.running == false {
- log.Info("Scheduler: Putting job on to run queue: %s", job.rule.Title)
- this.runQueue <- job
- }
- }
- }
- func (this *Scheduler) Executor(executor Executor) {
- for job := range this.runQueue {
- //log.Info("Executor: queue length %d", len(this.runQueue))
- log.Info("Executor: executing %s", job.rule.Title)
- this.jobs[job.rule.Id].running = true
- this.MeasureAndExecute(executor, job)
- }
- }
- func (this *Scheduler) HandleResponses() {
- for response := range this.responseQueue {
- log.Info("Response: alert(%d) status(%s) actual(%v)", response.Id, response.State, response.ActualValue)
- if this.jobs[response.Id] != nil {
- this.jobs[response.Id].running = false
- }
- cmd := m.UpdateAlertStateCommand{
- AlertId: response.Id,
- NewState: response.State,
- }
- if err := bus.Dispatch(&cmd); err != nil {
- log.Error(1, "failed to save state", err)
- }
- }
- }
- func (this *Scheduler) MeasureAndExecute(exec Executor, rule *AlertJob) {
- now := time.Now()
- response := make(chan *AlertResult, 1)
- go exec.Execute(rule.rule, response)
- select {
- case <-time.After(time.Second * 5):
- this.responseQueue <- &AlertResult{Id: rule.rule.Id, State: "timed out", Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000)}
- case r := <-response:
- r.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
- log.Info("Schedular: exeuction took %vms", r.Duration)
- this.responseQueue <- r
- }
- }
- type AlertJob struct {
- offset int64
- delay bool
- running bool
- rule m.AlertRule
- }
- type AlertResult struct {
- Id int64
- State string
- ActualValue float64
- Duration float64
- }
|