|
@@ -1,16 +1,12 @@
|
|
|
package alerting
|
|
package alerting
|
|
|
|
|
|
|
|
import (
|
|
import (
|
|
|
- "math/rand"
|
|
|
|
|
- "strconv"
|
|
|
|
|
"time"
|
|
"time"
|
|
|
|
|
|
|
|
- //"github.com/grafana/grafana/pkg/bus"
|
|
|
|
|
"github.com/grafana/grafana/pkg/bus"
|
|
"github.com/grafana/grafana/pkg/bus"
|
|
|
"github.com/grafana/grafana/pkg/log"
|
|
"github.com/grafana/grafana/pkg/log"
|
|
|
m "github.com/grafana/grafana/pkg/models"
|
|
m "github.com/grafana/grafana/pkg/models"
|
|
|
"github.com/grafana/grafana/pkg/setting"
|
|
"github.com/grafana/grafana/pkg/setting"
|
|
|
- "sync"
|
|
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
func Init() {
|
|
func Init() {
|
|
@@ -21,59 +17,34 @@ func Init() {
|
|
|
log.Info("Alerting: Initializing scheduler...")
|
|
log.Info("Alerting: Initializing scheduler...")
|
|
|
|
|
|
|
|
scheduler := NewScheduler()
|
|
scheduler := NewScheduler()
|
|
|
- go scheduler.Dispatch(&AlertRuleReader{})
|
|
|
|
|
|
|
+ reader := NewRuleReader()
|
|
|
|
|
+
|
|
|
|
|
+ go scheduler.Dispatch(reader)
|
|
|
go scheduler.Executor(&ExecutorImpl{})
|
|
go scheduler.Executor(&ExecutorImpl{})
|
|
|
go scheduler.HandleResponses()
|
|
go scheduler.HandleResponses()
|
|
|
|
|
+
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
type Scheduler struct {
|
|
type Scheduler struct {
|
|
|
- jobs map[int64]*AlertJob
|
|
|
|
|
- runQueue chan *AlertJob
|
|
|
|
|
|
|
+ jobs map[int64]*m.AlertJob
|
|
|
|
|
+ runQueue chan *m.AlertJob
|
|
|
responseQueue chan *AlertResult
|
|
responseQueue chan *AlertResult
|
|
|
- mtx sync.RWMutex
|
|
|
|
|
|
|
|
|
|
alertRuleFetcher RuleReader
|
|
alertRuleFetcher RuleReader
|
|
|
-
|
|
|
|
|
- serverId string
|
|
|
|
|
- serverPosition int
|
|
|
|
|
- clusterSize int
|
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
func NewScheduler() *Scheduler {
|
|
func NewScheduler() *Scheduler {
|
|
|
return &Scheduler{
|
|
return &Scheduler{
|
|
|
- jobs: make(map[int64]*AlertJob, 0),
|
|
|
|
|
- runQueue: make(chan *AlertJob, 1000),
|
|
|
|
|
|
|
+ jobs: make(map[int64]*m.AlertJob, 0),
|
|
|
|
|
+ runQueue: make(chan *m.AlertJob, 1000),
|
|
|
responseQueue: make(chan *AlertResult, 1000),
|
|
responseQueue: make(chan *AlertResult, 1000),
|
|
|
- serverId: strconv.Itoa(rand.Intn(1000)),
|
|
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-func (this *Scheduler) heartBeat() {
|
|
|
|
|
-
|
|
|
|
|
- //Lets cheat on this until we focus on clustering
|
|
|
|
|
- //log.Info("Heartbeat: Sending heartbeat from " + this.serverId)
|
|
|
|
|
- this.clusterSize = 1
|
|
|
|
|
- this.serverPosition = 1
|
|
|
|
|
-
|
|
|
|
|
- /*
|
|
|
|
|
- cmd := &m.HeartBeatCommand{ServerId: this.serverId}
|
|
|
|
|
- err := bus.Dispatch(cmd)
|
|
|
|
|
-
|
|
|
|
|
- if err != nil {
|
|
|
|
|
- log.Error(1, "Failed to send heartbeat.")
|
|
|
|
|
- } else {
|
|
|
|
|
- this.clusterSize = cmd.Result.ClusterSize
|
|
|
|
|
- this.serverPosition = cmd.Result.UptimePosition
|
|
|
|
|
- }
|
|
|
|
|
- */
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
func (this *Scheduler) Dispatch(reader RuleReader) {
|
|
func (this *Scheduler) Dispatch(reader RuleReader) {
|
|
|
- reschedule := time.NewTicker(time.Second * 100)
|
|
|
|
|
|
|
+ reschedule := time.NewTicker(time.Second * 5)
|
|
|
secondTicker := time.NewTicker(time.Second)
|
|
secondTicker := time.NewTicker(time.Second)
|
|
|
- heartbeat := time.NewTicker(time.Second * 5)
|
|
|
|
|
|
|
|
|
|
- this.heartBeat()
|
|
|
|
|
this.updateJobs(reader.Fetch)
|
|
this.updateJobs(reader.Fetch)
|
|
|
|
|
|
|
|
for {
|
|
for {
|
|
@@ -82,24 +53,20 @@ func (this *Scheduler) Dispatch(reader RuleReader) {
|
|
|
this.queueJobs()
|
|
this.queueJobs()
|
|
|
case <-reschedule.C:
|
|
case <-reschedule.C:
|
|
|
this.updateJobs(reader.Fetch)
|
|
this.updateJobs(reader.Fetch)
|
|
|
- case <-heartbeat.C:
|
|
|
|
|
- this.heartBeat()
|
|
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-func (this *Scheduler) updateJobs(f func() []m.AlertRule) {
|
|
|
|
|
|
|
+func (this *Scheduler) updateJobs(f func() []m.AlertJob) {
|
|
|
log.Debug("Scheduler: UpdateJobs()")
|
|
log.Debug("Scheduler: UpdateJobs()")
|
|
|
|
|
|
|
|
- jobs := make(map[int64]*AlertJob, 0)
|
|
|
|
|
|
|
+ jobs := make(map[int64]*m.AlertJob, 0)
|
|
|
rules := f()
|
|
rules := f()
|
|
|
|
|
|
|
|
- this.mtx.Lock()
|
|
|
|
|
- defer this.mtx.Unlock()
|
|
|
|
|
-
|
|
|
|
|
- for i := this.serverPosition - 1; i < len(rules); i += this.clusterSize {
|
|
|
|
|
|
|
+ for i := 0; i < len(rules); i++ {
|
|
|
rule := rules[i]
|
|
rule := rules[i]
|
|
|
- jobs[rule.Id] = &AlertJob{rule: rule, offset: int64(len(jobs))}
|
|
|
|
|
|
|
+ //jobs[rule.Rule.Id] = &m.AlertJob{Rule: rule, Offset: int64(len(jobs))}
|
|
|
|
|
+ jobs[rule.Rule.Id] = &rule
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
log.Debug("Scheduler: Selected %d jobs", len(jobs))
|
|
log.Debug("Scheduler: Selected %d jobs", len(jobs))
|
|
@@ -111,8 +78,8 @@ func (this *Scheduler) queueJobs() {
|
|
|
now := time.Now().Unix()
|
|
now := time.Now().Unix()
|
|
|
|
|
|
|
|
for _, job := range this.jobs {
|
|
for _, job := range this.jobs {
|
|
|
- if now%job.rule.Frequency == 0 && job.running == false {
|
|
|
|
|
- log.Info("Scheduler: Putting job on to run queue: %s", job.rule.Title)
|
|
|
|
|
|
|
+ if now%job.Rule.Frequency == 0 && job.Running == false {
|
|
|
|
|
+ log.Info("Scheduler: Putting job on to run queue: %s", job.Rule.Title)
|
|
|
this.runQueue <- job
|
|
this.runQueue <- job
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -121,8 +88,8 @@ func (this *Scheduler) queueJobs() {
|
|
|
func (this *Scheduler) Executor(executor Executor) {
|
|
func (this *Scheduler) Executor(executor Executor) {
|
|
|
for job := range this.runQueue {
|
|
for job := range this.runQueue {
|
|
|
//log.Info("Executor: queue length %d", len(this.runQueue))
|
|
//log.Info("Executor: queue length %d", len(this.runQueue))
|
|
|
- log.Info("Executor: executing %s", job.rule.Title)
|
|
|
|
|
- this.jobs[job.rule.Id].running = true
|
|
|
|
|
|
|
+ log.Info("Executor: executing %s", job.Rule.Title)
|
|
|
|
|
+ this.jobs[job.Rule.Id].Running = true
|
|
|
this.MeasureAndExecute(executor, job)
|
|
this.MeasureAndExecute(executor, job)
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -131,8 +98,9 @@ func (this *Scheduler) HandleResponses() {
|
|
|
for response := range this.responseQueue {
|
|
for response := range this.responseQueue {
|
|
|
log.Info("Response: alert(%d) status(%s) actual(%v)", response.Id, response.State, response.ActualValue)
|
|
log.Info("Response: alert(%d) status(%s) actual(%v)", response.Id, response.State, response.ActualValue)
|
|
|
if this.jobs[response.Id] != nil {
|
|
if this.jobs[response.Id] != nil {
|
|
|
- this.jobs[response.Id].running = false
|
|
|
|
|
|
|
+ this.jobs[response.Id].Running = false
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
cmd := m.UpdateAlertStateCommand{
|
|
cmd := m.UpdateAlertStateCommand{
|
|
|
AlertId: response.Id,
|
|
AlertId: response.Id,
|
|
|
NewState: response.State,
|
|
NewState: response.State,
|
|
@@ -144,15 +112,15 @@ func (this *Scheduler) HandleResponses() {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-func (this *Scheduler) MeasureAndExecute(exec Executor, rule *AlertJob) {
|
|
|
|
|
|
|
+func (this *Scheduler) MeasureAndExecute(exec Executor, rule *m.AlertJob) {
|
|
|
now := time.Now()
|
|
now := time.Now()
|
|
|
|
|
|
|
|
response := make(chan *AlertResult, 1)
|
|
response := make(chan *AlertResult, 1)
|
|
|
- go exec.Execute(rule.rule, response)
|
|
|
|
|
|
|
+ go exec.Execute(rule, response)
|
|
|
|
|
|
|
|
select {
|
|
select {
|
|
|
case <-time.After(time.Second * 5):
|
|
case <-time.After(time.Second * 5):
|
|
|
- this.responseQueue <- &AlertResult{Id: rule.rule.Id, State: "timed out", Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000)}
|
|
|
|
|
|
|
+ this.responseQueue <- &AlertResult{Id: rule.Rule.Id, State: "timed out", Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000)}
|
|
|
case r := <-response:
|
|
case r := <-response:
|
|
|
r.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
|
|
r.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
|
|
|
log.Info("Schedular: exeuction took %vms", r.Duration)
|
|
log.Info("Schedular: exeuction took %vms", r.Duration)
|
|
@@ -160,13 +128,6 @@ func (this *Scheduler) MeasureAndExecute(exec Executor, rule *AlertJob) {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-type AlertJob struct {
|
|
|
|
|
- offset int64
|
|
|
|
|
- delay bool
|
|
|
|
|
- running bool
|
|
|
|
|
- rule m.AlertRule
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
type AlertResult struct {
|
|
type AlertResult struct {
|
|
|
Id int64
|
|
Id int64
|
|
|
State string
|
|
State string
|