@@ -74,8 +74,8 @@ func (scheduler *Scheduler) updateJobs(alertRuleFn func() []m.AlertRule) {
 			job = scheduler.jobs[rule.Id]
 		} else {
 			job = &m.AlertJob{
-				Running: false,
-				Retry:   0,
+				Running:    false,
+				RetryCount: 0,
 			}
 		}

@@ -110,24 +110,28 @@ func (scheduler *Scheduler) executor(executor Executor) {
 func (scheduler *Scheduler) handleResponses() {
 	for response := range scheduler.responseQueue {
-		log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Retry, response.AlertJob.Running)
+		log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d)", response.Id, response.State, response.ActualValue, response.AlertJob.RetryCount)
 		response.AlertJob.Running = false

-		if response.State == m.AlertStatePending {
-			response.AlertJob.Retry++
-			if response.AlertJob.Retry > maxRetries {
-				response.State = m.AlertStateCritical
-				response.Description = fmt.Sprintf("Failed to run check after %d retires", maxRetries)
-				scheduler.saveState(response)
+		if response.IsResultIncomplete() {
+			response.AlertJob.RetryCount++
+			if response.AlertJob.RetryCount < maxRetries {
+				scheduler.runQueue <- response.AlertJob
+			} else {
+				saveState(&m.AlertResult{
+					Id:          response.Id,
+					State:       m.AlertStateCritical,
+					Description: fmt.Sprintf("Failed to run check after %d retries", maxRetries),
+				})
 			}
 		} else {
-			response.AlertJob.Retry = 0
-			scheduler.saveState(response)
+			response.AlertJob.RetryCount = 0
+			saveState(response)
 		}
 	}
 }

-func (scheduler *Scheduler) saveState(response *m.AlertResult) {
+func saveState(response *m.AlertResult) {
 	cmd := &m.UpdateAlertStateCommand{
 		AlertId: response.Id,
 		NewState: response.State,