alerting_model.json 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. {
  2. "alert": {
  3. "name": "Majority servers down",
  4. "frequency": 60,
  5. "notify": ["group1", "group2"],
  6. "expressions": [
  7. {
  8. "left": [
  9. {
  10. "type": "query",
  11. "refId": "A",
  12. "timeRange": {"from": "5m", "to": "now-1m"},
  13. },
  14. {
  15. "type": "function",
  16. "name": "max"
  17. }
  18. ],
  19. "operator": ">",
  20. "right": [
  21. {
  22. "type": "constant",
  23. "value": 100
  24. }
  25. ],
  26. "level": 2,
  27. }
  28. ]
  29. },
  30. "alert": {
  31. "name": "Majority servers down take2",
  32. "frequency": 60,
  33. "notify": ["group1", "group2"],
  34. "expressions": [
  35. {
  36. "left": [
  37. {
  38. "type": "query",
  39. "refId": "A",
  40. "timeRange": {"from": "5m", "to": "now-1m"},
  41. },
  42. {
  43. "type": "function",
  44. "name": "max"
  45. }
  46. ],
  47. "operator": ">",
  48. "right": [
  49. {
  50. "type": "query",
  51. "refId": "A",
  52. "timeRange": {"from": "now-1d-5m", "to": "now-1d"},
  53. },
  54. {
  55. "type": "function",
  56. "name": "max"
  57. }
  58. ],
  59. "level": 2,
  60. }
  61. ]
  62. },
  63. "alert": {
  64. "name": "CPU usage last 5min above 90%",
  65. "frequency": 60,
  66. "expressions": [
  67. {
  68. "expr": "query(#A, 5m, now, avg)",
  69. "operator": ">",
  70. "critLevel": 90,
  71. }
  72. ]
  73. },
  74. "alert": {
  75. "name": "Series count above 10",
  76. "frequency": "1m",
  77. "expressions": [
  78. {
  79. "expr": "query(#A, 5m, now, avg) | countSeries()",
  80. "operator": ">",
  81. "critLevel": 10,
  82. }
  83. ]
  84. },
  85. "alert": {
  86. "name": "Disk Free Zero in 3 days",
  87. "frequency": "1d",
  88. "expressions": [
  89. {
  90. "expr": "query(#A, 1d, now, trend(3d))",
  91. "operator": ">",
  92. "critLevel": 0,
  93. }
  94. ]
  95. },
  96. "alert": {
  97. "name": "Server requests is zero for more than 10min",
  98. "frequency": "1d",
  99. "expressions": [
  100. {
  101. "expr": "query(#A, 10m, now, sum)",
  102. "operator": "=",
  103. "critLevel": 0,
  104. }
  105. ]
  106. },
  107. "alert": {
  108. "name": "Timeouts should not be more than 0.1% of requests",
  109. "frequency": "1d",
  110. "expressions": [
  111. {
  112. "expr": "query(#A, 10m, now, sum) | subtract | query(#B, 10m, now, sum)",
  113. "operator": ">",
  114. "critLevel": 0,
  115. }
  116. ]
  117. },
  118. "alert": {
  119. "name": "CPU usage last 5min changed by more than 20% compared to last 24hours",
  120. "frequency": "1m",
  121. "value": "query(#A, 5m, now, avg)",
  122. "operator": "percent change",
  123. "threshold": "query(#A, 1d, now, avg)",
  124. },
  125. "alert": {
  126. "name": "CPU higher than 90%",
  127. "frequency": "1m",
  128. "valueExpr": "query(#A, 5m, now, avg)",
  129. "evalType": "greater than",
  130. "critLevel": 20,
  131. "warnLevel": 10,
  132. },
  133. "alert": {
  134. "name": "CPU usage last 5min changed by more than 20% compared to last 24hours",
  135. "frequency": "1m",
  136. "expr": "query(#A, 5m, now, avg) percentGreaterThan()",
  137. "evalType": "percent change",
  138. "evalExpr": "query(#A, 1d, now, avg)",
  139. "critLevel": 20,
  140. "warnLevel": 10,
  141. },
  142. "alert": {
  143. "name": "CPU usage last 5min changed by more than 20% compared to last 24hours",
  144. "frequency": "1m",
  145. "valueQuery": "query(#A, 5m, now, avg) ",
  146. "evalType": "simple", "//": "other options are: percent change, trend",
  147. "evalQuery": "query(#A, 1d, now, avg)",
  148. "comparison": "greater than",
  149. "critLevel": 20,
  150. "warnLevel": 10,
  151. },
  152. "alert": {
  153. "name": "CPU usage last 5min changed by more than 20% compared to last 24hours",
  154. "frequency": "1m",
  155. "valueQuery": "query(#A, 5m, now, avg) | Evaluate Against: Static Threshold | >200 Warn | >300 Critical",
  156. "valueQuery": "query(#A, 5m, now, avg) | Evaluate Against: Percent Change Compared To | query(#B, 5m, now, avg) | >200 Warn | >300 Critical",
  157. "valueQuery": "query(#A, 5m, now, trend) | Evaluate Against: Forecast | 7days | >200 Warn | >300 Critical",
  158. "evalType": "simple", "//": "other options are: percent change, trend",
  159. "evalQuery": "query(#A, 1d, now, avg)",
  160. "comparison": "greater than",
  161. "critLevel": 20,
  162. "warnLevel": 10,
  163. },
  164. }