Saltstack Official Linux Formula
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

243 lines
9.5KB

  1. {%- from "linux/map.jinja" import monitoring with context %}
  2. server:
  3. alert:
  4. SystemCpuFullWarning:
  5. {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
  6. if: >-
  7. 100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
  8. {% raw %}
  9. for: 2m
  10. labels:
  11. severity: warning
  12. service: system
  13. annotations:
  14. summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
  15. description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
  16. SystemLoadTooHighWarning:
  17. {%- endraw %}
  18. {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
  19. if: >-
  20. system_load5 / system_n_cpus > {{ load_threshold }}
  21. {%- raw %}
  22. for: 5m
  23. labels:
  24. severity: warning
  25. service: system
  26. annotations:
  27. summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
  28. description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
  29. SystemLoadTooHighCritical:
  30. {%- endraw %}
  31. {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
  32. if: >-
  33. system_load5 / system_n_cpus > {{ load_threshold }}
  34. {%- raw %}
  35. for: 5m
  36. labels:
  37. severity: warning
  38. service: system
  39. annotations:
  40. summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
  41. description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
  42. SystemDiskFullWarning:
  43. {%- endraw %}
  44. {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
  45. if: >-
  46. disk_used_percent >= {{ disk_threshold }}
  47. {%- raw %}
  48. for: 2m
  49. labels:
  50. severity: warning
  51. service: system
  52. annotations:
  53. summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
  54. description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
  55. SystemDiskFullMajor:
  56. {%- endraw %}
  57. {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
  58. if: >-
  59. disk_used_percent >= {{ disk_threshold }}
  60. {%- raw %}
  61. for: 2m
  62. labels:
  63. severity: major
  64. service: system
  65. annotations:
  66. summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
  67. description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
  68. SystemDiskInodesFullWarning:
  69. {%- endraw %}
  70. {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
  71. if: >-
  72. 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
  73. for: 2m
  74. labels:
  75. severity: warning
  76. service: system
  77. annotations:
  78. summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
  79. description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
  80. SystemDiskInodesFullMajor:
  81. {%- endraw %}
  82. {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
  83. if: >-
  84. 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
  85. for: 2m
  86. labels:
  87. severity: major
  88. service: system
  89. annotations:
  90. summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
  91. description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
  92. SystemDiskErrorsTooHigh:
  93. if: >-
  94. increase(hdd_errors_total[1m]) > 0
  95. for: 5m
  96. labels:
  97. severity: warning
  98. service: system
  99. annotations:
  100. summary: "Disk {{ $labels.device }} is failing"
  101. description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
  102. SystemMemoryFullWarning:
  103. {%- endraw %}
  104. {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
  105. if: >-
  106. mem_used_percent >= {{ mem_threshold }}
  107. for: 2m
  108. labels:
  109. severity: warning
  110. service: system
  111. annotations:
  112. summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
  113. description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
  114. SystemMemoryFullMajor:
  115. {%- endraw %}
  116. {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
  117. if: >-
  118. mem_used_percent >= {{ mem_threshold }}
  119. for: 2m
  120. labels:
  121. severity: major
  122. service: system
  123. annotations:
  124. summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
  125. description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
  126. SystemSwapFullWarning:
  127. {%- endraw %}
  128. {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
  129. if: >-
  130. swap_used_percent >= {{ swap_threshold }}
  131. for: 2m
  132. labels:
  133. severity: warning
  134. service: system
  135. annotations:
  136. summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
  137. description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
  138. SystemSwapFullMinor:
  139. {%- endraw %}
  140. {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
  141. if: >-
  142. swap_used_percent >= {{ swap_threshold }}
  143. for: 2m
  144. labels:
  145. severity: minor
  146. service: system
  147. annotations:
  148. summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
  149. description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
  150. SystemRxPacketsDroppedTooHigh:
  151. {%- endraw %}
  152. {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
  153. if: >-
  154. increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
  155. labels:
  156. severity: warning
  157. service: system
  158. annotations:
  159. summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
  160. description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
  161. SystemRxPacketsDroppedLongTermTooHigh:
  162. if: >-
  163. increase(net_drop_in[1m]) > 0
  164. for: 10m
  165. labels:
  166. severity: major
  167. service: system
  168. annotations:
  169. summary: "Received packets long term dropping"
  170. description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
  171. SystemTxPacketsDroppedTooHigh:
  172. {%- endraw %}
  173. {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
  174. if: >-
  175. increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
  176. labels:
  177. severity: warning
  178. service: system
  179. annotations:
  180. summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
  181. description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
  182. CronProcessDown:
  183. if: >-
  184. procstat_running{process_name="cron"} == 0
  185. labels:
  186. severity: critical
  187. service: system
  188. annotations:
  189. summary: "Cron process is down"
  190. description: "The cron process on the {{ $labels.host }} node is down."
  191. SshdProcessDown:
  192. if: >-
  193. procstat_running{process_name="sshd"} == 0
  194. labels:
  195. severity: critical
  196. service: system
  197. annotations:
  198. summary: "SSH process is down"
  199. description: "The SSH process on the {{ $labels.host }} node is down."
  200. SshFailedLoginsTooHigh:
  201. {%- endraw %}
  202. {%- set threshold = monitoring.failed_auths_threshold.warn %}
  203. if: >-
  204. increase(failed_logins_total[5m]) > {{ threshold }}
  205. labels:
  206. severity: warning
  207. service: system
  208. annotations:
  209. summary: "{{ threshold }}{%- raw %} failed SSH logins"
  210. description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
  211. {%- endraw %}
  212. {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
  213. {%- raw %}
  214. BondInterfaceDown:
  215. if: >-
  216. bond_status < 1
  217. labels:
  218. severity: critical
  219. service: system
  220. annotations:
  221. summary: "{{ $labels.bond }} bond interface is down"
  222. description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
  223. BondInterfaceSlaveDown:
  224. if: >-
  225. bond_slave_status < 1
  226. labels:
  227. severity: warning
  228. service: system
  229. annotations:
  230. summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
  231. description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
  232. BondInterfaceSlaveDownMajor:
  233. if: >-
  234. sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
  235. labels:
  236. severity: major
  237. service: system
  238. annotations:
  239. summary: "50% of bond interface slaves {{ $labels.bond }} are down"
  240. description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
  241. {% endraw %}
  242. {%- endif %}