Saltstack Official Linux Formula
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

prometheus.yml 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. {%- from "linux/map.jinja" import monitoring with context %}
  2. server:
  3. alert:
  4. SystemCpuFullWarning:
  5. {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
  6. if: >-
  7. 100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
  8. {% raw %}
  9. for: 2m
  10. labels:
  11. severity: warning
  12. service: system
  13. annotations:
  14. summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
  15. description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
  16. SystemLoadTooHighWarning:
  17. {%- endraw %}
  18. {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
  19. if: >-
  20. system_load5 / system_n_cpus > {{ load_threshold }}
  21. {%- raw %}
  22. for: 5m
  23. labels:
  24. severity: warning
  25. service: system
  26. annotations:
  27. summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
  28. description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
  29. SystemLoadTooHighCritical:
  30. {%- endraw %}
  31. {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
  32. if: >-
  33. system_load5 / system_n_cpus > {{ load_threshold }}
  34. {%- raw %}
  35. for: 5m
  36. labels:
  37. severity: warning
  38. service: system
  39. annotations:
  40. summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
  41. description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
  42. SystemDiskFullWarning:
  43. {%- endraw %}
  44. {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
  45. if: >-
  46. disk_used_percent >= {{ disk_threshold }}
  47. {%- raw %}
  48. for: 2m
  49. labels:
  50. severity: warning
  51. service: system
  52. annotations:
  53. summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
  54. description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
  55. SystemDiskFullMajor:
  56. {%- endraw %}
  57. {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
  58. if: >-
  59. disk_used_percent >= {{ disk_threshold }}
  60. {%- raw %}
  61. for: 2m
  62. labels:
  63. severity: major
  64. service: system
  65. annotations:
  66. summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
  67. description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
  68. SystemDiskInodesFullWarning:
  69. {%- endraw %}
  70. {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
  71. if: >-
  72. 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
  73. for: 2m
  74. labels:
  75. severity: warning
  76. service: system
  77. annotations:
  78. summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
  79. description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
  80. SystemDiskInodesFullMajor:
  81. {%- endraw %}
  82. {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
  83. if: >-
  84. 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
  85. for: 2m
  86. labels:
  87. severity: major
  88. service: system
  89. annotations:
  90. summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
  91. description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
  92. SystemDiskErrorsTooHigh:
  93. if: >-
  94. increase(hdd_errors_total[1m]) > 0
  95. for: 5m
  96. labels:
  97. severity: warning
  98. service: system
  99. annotations:
  100. summary: "Disk {{ $labels.device }} is failing"
  101. description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
  102. SystemMemoryFullWarning:
  103. {%- endraw %}
  104. {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
  105. if: >-
  106. mem_used_percent >= {{ mem_threshold }}
  107. for: 2m
  108. labels:
  109. severity: warning
  110. service: system
  111. annotations:
  112. summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
  113. description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
  114. SystemMemoryFullMajor:
  115. {%- endraw %}
  116. {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
  117. if: >-
  118. mem_used_percent >= {{ mem_threshold }}
  119. for: 2m
  120. labels:
  121. severity: major
  122. service: system
  123. annotations:
  124. summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
  125. description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
  126. SystemSwapFullWarning:
  127. {%- endraw %}
  128. {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
  129. if: >-
  130. swap_used_percent >= {{ swap_threshold }}
  131. for: 2m
  132. labels:
  133. severity: warning
  134. service: system
  135. annotations:
  136. summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
  137. description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
  138. SystemSwapFullMinor:
  139. {%- endraw %}
  140. {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
  141. if: >-
  142. swap_used_percent >= {{ swap_threshold }}
  143. for: 2m
  144. labels:
  145. severity: minor
  146. service: system
  147. annotations:
  148. summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
  149. description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
  150. SystemRxPacketsDroppedTooHigh:
  151. {%- endraw %}
  152. {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
  153. if: >-
  154. increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
  155. labels:
  156. severity: warning
  157. service: system
  158. annotations:
  159. summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
  160. description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
  161. SystemRxPacketsDroppedLongTermTooHigh:
  162. if: >-
  163. increase(net_drop_in[1m]) > 0
  164. for: 10m
  165. labels:
  166. severity: major
  167. service: system
  168. annotations:
  169. summary: "Received packets long term dropping"
  170. description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
  171. SystemTxPacketsDroppedTooHigh:
  172. {%- endraw %}
  173. {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
  174. if: >-
  175. increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
  176. labels:
  177. severity: warning
  178. service: system
  179. annotations:
  180. summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
  181. description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
  182. CronProcessDown:
  183. if: >-
  184. procstat_running{process_name="cron"} == 0
  185. labels:
  186. severity: critical
  187. service: system
  188. annotations:
  189. summary: "Cron process is down"
  190. description: "The cron process on the {{ $labels.host }} node is down."
  191. SshdProcessDown:
  192. if: >-
  193. procstat_running{process_name="sshd"} == 0
  194. labels:
  195. severity: critical
  196. service: system
  197. annotations:
  198. summary: "SSH process is down"
  199. description: "The SSH process on the {{ $labels.host }} node is down."
  200. SshFailedLoginsTooHigh:
  201. {%- endraw %}
  202. {%- set threshold = monitoring.failed_auths_threshold.warn %}
  203. if: >-
  204. increase(failed_logins_total[5m]) > {{ threshold }}
  205. labels:
  206. severity: warning
  207. service: system
  208. annotations:
  209. summary: "{{ threshold }}{%- raw %} failed SSH logins"
  210. description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
  211. PacketsDroppedByCpuMinor:
  212. {%- endraw %}
  213. {%- set packets_dropped_minor_threshold = monitoring.packets_dropped_per_cpu_threshold.minor %}
  214. if: >-
  215. floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_minor_threshold }}
  216. labels:
  217. severity: minor
  218. service: system
  219. annotations:
  220. summary: "CPU dropped {{ packets_dropped_minor_threshold }}{%- raw %} packets"
  221. description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
  222. PacketsDroppedByCpuMajor:
  223. {%- endraw %}
  224. {%- set packets_dropped_major_threshold = monitoring.packets_dropped_per_cpu_threshold.major %}
  225. if: >-
  226. floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_major_threshold }}
  227. labels:
  228. severity: major
  229. service: system
  230. annotations:
  231. summary: "CPU dropped {{ packets_dropped_major_threshold }}{%- raw %} packets"
  232. description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
  233. NetRxActionByCpuWarning:
  234. {%- endraw %}
  235. {%- set net_rx_action_warning_threshold = monitoring.net_rx_action_per_cpu_threshold.warning %}
  236. if: >-
  237. floor(increase(nstat_time_squeeze[1d])) > {{ net_rx_action_warning_threshold }}
  238. labels:
  239. severity: warning
  240. service: system
  241. annotations:
  242. summary: "CPU terminated {{ net_rx_action_warning_threshold }}{%- raw %} net_rx_action loops"
  243. description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours. Modify the net.core.netdev_budget kernel parameter."
  244. NetRxActionByCpuMinor:
  245. {%- endraw %}
  246. {%- set net_rx_action_minor_threshold = monitoring.net_rx_action_per_cpu_threshold.minor %}
  247. if: >-
  248. floor(increase(nstat_time_squeeze[1d])) > {{ net_rx_action_minor_threshold }}
  249. labels:
  250. severity: minor
  251. service: system
  252. annotations:
  253. summary: "CPU terminated {{ net_rx_action_minor_threshold }}{%- raw %} net_rx_action loops"
  254. description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours. Modify the net.core.netdev_budget kernel parameter."
  255. {%- endraw %}
  256. {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
  257. {%- raw %}
  258. BondInterfaceDown:
  259. if: >-
  260. bond_status < 1
  261. labels:
  262. severity: critical
  263. service: system
  264. annotations:
  265. summary: "{{ $labels.bond }} bond interface is down"
  266. description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
  267. BondInterfaceSlaveDown:
  268. if: >-
  269. bond_slave_status < 1
  270. labels:
  271. severity: warning
  272. service: system
  273. annotations:
  274. summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
  275. description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
  276. BondInterfaceSlaveDownMajor:
  277. if: >-
  278. sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
  279. labels:
  280. severity: major
  281. service: system
  282. annotations:
  283. summary: "50% of bond interface slaves {{ $labels.bond }} are down"
  284. description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
  285. {% endraw %}
  286. {%- endif %}