Saltstack Official Linux Formula
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

177 lines
8.0KB

  1. {%- from "linux/map.jinja" import monitoring with context %}
  2. server:
  3. alert:
  4. SystemCpuIdleTooLow:
  5. {%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
  6. if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
  7. {% raw %}
  8. labels:
  9. severity: warning
  10. service: system
  11. annotations:
  12. summary: 'Idle CPU usage too low on {{ $labels.host }}'
  13. description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
  14. SystemDiskSpaceTooLow:
  15. if: 'predict_linear(disk_free[1h], 8*3600) < 0'
  16. {% raw %}
  17. for: 15m
  18. labels:
  19. severity: warning
  20. service: system
  21. annotations:
  22. summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
  23. description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
  24. {% endraw %}
  25. SystemFreeOpenFilesTooLow:
  26. if: 'predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max'
  27. {% raw %}
  28. labels:
  29. severity: warning
  30. service: system
  31. annotations:
  32. summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
  33. description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
  34. {% endraw %}
  35. SystemDiskErrors:
  36. if: 'increase(hdd_errors_total[5m]) > 0'
  37. {% raw %}
  38. labels:
  39. severity: critical
  40. service: system
  41. annotations:
  42. summary: 'Disk {{ $labels.device }} is failing'
  43. description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
  44. {% endraw %}
  45. SystemDiskSpaceFull:
  46. if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
  47. {% raw %}
  48. labels:
  49. severity: critical
  50. service: system
  51. annotations:
  52. summary: 'Disk partition {{ $labels.path }} full on {{ $labels.host }}'
  53. description: 'The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}.'
  54. {% endraw %}
  55. SystemDiskInodesTooLow:
  56. if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
  57. {% raw %}
  58. for: 15m
  59. labels:
  60. severity: warning
  61. service: system
  62. annotations:
  63. summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
  64. description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
  65. {% endraw %}
  66. SystemDiskInodesFull:
  67. if: 'disk_inodes_used / disk_inodes_total >= 0.99'
  68. {% raw %}
  69. labels:
  70. severity: critical
  71. service: system
  72. annotations:
  73. summary: 'Inodes for {{ $labels.path }} full on {{ $labels.host }}'
  74. description: 'The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}.'
  75. {% endraw %}
  76. SystemMemoryAvailableLow:
  77. {%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
  78. if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
  79. {% raw %}
  80. labels:
  81. severity: warning
  82. service: system
  83. annotations:
  84. summary: 'Free memory low on {{ $labels.host }}'
  85. description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
  86. SystemMemoryAvailableTooLow:
  87. {%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
  88. if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
  89. {% raw %}
  90. labels:
  91. severity: critical
  92. service: system
  93. annotations:
  94. summary: 'Free memory too low on {{ $labels.host }}'
  95. description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
  96. SystemLoad5TooHigh:
  97. if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
  98. {% raw %}
  99. labels:
  100. severity: warning
  101. service: system
  102. annotations:
  103. summary: 'High system load (5m) on {{ $labels.host }}'
  104. description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
  105. SystemRxPacketsDroppedTooHigh:
  106. {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
  107. if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
  108. {% raw %}
  109. labels:
  110. severity: critical
  111. service: system
  112. annotations:
  113. summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
  114. description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
  115. SystemTxPacketsDroppedTooHigh:
  116. {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
  117. if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
  118. {% raw %}
  119. labels:
  120. severity: critical
  121. service: system
  122. annotations:
  123. summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
  124. description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
  125. SystemSwapIn:
  126. {%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
  127. if: rate(swap_in[2m]) > {{ swap_in_threshold }}
  128. {% raw %}
  129. labels:
  130. severity: warning
  131. service: system
  132. annotations:
  133. summary: 'Swap input throughput too high on {{ $labels.host }}'
  134. description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
  135. SystemSwapOut:
  136. {%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
  137. if: rate(swap_out[2m]) > {{ swap_out_threshold }}
  138. {% raw %}
  139. labels:
  140. severity: warning
  141. service: system
  142. annotations:
  143. summary: 'Swap output throughput too high on {{ $labels.host }}'
  144. description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
  145. {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
  146. BondInterfaceDown:
  147. if: 'bond_status < 1'
  148. {% raw %}
  149. labels:
  150. severity: critical
  151. service: system
  152. annotations:
  153. summary: 'Bond status interface {{ $labels.bond }} is DOWN on {{ $labels.host }}'
  154. description: 'The bond interface ({{ $labels.bond }) has all ifaces in a down state on {{ $labels.host }}.'
  155. {% endraw %}
  156. BondSlaveInterfacesMinimum:
  157. if: '(sum(bond_slave_status) BY (bond,host)) / (count(bond_slave_status) BY (bond,host)) <= 0.5'
  158. {% raw %}
  159. labels:
  160. severity: critical
  161. service: system
  162. annotations:
  163. summary: 'At least half of Bond slave interfaces {{ $labels.bond }} are DOWN on {{ $labels.host }}'
  164. description: 'The bond interface ({{ $labels.bond }) has at least half of slave ifaces in a down state on {{ $labels.host }}.'
  165. {% endraw %}
  166. BondSlaveInterfaceStatus:
  167. if: 'bond_slave_status < 1'
  168. {% raw %}
  169. labels:
  170. severity: warning
  171. service: system
  172. annotations:
  173. summary: 'Bond slave interface {{ $labels.interface }} is DOWN on {{ $labels.host }} for {{ $labels.bond }}'
  174. description: 'The bond slave interface ({{ $labels.interface }) is in DOWN state for {{ $labels.bond }} on {{ $labels.host }}.'
  175. {% endraw %}
  176. {%- endif %}