# Prometheus alert definitions for host-level system metrics.
#
# This file is a Jinja template that renders to YAML. Each alert reads its
# threshold from prometheus_server (defined outside this section) under
# alert.<AlertName>.var.threshold and falls back to the default shown in the
# corresponding "set" statement.
#
# The raw/endraw sections keep the Prometheus annotation placeholders
# ($labels, $value) from being evaluated by Jinja. Where a threshold has to
# appear in a description, the raw section is closed right before the
# threshold is interpolated.
server:
  alert:
    # Fires when the 5-minute average of cpu_usage_idle for cpu-total is below
    # the threshold (default 10).
    AvgCPUUsageIdle:
      {%- set cpu_idle_threshold = prometheus_server.get('alert', {}).get('AvgCPUUsageIdle', {}).get('var', {}).get('threshold', 10) %}
      if: 'avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Idle CPU usage too low on {{ $labels.host }}'
        description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ cpu_idle_threshold }})'

    # Fires when a linear extrapolation of the last hour of disk_free predicts
    # a value below zero 8*3600 seconds (8 hours) from now.
    PredictLinearDiskFree:
      if: 'predict_linear(disk_free[1h], 8*3600) < 0'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
        description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
      {% endraw %}

    # Same prediction as PredictLinearDiskFree, applied to disk_inodes_free.
    PredictLinearDiskInodesFree:
      if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
        description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
      {% endraw %}

    # Fires when the 5-minute average of mem_available_percent is below the
    # threshold (default 10).
    AvgMemAvailablePercent:
      {%- set mem_avail_threshold = prometheus_server.get('alert', {}).get('AvgMemAvailablePercent', {}).get('var', {}).get('threshold', 10) %}
      if: 'avg_over_time(mem_available_percent[5m]) < {{ mem_avail_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free memory too low on {{ $labels.host }}'
        description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})'

    # Fires when system_load5 divided by system_n_cpus exceeds the threshold
    # (default 3). The threshold is held in a variable like the other alerts.
    SystemLoad5:
      {%- set system_load5_threshold = prometheus_server.get('alert', {}).get('SystemLoad5', {}).get('var', {}).get('threshold', 3) %}
      if: 'system_load5 / system_n_cpus > {{ system_load5_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'High system load (5m) on {{ $labels.host }}'
        description: 'High system load (5m) on node {{ $labels.host }}'
      {% endraw %}

    # Fires when the 1-minute average of net_drop_in exceeds the threshold
    # (default 100).
    NetworkRxPacketsDropped:
      {%- set net_rx_dropped_threshold = prometheus_server.get('alert', {}).get('NetworkRxPacketsDropped', {}).get('var', {}).get('threshold', 100) %}
      if: 'avg_over_time(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
        description: 'The average number of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_rx_dropped_threshold }})'

    # Fires when the 1-minute average of net_drop_out exceeds the threshold
    # (default 100).
    NetworkTxPacketsDropped:
      {%- set net_tx_dropped_threshold = prometheus_server.get('alert', {}).get('NetworkTxPacketsDropped', {}).get('var', {}).get('threshold', 100) %}
      if: 'avg_over_time(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
        description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'

    # Fires when the 1-minute average of swap_used_percent exceeds the
    # threshold (default 80).
    SystemSwapUsed:
      {%- set swap_used_threshold = prometheus_server.get('alert', {}).get('SystemSwapUsed', {}).get('var', {}).get('threshold', 80) %}
      if: 'avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Swap usage too high on {{ $labels.host }}'
        description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }})'

    # Fires when the per-second rate of swap_in over 2 minutes exceeds the
    # threshold (default 1024 * 1024).
    SystemSwapIn:
      {%- set swap_in_threshold = prometheus_server.get('alert', {}).get('SystemSwapIn', {}).get('var', {}).get('threshold', 1024 * 1024) %}
      if: 'rate(swap_in[2m]) > {{ swap_in_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Swap input throughput too high on {{ $labels.host }}'
        description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }})'

    # Fires when the per-second rate of swap_out over 2 minutes exceeds the
    # threshold (default 1024 * 1024).
    SystemSwapOut:
      {%- set swap_out_threshold = prometheus_server.get('alert', {}).get('SystemSwapOut', {}).get('var', {}).get('threshold', 1024 * 1024) %}
      if: 'rate(swap_out[2m]) > {{ swap_out_threshold }}'
      {% raw %}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Swap output throughput too high on {{ $labels.host }}'
        description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }})'