{%- from "linux/map.jinja" import monitoring with context %}
# Prometheus alert rules for the base system: CPU, load, disk, inodes, memory,
# swap, packet drops, core daemons and (optionally) bonding.
# Numeric thresholds are read from the `monitoring` mapping imported above.
# Alert-time templating (labels / value placeholders) is wrapped in Jinja raw
# sections so it reaches the rendered YAML untouched.
server:
  alert:
    # CPU: 100 minus the 5-minute average idle percentage of the cpu-total
    # series exceeds the warn threshold; must hold for 2 minutes.
    SystemCpuFullWarning:
      {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
      if: >-
        100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
        description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
    # Load: 5-minute load divided by CPU count exceeds the warn threshold for
    # 5 minutes. The space after the closing raw tag in the summary is
    # required: the leading dash on that tag strips the whitespace before it,
    # so without the space the threshold would be glued to the word "is".
    SystemLoadTooHighWarning:
      {%- endraw %}
      {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
      if: >-
        system_load5 / system_n_cpus > {{ load_threshold }}
      {%- raw %}
      for: 5m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
        description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
    # Load, critical tier: same expression compared against the crit
    # threshold, and labelled critical so it is distinguishable from the
    # warning tier.
    SystemLoadTooHighCritical:
      {%- endraw %}
      {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
      if: >-
        system_load5 / system_n_cpus > {{ load_threshold }}
      {%- raw %}
      for: 5m
      labels:
        severity: critical
        service: system
      annotations:
        summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
        description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
    # Disk: used percentage is at or above the warn threshold for 2 minutes.
    SystemDiskFullWarning:
      {%- endraw %}
      {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
      if: >-
        disk_used_percent >= {{ disk_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
# ---- Disk capacity (major tier), inode usage and disk error alerts ----
    # Disk: used percentage is at or above the major threshold for 2 minutes.
    SystemDiskFullMajor:
      {%- endraw %}
      {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
      if: >-
        disk_used_percent >= {{ disk_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
    # Inodes: used/total ratio, expressed as a percentage, is at or above the
    # warn threshold for 2 minutes.
    SystemDiskInodesFullWarning:
      {%- endraw %}
      {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
      if: >-
        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
    # Inodes, major tier: same ratio compared against the major threshold.
    SystemDiskInodesFullMajor:
      {%- endraw %}
      {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
      if: >-
        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
    # Disk errors: the error counter grew within the last minute, and that
    # condition has held for 5 minutes. No configurable threshold is used.
    SystemDiskErrorsTooHigh:
      if: >-
        increase(hdd_errors_total[1m]) > 0
      for: 5m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "Disk {{ $labels.device }} is failing"
        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
# ---- Memory, swap and received-packet drop alerts ----
    # Memory: used percentage is at or above the warn threshold for 2 minutes.
    SystemMemoryFullWarning:
      {%- endraw %}
      {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
      if: >-
        mem_used_percent >= {{ mem_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
    # Memory, major tier: same metric compared against the major threshold.
    SystemMemoryFullMajor:
      {%- endraw %}
      {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
      if: >-
        mem_used_percent >= {{ mem_threshold }}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
    # Swap: used percentage is at or above the warn threshold for 2 minutes.
    SystemSwapFullWarning:
      {%- endraw %}
      {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
      if: >-
        swap_used_percent >= {{ swap_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
    # Swap, minor tier: same metric compared against the minor threshold.
    SystemSwapFullMinor:
      {%- endraw %}
      {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
      if: >-
        swap_used_percent >= {{ swap_threshold }}
      for: 2m
      labels:
        severity: minor
        service: system
      annotations:
        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
    # Rx drops: increase of the inbound drop counter over one minute exceeds
    # the warn threshold. There is no hold period on this rule, and the
    # threshold is inserted as configured (no float cast).
    SystemRxPacketsDroppedTooHigh:
      {%- endraw %}
      {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
      if: >-
        increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
# ---- Long-term Rx drops, Tx drops, core daemons and SSH login failures ----
    # Rx drops, long term: any growth of the inbound drop counter within a
    # one-minute window, sustained for 10 minutes.
    SystemRxPacketsDroppedLongTermTooHigh:
      if: >-
        increase(net_drop_in[1m]) > 0
      for: 10m
      labels:
        severity: major
        service: system
      annotations:
        summary: "Received packets long term dropping"
        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
    # Tx drops: increase of the outbound drop counter over one minute exceeds
    # the warn threshold. There is no hold period on this rule.
    SystemTxPacketsDroppedTooHigh:
      {%- endraw %}
      {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
      if: >-
        increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
        description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
    # Cron: the running-process gauge for process_name "cron" equals zero.
    CronProcessDown:
      if: >-
        procstat_running{process_name="cron"} == 0
      labels:
        severity: critical
        service: system
      annotations:
        summary: "Cron process is down"
        description: "The cron process on the {{ $labels.host }} node is down."
    # SSH daemon: the running-process gauge for process_name "sshd" equals zero.
    SshdProcessDown:
      if: >-
        procstat_running{process_name="sshd"} == 0
      labels:
        severity: critical
        service: system
      annotations:
        summary: "SSH process is down"
        description: "The SSH process on the {{ $labels.host }} node is down."
    # SSH logins: increase of the failed-login counter over 5 minutes exceeds
    # the configured warn threshold.
    SshFailedLoginsTooHigh:
      {%- endraw %}
      {%- set threshold = monitoring.failed_auths_threshold.warn %}
      if: >-
        increase(failed_logins_total[5m]) > {{ threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ threshold }}{%- raw %} failed SSH logins"
        description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
{%- endraw %}
{#- Bonding alerts are only emitted when monitoring.bond_status.interfaces is defined and non-empty. #}
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
{%- raw %}
    # Bond: the bond status gauge is below 1.
    BondInterfaceDown:
      if: >-
        bond_status < 1
      labels:
        severity: critical
        service: system
      annotations:
        summary: "{{ $labels.bond }} bond interface is down"
        description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
    # Bond slave: an individual slave status gauge is below 1.
    BondInterfaceSlaveDown:
      if: >-
        bond_slave_status < 1
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
        description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
    # Bond slaves, major tier: per (bond, host), the sum of slave statuses is
    # at most half of the slave count. Both aggregations must be grouped by
    # (bond,host); an ungrouped count carries no labels, so the on(bond,host)
    # matching would find no pairs and the rule could never fire.
    # NOTE(review): the templated value in the description comes from the
    # left-hand sum, which presumably counts slaves reporting status 1 rather
    # than slaves that are down - confirm the wording against the exporter.
    BondInterfaceSlaveDownMajor:
      if: >-
        sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
      labels:
        severity: major
        service: system
      annotations:
        summary: "50% of bond interface slaves {{ $labels.bond }} are down"
        description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
{%- endraw %}
{%- endif %}