# ExternalMirrors/linux-formula: mirror of https://github.com/salt-formulas/salt-formula-linux


			
				
					
						
						
							
							{%- from "linux/map.jinja" import monitoring with context %}
# Alert definitions keyed by alert name under server.alert.
# Thresholds are read from the `monitoring` map imported above. The
# $labels / $value placeholders used in annotations are wrapped in Jinja
# raw blocks so that Jinja passes them through untouched.
server:
  alert:
    # Warning: 100 minus the 5-minute average of cpu_usage_idle for the
    # "cpu-total" series is above monitoring.cpu_usage_percentage.warn,
    # held for 2 minutes.
    # The raw block is opened with a trimming tag, like every other alert in
    # this file, so no whitespace-only line is rendered after the folded
    # `if` expression.
    SystemCpuFullWarning:
      {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
      if: >-
        100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
        description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
    # Warning: system_load5 divided by system_n_cpus is above
    # monitoring.system_load_threshold.warn, held for 5 minutes.
    # In the summary, the trimming endraw tag removes the space that precedes
    # it, so the separating space is placed AFTER the tag; otherwise the text
    # renders as "System load is<value>" with no gap.
    SystemLoadTooHighWarning:
      {%- endraw %}
      {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
      if: >-
        system_load5 / system_n_cpus > {{ load_threshold }}
      {%- raw %}
      for: 5m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
        description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
    # Critical: system_load5 divided by system_n_cpus is above
    # monitoring.system_load_threshold.crit, held for 5 minutes.
    # The severity label is `critical` so that it agrees with the alert name
    # and with the crit threshold it evaluates.
    # In the summary, the trimming endraw tag removes the space that precedes
    # it, so the separating space is placed AFTER the tag.
    SystemLoadTooHighCritical:
      {%- endraw %}
      {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
      if: >-
        system_load5 / system_n_cpus > {{ load_threshold }}
      {%- raw %}
      for: 5m
      labels:
        severity: critical
        service: system
      annotations:
        summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
        description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
    # Warning: disk_used_percent is at or above
    # monitoring.disk_usage_percentage.warn, held for 2 minutes.
    # The space after the endraw tag in the summary is intentional: the
    # trimming tag removes the space written before it.
    SystemDiskFullWarning:
      {%- endraw %}
      {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
      if: >-
        disk_used_percent >= {{ disk_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
    # Major: disk_used_percent is at or above
    # monitoring.disk_usage_percentage.major, held for 2 minutes.
    # The space after the endraw tag in the summary is intentional: the
    # trimming tag removes the space written before it.
    SystemDiskFullMajor:
      {%- endraw %}
      {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
      if: >-
        disk_used_percent >= {{ disk_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
    # Warning: 100 * disk_inodes_used / disk_inodes_total is at or above
    # monitoring.inodes_usage_percentage.warn, held for 2 minutes.
    # The raw block is re-opened only inside the summary line; the plain
    # `for` and `labels` lines in between contain nothing Jinja would expand.
    SystemDiskInodesFullWarning:
      {%- endraw %}
      {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
      if: >-
        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
    # Major: 100 * disk_inodes_used / disk_inodes_total is at or above
    # monitoring.inodes_usage_percentage.major, held for 2 minutes.
    # The raw block is re-opened only inside the summary line.
    SystemDiskInodesFullMajor:
      {%- endraw %}
      {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
      if: >-
        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
    # Warning: hdd_errors_total increased over the last minute, and the
    # condition has held for 5 minutes. No configurable threshold is used.
    SystemDiskErrorsTooHigh:
      if: 'increase(hdd_errors_total[1m]) > 0'
      for: 5m
      labels:
        service: system
        severity: warning
      annotations:
        summary: 'Disk {{ $labels.device }} is failing'
        description: >-
          The {{ $labels.device }} disk on the {{ $labels.host }} node is
          reporting errors for 5 minutes.
    # Warning: mem_used_percent is at or above
    # monitoring.memory_usage_percentage.warn, held for 2 minutes.
    # The raw block is re-opened only inside the summary line.
    SystemMemoryFullWarning:
      {%- endraw %}
      {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
      if: >-
        mem_used_percent >= {{ mem_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
    # Major: mem_used_percent is at or above
    # monitoring.memory_usage_percentage.major, held for 2 minutes.
    # The raw block is re-opened only inside the summary line.
    SystemMemoryFullMajor:
      {%- endraw %}
      {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
      if: >-
        mem_used_percent >= {{ mem_threshold }}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
    # Warning: swap_used_percent is at or above
    # monitoring.swap_usage_percentage.warn, held for 2 minutes.
    # The raw block is re-opened only inside the summary line.
    SystemSwapFullWarning:
      {%- endraw %}
      {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
      if: >-
        swap_used_percent >= {{ swap_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
    # Minor: swap_used_percent is at or above
    # monitoring.swap_usage_percentage.minor, held for 2 minutes.
    # The raw block is re-opened only inside the summary line.
    SystemSwapFullMinor:
      {%- endraw %}
      {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
      if: >-
        swap_used_percent >= {{ swap_threshold }}
      for: 2m
      labels:
        severity: minor
        service: system
      annotations:
        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
    # Warning: the 1-minute increase of net_drop_in exceeds
    # monitoring.rx_packets_dropped_threshold.warn. No `for` duration is set
    # on this alert, and the threshold is used without a float cast.
    SystemRxPacketsDroppedTooHigh:
      {%- endraw %}
      {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
      if: >-
        increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
    # Major: the 1-minute increase of net_drop_in is above zero, and the
    # condition has held for 10 minutes. No configurable threshold is used.
    SystemRxPacketsDroppedLongTermTooHigh:
      if: 'increase(net_drop_in[1m]) > 0'
      for: 10m
      labels:
        service: system
        severity: major
      annotations:
        summary: 'Received packets long term dropping'
        description: >-
          {{ $value }} packets received by the {{ $labels.interface }}
          interface on the {{ $labels.host }} node were dropped during the
          last 10 minutes.
    # Warning: the 1-minute increase of net_drop_out exceeds
    # monitoring.tx_packets_dropped_threshold.warn. No `for` duration is set
    # on this alert, and the threshold is used without a float cast.
    SystemTxPacketsDroppedTooHigh:
      {%- endraw %}
      {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
      if: >-
        increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
        description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
    # Critical: procstat_running for process_name "cron" equals 0.
    # No `for` duration is set on this alert.
    CronProcessDown:
      if: 'procstat_running{process_name="cron"} == 0'
      labels:
        service: system
        severity: critical
      annotations:
        summary: 'Cron process is down'
        description: >-
          The cron process on the {{ $labels.host }} node is down.
    # Critical: procstat_running for process_name "sshd" equals 0.
    # No `for` duration is set on this alert.
    SshdProcessDown:
      if: 'procstat_running{process_name="sshd"} == 0'
      labels:
        service: system
        severity: critical
      annotations:
        summary: 'SSH process is down'
        description: >-
          The SSH process on the {{ $labels.host }} node is down.
    # Warning: the 5-minute increase of failed_logins_total exceeds
    # monitoring.failed_auths_threshold.warn. No `for` duration is set.
    SshFailedLoginsTooHigh:
      {%- endraw %}
      {%- set threshold = monitoring.failed_auths_threshold.warn %}
      if: >-
        increase(failed_logins_total[5m]) > {{ threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ threshold }}{%- raw %} failed SSH logins"
        description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
    # Minor: the floored 24-hour increase of nstat_packet_drop exceeds
    # monitoring.packets_dropped_per_cpu_threshold.minor.
    # No `for` duration is set on this alert.
    PacketsDroppedByCpuMinor:
      {%- endraw %}
      {%- set packets_dropped_minor_threshold = monitoring.packets_dropped_per_cpu_threshold.minor %}
      if: >-
        floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_minor_threshold }}
      labels:
        severity: minor
        service: system
      annotations:
        summary: "CPU dropped {{ packets_dropped_minor_threshold }}{%- raw %} packets"
        description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
    # Major: the floored 24-hour increase of nstat_packet_drop exceeds
    # monitoring.packets_dropped_per_cpu_threshold.major.
    # No `for` duration is set on this alert.
    PacketsDroppedByCpuMajor:
      {%- endraw %}
      {%- set packets_dropped_major_threshold = monitoring.packets_dropped_per_cpu_threshold.major %}
      if: >-
        floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_major_threshold }}
      labels:
        severity: major
        service: system
      annotations:
        summary: "CPU dropped {{ packets_dropped_major_threshold }}{%- raw %} packets"
        description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
    # Warning: the floored 24-hour increase of nstat_time_squeeze exceeds
    # monitoring.net_rx_action_per_cpu_threshold.warning.
    # No `for` duration is set on this alert.
    # NOTE(review): this reads the `.warning` key, whereas the other
    # warning-level alerts in this file read `.warn` - confirm the key name
    # against linux/map.jinja.
    NetRxActionByCpuWarning:
      {%- endraw %}
      {%- set net_rx_action_warning_threshold = monitoring.net_rx_action_per_cpu_threshold.warning %}
      if: >-
        floor(increase(nstat_time_squeeze[24h])) > {{ net_rx_action_warning_threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "CPU terminated {{ net_rx_action_warning_threshold }}{%- raw %} net_rx_action loops"
        description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours."
    # Minor: the floored 24-hour increase of nstat_time_squeeze exceeds
    # monitoring.net_rx_action_per_cpu_threshold.minor.
    # No `for` duration is set on this alert.
    NetRxActionByCpuMinor:
      {%- endraw %}
      {%- set net_rx_action_minor_threshold = monitoring.net_rx_action_per_cpu_threshold.minor %}
      if: >-
        floor(increase(nstat_time_squeeze[24h])) > {{ net_rx_action_minor_threshold }}
      labels:
        severity: minor
        service: system
      annotations:
        summary: "CPU terminated {{ net_rx_action_minor_threshold }}{%- raw %} net_rx_action loops"
        description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours."
{%- endraw %}
{#- Bond alerts are emitted only when monitoring.bond_status.interfaces is defined and non-empty. #}
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
{%- raw %}
    # Critical: bond_status is below 1. No `for` duration is set.
    BondInterfaceDown:
      if: 'bond_status < 1'
      labels:
        service: system
        severity: critical
      annotations:
        summary: '{{ $labels.bond }} bond interface is down'
        description: >-
          The {{ $labels.bond }} bond interface on the {{ $labels.host }}
          node has all ifaces down.
    # Warning: bond_slave_status is below 1 for a slave interface.
    # No `for` duration is set on this alert.
    BondInterfaceSlaveDown:
      if: 'bond_slave_status < 1'
      labels:
        service: system
        severity: warning
      annotations:
        summary: >-
          {{ $labels.bond }} bond interface slave {{ $labels.interface }} is
          down
        description: >-
          The {{ $labels.bond }} bond interface slave
          {{ $labels.interface }} on the {{ $labels.host }} node is down.
    # Major: for each (bond, host) pair, the sum of bond_slave_status is at
    # most half of the number of bond_slave_status series for that pair.
    # The count is grouped by (bond,host) as well: an ungrouped count yields
    # a single label-less sample, which `on (bond,host)` can never pair with
    # the grouped sum, so the rule would never fire.
    # NOTE(review): the value passed to the description is the left-hand
    # sum of bond_slave_status, while the text calls it the number of slaves
    # that are down - confirm the intended wording.
    BondInterfaceSlaveDownMajor:
      if: >-
        sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
      labels:
        severity: major
        service: system
      annotations:
        summary: "50% of bond interface slaves {{ $labels.bond }} are down"
        description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
{% endraw %}
{%- endif %}