Change alert names, severities, and descriptions. Closes-bug: PROD-19718 Change-Id: I238fbcd51cf48389b504ccb531ba9b2bc9dd4be6

6 år sedan · d40d0f1e24
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -273,6 +273,9 @@

 {% set monitoring = salt['grains.filter_by']({
    'default': {
        'bond_status': {
            'interfaces': False
        },
        'zombie': {
            'warn': 3,
            'crit': 7,
@@ -298,30 +301,43 @@
            'interface_regex': '^[a-z0-9]+$',
            'ignore_selected': False,
        },
        'bond_status': {
            'interfaces': False
        'cpu_usage_percentage': {
              'warn': 90.0,
        },
        'memory_usage_percentage': {
            'warn': 90.0,
            'major': 95.0,
        },
        'cpu_idle_percentage': {
              'warn': 10.0,
        'disk_usage_percentage': {
            'warn': 85.0,
            'major': 95.0,
        },
        'free_memory_percentage': {
              'warn': 10.0,
              'crit': 5.0,
        'swap_usage_percentage': {
            'warn': 50.0,
            'minor': 90.0,
        },
        'load_5': {
              'warn': 3,
        'inodes_usage_percentage': {
            'warn': 85.0,
            'major': 95.0,
        },
        'rx_packets_dropped_rate': {
              'warn': 100,
        'system_load_threshold': {
            'warn': 1,
            'crit': 2,
        },
        'tx_packets_dropped_rate': {
              'warn': 100,
        'rx_packets_dropped_threshold': {
            'warn': 100,
        },
        'tx_packets_dropped_threshold': {
            'warn': 100,
        },
        'swap_in_rate': {
              'warn': 1024 * 1024,
            'warn': 1024 * 1024,
        },
        'swap_out_rate': {
              'warn': 1024 * 1024,
            'warn': 1024 * 1024,
        },
        'failed_auths_threshold': {
            'warn': 5,
        },
    },
 }, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,176 +1,242 @@
 {%- from "linux/map.jinja" import monitoring with context %}
 server:
  alert:
    SystemCpuIdleTooLow:
      {%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
      if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
    SystemCpuFullWarning:
      {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
      if: >-
        100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
      {% raw %}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Idle CPU usage too low on {{ $labels.host }}'
        description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
    SystemDiskSpaceTooLow:
      if: 'predict_linear(disk_free[1h], 8*3600) < 0'
      {% raw %}
      for: 15m
        summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
        description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for at least 2 minutes."
    SystemLoadTooHighWarning:
      {%- endraw %}
      {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
      if: >-
        system_load5 / system_n_cpus > {{ load_threshold }}
      {%- raw %}
      for: 5m
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
        description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
      {% endraw %}
    SystemFreeOpenFilesTooLow:
      if: 'predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max'
      {% raw %}
        summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
        description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
    SystemLoadTooHighCritical:
      {%- endraw %}
      {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
      if: >-
        system_load5 / system_n_cpus > {{ load_threshold }}
      {%- raw %}
      for: 5m
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
        description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
      {% endraw %}
    SystemDiskErrors:
      if: 'increase(hdd_errors_total[5m]) > 0'
      {% raw %}
        summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
        description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
    SystemDiskFullWarning:
      {%- endraw %}
      {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
      if: >-
        disk_used_percent >= {{ disk_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: critical
        severity: warning
        service: system
      annotations:
        summary: 'Disk {{ $labels.device }} is failing'
        description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
      {% endraw %}
    SystemDiskSpaceFull:
      if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
      {% raw %}
        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
    SystemDiskFullMajor:
      {%- endraw %}
      {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
      if: >-
        disk_used_percent >= {{ disk_threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: critical
        severity: major
        service: system
      annotations:
        summary: 'Disk partition {{ $labels.path }} full on {{ $labels.host }}'
        description: 'The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}.'
      {% endraw %}
    SystemDiskInodesTooLow:
      if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
      {% raw %}
      for: 15m
        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
    SystemDiskInodesFullWarning:
      {%- endraw %}
      {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
      if: >-
        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
        description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
      {% endraw %}
    SystemDiskInodesFull:
      if: 'disk_inodes_used / disk_inodes_total >= 0.99'
      {% raw %}
        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
    SystemDiskInodesFullMajor:
      {%- endraw %}
      {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
      if: >-
        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
      for: 2m
      labels:
        severity: critical
        severity: major
        service: system
      annotations:
        summary: 'Inodes for {{ $labels.path }} full on {{ $labels.host }}'
        description: 'The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}.'
      {% endraw %}
    SystemMemoryAvailableLow:
      {%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
      if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
      {% raw %}
        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
    SystemDiskErrorsTooHigh:
      if: >-
        increase(hdd_errors_total[1m]) > 0
      for: 5m
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Free memory low on {{ $labels.host }}'
        description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
    SystemMemoryAvailableTooLow:
      {%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
      if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
      {% raw %}
        summary: "Disk {{ $labels.device }} is failing"
        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for at least 5 minutes."
    SystemMemoryFullWarning:
      {%- endraw %}
      {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
      if: >-
        mem_used_percent >= {{ mem_threshold }}
      for: 2m
      labels:
        severity: critical
        severity: warning
        service: system
      annotations:
        summary: 'Free memory too low on {{ $labels.host }}'
        description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
    SystemLoad5TooHigh:
      if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
      {% raw %}
        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
    # SystemMemoryFullMajor: alert with severity "major" on the condition
    # mem_used_percent >= monitoring.memory_usage_percentage.major (the
    # threshold is cast to float), with a "for" duration of 2m.
    # The threshold is substituted by Jinja at render time into both the rule
    # expression and the summary. The raw section opened inside the summary
    # keeps the $labels / $value placeholders of the description literal in
    # the rendered output; it is closed by the endraw tag of the next entry.
    SystemMemoryFullMajor:
      {%- endraw %}
      {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
      if: >-
        mem_used_percent >= {{ mem_threshold }}
      for: 2m
      labels:
        severity: major
        service: system
      annotations:
        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
    SystemSwapFullWarning:
      {%- endraw %}
      {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
      if: >-
        swap_used_percent >= {{ swap_threshold }}
      for: 2m
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'High system load (5m) on {{ $labels.host }}'
        description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
    # SystemSwapFullMinor: alert with severity "minor" on the condition
    # swap_used_percent >= monitoring.swap_usage_percentage.minor (the
    # threshold is cast to float), with a "for" duration of 2m.
    # The threshold is substituted by Jinja at render time into both the rule
    # expression and the summary. The raw section opened inside the summary
    # keeps the $labels / $value placeholders of the description literal in
    # the rendered output; it is closed by the endraw tag of the next entry.
    SystemSwapFullMinor:
      {%- endraw %}
      {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
      if: >-
        swap_used_percent >= {{ swap_threshold }}
      for: 2m
      labels:
        severity: minor
        service: system
      annotations:
        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
    SystemRxPacketsDroppedTooHigh:
      {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
      if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
      {% raw %}
      {%- endraw %}
      {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
      if: >-
        increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
      labels:
        severity: critical
        severity: warning
        service: system
      annotations:
        summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
    SystemRxPacketsDroppedLongTermTooHigh:
      if: >-
        increase(net_drop_in[1m]) > 0
      for: 10m
      labels:
        severity: major
        service: system
      annotations:
        summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
        description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
        summary: "Received packets long term dropping"
        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
    SystemTxPacketsDroppedTooHigh:
      {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
      if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
      {% raw %}
      {%- endraw %}
      {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
      if: >-
        increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
        description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
    CronProcessDown:
      if: >-
        procstat_running{process_name="cron"} == 0
      labels:
        severity: critical
        service: system
      annotations:
        summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
        description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
    SystemSwapIn:
      {%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
      if: rate(swap_in[2m]) > {{ swap_in_threshold }}
      {% raw %}
        summary: "Cron process is down"
        description: "The cron process on the {{ $labels.host }} node is down."
    SshdProcessDown:
      if: >-
        procstat_running{process_name="sshd"} == 0
      labels:
        severity: warning
        severity: critical
        service: system
      annotations:
        summary: 'Swap input throughput too high on {{ $labels.host }}'
        description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
    SystemSwapOut:
      {%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
      if: rate(swap_out[2m]) > {{ swap_out_threshold }}
      {% raw %}
        summary: "SSH process is down"
        description: "The SSH process on the {{ $labels.host }} node is down."
    SshFailedLoginsTooHigh:
      {%- endraw %}
      {%- set threshold = monitoring.failed_auths_threshold.warn %}
      if: >-
        increase(failed_logins_total[5m]) > {{ threshold }}
      labels:
        severity: warning
        service: system
      annotations:
        summary: 'Swap output throughput too high on {{ $labels.host }}'
        description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
        summary: "{{ threshold }}{%- raw %} failed SSH logins"
        description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
 {%- endraw %}
 {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
 {%- raw %}
    BondInterfaceDown:
      if: 'bond_status < 1'
      {% raw %}
      if: >-
        bond_status < 1
      labels:
        severity: critical
        service: system
      annotations:
        summary: 'Bond status interface {{ $labels.bond }} is DOWN on {{ $labels.host }}'
        description: 'The bond interface ({{ $labels.bond }) has all ifaces in a down state on {{ $labels.host }}.'
      {% endraw %}
    BondSlaveInterfacesMinimum:
      if: '(sum(bond_slave_status) BY (bond,host)) / (count(bond_slave_status) BY (bond,host)) <= 0.5'
      {% raw %}
        summary: "{{ $labels.bond }} bond interface is down"
        description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
    BondInterfaceSlaveDown:
      if: >-
        bond_slave_status < 1
      labels:
        severity: critical
        severity: warning
        service: system
      annotations:
        summary: 'At least half of Bond slave interfaces {{ $labels.bond }} are DOWN on {{ $labels.host }}'
        description: 'The bond interface ({{ $labels.bond }) has at least half of slave ifaces in a down state on {{ $labels.host }}.'
      {% endraw %}
    BondSlaveInterfaceStatus:
      if: 'bond_slave_status < 1'
      {% raw %}
        summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
        description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
    BondInterfaceSlaveDownMajor:
      if: >-
        sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
      labels:
        severity: warning
        severity: major
        service: system
      annotations:
        summary: 'Bond slave interface {{ $labels.interface }} is DOWN on {{ $labels.host }} for {{ $labels.bond }}'
        description: 'The bond slave interface ({{ $labels.interface }) is in DOWN state for {{ $labels.bond }} on {{ $labels.host }}.'
      {% endraw %}
        summary: "50% of bond interface slaves {{ $labels.bond }} are down"
        description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
 {% endraw %}
 {%- endif %}
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -23,6 +23,12 @@ agent:
    processes:
    swap:
    system:
    # procstat input: one entry per watched process (sshd and cron), each
    # selected through its "exe" key.
    # NOTE(review): presumably this is the source of the procstat_running
    # series queried by the SshdProcessDown / CronProcessDown alert rules in
    # linux/meta/prometheus.yml; confirm that the process_name label values
    # match these entry names.
    procstat:
      process:
        sshd:
          exe: sshd
        cron:
          exe: cron
    linux_sysctl_fs:
 {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
    bond: