Преглед изворни кода

Alerts reworked

Change alert names, severities, and descriptions.

Closes-bug: PROD-19718
Change-Id: I238fbcd51cf48389b504ccb531ba9b2bc9dd4be6
atp-proxy
Michal Kobus пре 6 година
родитељ
комит
d40d0f1e24
3 измењених фајлова са 210 додато и 122 уклоњено
  1. +31
    -15
      linux/map.jinja
  2. +173
    -107
      linux/meta/prometheus.yml
  3. +6
    -0
      linux/meta/telegraf.yml

+ 31
- 15
linux/map.jinja Прегледај датотеку

@@ -273,6 +273,9 @@

{% set monitoring = salt['grains.filter_by']({
'default': {
'bond_status': {
'interfaces': False
},
'zombie': {
'warn': 3,
'crit': 7,
@@ -298,30 +301,43 @@
'interface_regex': '^[a-z0-9]+$',
'ignore_selected': False,
},
'bond_status': {
'interfaces': False
'cpu_usage_percentage': {
'warn': 90.0,
},
'memory_usage_percentage': {
'warn': 90.0,
'major': 95.0,
},
'cpu_idle_percentage': {
'warn': 10.0,
'disk_usage_percentage': {
'warn': 85.0,
'major': 95.0,
},
'free_memory_percentage': {
'warn': 10.0,
'crit': 5.0,
'swap_usage_percentage': {
'warn': 50.0,
'minor': 90.0,
},
'load_5': {
'warn': 3,
'inodes_usage_percentage': {
'warn': 85.0,
'major': 95.0,
},
'rx_packets_dropped_rate': {
'warn': 100,
'system_load_threshold': {
'warn': 1,
'crit': 2,
},
'tx_packets_dropped_rate': {
'warn': 100,
'rx_packets_dropped_threshold': {
'warn': 100,
},
'tx_packets_dropped_threshold': {
'warn': 100,
},
'swap_in_rate': {
'warn': 1024 * 1024,
'warn': 1024 * 1024,
},
'swap_out_rate': {
'warn': 1024 * 1024,
'warn': 1024 * 1024,
},
'failed_auths_threshold': {
'warn': 5,
},
},
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}

+ 173
- 107
linux/meta/prometheus.yml Прегледај датотеку

@@ -1,176 +1,242 @@
{%- from "linux/map.jinja" import monitoring with context %}
server:
alert:
SystemCpuIdleTooLow:
{%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
SystemCpuFullWarning:
{%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
if: >-
100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
{% raw %}
for: 2m
labels:
severity: warning
service: system
annotations:
summary: 'Idle CPU usage too low on {{ $labels.host }}'
description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
SystemDiskSpaceTooLow:
if: 'predict_linear(disk_free[1h], 8*3600) < 0'
{% raw %}
for: 15m
summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for at least 2 minutes."
SystemLoadTooHighWarning:
{%- endraw %}
{%- set load_threshold = monitoring.system_load_threshold.warn|float %}
if: >-
system_load5 / system_n_cpus > {{ load_threshold }}
{%- raw %}
for: 5m
labels:
severity: warning
service: system
annotations:
summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %}
SystemFreeOpenFilesTooLow:
if: 'predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max'
{% raw %}
summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
SystemLoadTooHighCritical:
{%- endraw %}
{%- set load_threshold = monitoring.system_load_threshold.crit|float %}
if: >-
system_load5 / system_n_cpus > {{ load_threshold }}
{%- raw %}
for: 5m
labels:
severity: warning
service: system
annotations:
summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
{% endraw %}
SystemDiskErrors:
if: 'increase(hdd_errors_total[5m]) > 0'
{% raw %}
summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
SystemDiskFullWarning:
{%- endraw %}
{%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
if: >-
disk_used_percent >= {{ disk_threshold }}
{%- raw %}
for: 2m
labels:
severity: critical
severity: warning
service: system
annotations:
summary: 'Disk {{ $labels.device }} is failing'
description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
{% endraw %}
SystemDiskSpaceFull:
if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
{% raw %}
summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
SystemDiskFullMajor:
{%- endraw %}
{%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
if: >-
disk_used_percent >= {{ disk_threshold }}
{%- raw %}
for: 2m
labels:
severity: critical
severity: major
service: system
annotations:
summary: 'Disk partition {{ $labels.path }} full on {{ $labels.host }}'
description: 'The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}.'
{% endraw %}
SystemDiskInodesTooLow:
if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
{% raw %}
for: 15m
summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
SystemDiskInodesFullWarning:
{%- endraw %}
{%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
if: >-
100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
for: 2m
labels:
severity: warning
service: system
annotations:
summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %}
SystemDiskInodesFull:
if: 'disk_inodes_used / disk_inodes_total >= 0.99'
{% raw %}
summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
SystemDiskInodesFullMajor:
{%- endraw %}
{%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
if: >-
100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
for: 2m
labels:
severity: critical
severity: major
service: system
annotations:
summary: 'Inodes for {{ $labels.path }} full on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}.'
{% endraw %}
SystemMemoryAvailableLow:
{%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
{% raw %}
summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
SystemDiskErrorsTooHigh:
if: >-
increase(hdd_errors_total[1m]) > 0
for: 5m
labels:
severity: warning
service: system
annotations:
summary: 'Free memory low on {{ $labels.host }}'
description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
SystemMemoryAvailableTooLow:
{%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
{% raw %}
summary: "Disk {{ $labels.device }} is failing"
description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for at least 5 minutes."
SystemMemoryFullWarning:
{%- endraw %}
{%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
if: >-
mem_used_percent >= {{ mem_threshold }}
for: 2m
labels:
severity: critical
severity: warning
service: system
annotations:
summary: 'Free memory too low on {{ $labels.host }}'
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
SystemLoad5TooHigh:
if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
{% raw %}
summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
SystemMemoryFullMajor:
{%- endraw %}
{%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
if: >-
mem_used_percent >= {{ mem_threshold }}
for: 2m
labels:
severity: major
service: system
annotations:
summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
SystemSwapFullWarning:
{%- endraw %}
{%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
if: >-
swap_used_percent >= {{ swap_threshold }}
for: 2m
labels:
severity: warning
service: system
annotations:
summary: 'High system load (5m) on {{ $labels.host }}'
description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
SystemSwapFullMinor:
{%- endraw %}
{%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
if: >-
swap_used_percent >= {{ swap_threshold }}
for: 2m
labels:
severity: minor
service: system
annotations:
summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
SystemRxPacketsDroppedTooHigh:
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
{% raw %}
{%- endraw %}
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
if: >-
increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
labels:
severity: critical
severity: warning
service: system
annotations:
summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
SystemRxPacketsDroppedLongTermTooHigh:
if: >-
increase(net_drop_in[1m]) > 0
for: 10m
labels:
severity: major
service: system
annotations:
summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
summary: "Received packets long term dropping"
description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
SystemTxPacketsDroppedTooHigh:
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
{% raw %}
{%- endraw %}
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
if: >-
increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
labels:
severity: warning
service: system
annotations:
summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
CronProcessDown:
if: >-
procstat_running{process_name="cron"} == 0
labels:
severity: critical
service: system
annotations:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
SystemSwapIn:
{%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
if: rate(swap_in[2m]) > {{ swap_in_threshold }}
{% raw %}
summary: "Cron process is down"
description: "The cron process on the {{ $labels.host }} node is down."
SshdProcessDown:
if: >-
procstat_running{process_name="sshd"} == 0
labels:
severity: warning
severity: critical
service: system
annotations:
summary: 'Swap input throughput too high on {{ $labels.host }}'
description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
SystemSwapOut:
{%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
if: rate(swap_out[2m]) > {{ swap_out_threshold }}
{% raw %}
summary: "SSH process is down"
description: "The SSH process on the {{ $labels.host }} node is down."
SshFailedLoginsTooHigh:
{%- endraw %}
{%- set threshold = monitoring.failed_auths_threshold.warn %}
if: >-
increase(failed_logins_total[5m]) > {{ threshold }}
labels:
severity: warning
service: system
annotations:
summary: 'Swap output throughput too high on {{ $labels.host }}'
description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
summary: "{{ threshold }}{%- raw %} failed SSH logins"
description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
{%- endraw %}
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
{%- raw %}
BondInterfaceDown:
if: 'bond_status < 1'
{% raw %}
if: >-
bond_status < 1
labels:
severity: critical
service: system
annotations:
summary: 'Bond status interface {{ $labels.bond }} is DOWN on {{ $labels.host }}'
description: 'The bond interface ({{ $labels.bond }) has all ifaces in a down state on {{ $labels.host }}.'
{% endraw %}
BondSlaveInterfacesMinimum:
if: '(sum(bond_slave_status) BY (bond,host)) / (count(bond_slave_status) BY (bond,host)) <= 0.5'
{% raw %}
summary: "{{ $labels.bond }} bond interface is down"
description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
BondInterfaceSlaveDown:
if: >-
bond_slave_status < 1
labels:
severity: critical
severity: warning
service: system
annotations:
summary: 'At least half of Bond slave interfaces {{ $labels.bond }} are DOWN on {{ $labels.host }}'
description: 'The bond interface ({{ $labels.bond }) has at least half of slave ifaces in a down state on {{ $labels.host }}.'
{% endraw %}
BondSlaveInterfaceStatus:
if: 'bond_slave_status < 1'
{% raw %}
summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
BondInterfaceSlaveDownMajor:
if: >-
sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
labels:
severity: warning
severity: major
service: system
annotations:
summary: 'Bond slave interface {{ $labels.interface }} is DOWN on {{ $labels.host }} for {{ $labels.bond }}'
description: 'The bond slave interface ({{ $labels.interface }) is in DOWN state for {{ $labels.bond }} on {{ $labels.host }}.'
{% endraw %}
summary: "50% of bond interface slaves {{ $labels.bond }} are down"
description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
{% endraw %}
{%- endif %}

+ 6
- 0
linux/meta/telegraf.yml Прегледај датотеку

@@ -23,6 +23,12 @@ agent:
processes:
swap:
system:
procstat:
process:
sshd:
exe: sshd
cron:
exe: cron
linux_sysctl_fs:
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
bond:

Loading…
Откажи
Сачувај