Browse Source

Fix Prometheus alerts on dropped packets

Since metrics on dropped packets are counters, the alerts should use
the rate() function. This change also fixes some inconsistencies in the
alert descriptions.

Change-Id: I9abbc0a49f45ba760836c436a3e7e65aa62f652e
add-del-users-in-grups
Simon Pasquier 7 years ago
parent
commit
db768fb47c
1 changed files with 12 additions and 13 deletions
  1. +12
    -13
      linux/meta/prometheus.yml

+ 12
- 13
linux/meta/prometheus.yml View File

service: system service: system
annotations: annotations:
summary: 'Idle CPU usage too low on {{ $labels.host }}' summary: 'Idle CPU usage too low on {{ $labels.host }}'
description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ cpu_idle_threshold}})'
description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
SystemDiskSpaceTooLow: SystemDiskSpaceTooLow:
if: 'predict_linear(disk_free[1h], 8*3600) < 0' if: 'predict_linear(disk_free[1h], 8*3600) < 0'
{% raw %} {% raw %}
service: system service: system
annotations: annotations:
summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}' summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %} {% endraw %}
SystemDiskInodesTooLow: SystemDiskInodesTooLow:
if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0' if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
service: system service: system
annotations: annotations:
summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}' summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
{% endraw %} {% endraw %}
SystemMemoryAvailableTooLow: SystemMemoryAvailableTooLow:
{%- set mem_avail_threshold = monitoring.free_memory_percentage.warn|float %} {%- set mem_avail_threshold = monitoring.free_memory_percentage.warn|float %}
service: system service: system
annotations: annotations:
summary: 'Free memory too low on {{ $labels.host }}' summary: 'Free memory too low on {{ $labels.host }}'
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})'
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_threshold }}%).'
SystemLoad5TooHigh: SystemLoad5TooHigh:
if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }} if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
{% raw %} {% raw %}
service: system service: system
annotations: annotations:
summary: 'High system load (5m) on {{ $labels.host }}' summary: 'High system load (5m) on {{ $labels.host }}'
description: 'High system load (5m) on node {{ $labels.host }}'
{% endraw %}
description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
SystemRxPacketsDroppedTooHigh: SystemRxPacketsDroppedTooHigh:
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %} {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
if: avg_over_time(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
{% raw %} {% raw %}
labels: labels:
severity: warning severity: warning
service: system service: system
annotations: annotations:
summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}' summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_rx_dropped_threshold }})'
description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
SystemTxPacketsDroppedTooHigh: SystemTxPacketsDroppedTooHigh:
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %} {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
if: avg_over_time(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
{% raw %} {% raw %}
labels: labels:
severity: warning severity: warning
service: system service: system
annotations: annotations:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}' summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $label.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
SystemSwapUsed: SystemSwapUsed:
{%- set swap_used_threshold = monitoring.swap.warn.strip('%')|float %} {%- set swap_used_threshold = monitoring.swap.warn.strip('%')|float %}
if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }} if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
service: system service: system
annotations: annotations:
summary: 'Swap usage too high on {{ $labels.host }}' summary: 'Swap usage too high on {{ $labels.host }}'
description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }})'
description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }}%)'
SystemSwapIn: SystemSwapIn:
{%- set swap_in_threshold = monitoring.swap_in_rate.warn %} {%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
if: rate(swap_in[2m]) > {{ swap_in_threshold }} if: rate(swap_in[2m]) > {{ swap_in_threshold }}
service: system service: system
annotations: annotations:
summary: 'Swap input throughput too high on {{ $labels.host }}' summary: 'Swap input throughput too high on {{ $labels.host }}'
description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }})'
description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
SystemSwapOut: SystemSwapOut:
{%- set swap_out_threshold = monitoring.swap_out_rate.warn %} {%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
if: rate(swap_out[2m]) > {{ swap_out_threshold }} if: rate(swap_out[2m]) > {{ swap_out_threshold }}
service: system service: system
annotations: annotations:
summary: 'Swap output throughput too high on {{ $labels.host }}' summary: 'Swap output throughput too high on {{ $labels.host }}'
description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }})'
description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'

Loading…
Cancel
Save