Browse Source

Fix linux/meta/prometheus.yml for the CI

Change-Id: Idc73c152a0e71d5ac2a8c10f46c955755d8e77ae
add-del-users-in-grups
Olivier Bourdon 7 years ago
parent
commit
0723131ffd
2 changed files with 30 additions and 8 deletions
  1. linux/map.jinja (+21, -0)
  2. linux/meta/prometheus.yml (+9, -8)

+ 21
- 0
linux/map.jinja View File

'warn': '15%', 'warn': '15%',
'crit': '5%', 'crit': '5%',
}, },
'cpu_idle_percentage': {
'warn': 10.0,
},
'free_memory_percentage': {
'warn': 10.0,
},
'load_5': {
'warn': 3,
},
'rx_packets_dropped_rate': {
'warn': 100,
},
'tx_packets_dropped_rate': {
'warn': 100,
},
'swap_in_rate': {
'warn': 1024 * 1024,
},
'swap_out_rate': {
'warn': 1024 * 1024,
},
}, },
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %} }, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}

+ 9
- 8
linux/meta/prometheus.yml View File

{%- from "linux/map.jinja" import monitoring with context %}
server: server:
alert: alert:
AvgCPUUsageIdle: AvgCPUUsageIdle:
{%- set cpu_idle_threshold = prometheus_server.get('alert', {}).get('AvgCPUUsageIdle', {}).get('var', {}).get('threshold', 10) %}
{%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }} if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
{% raw %} {% raw %}
labels: labels:
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}' description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
{% endraw %} {% endraw %}
AvgMemAvailablePercent: AvgMemAvailablePercent:
{%- set mem_avail_threshold = prometheus_server.get('alert', {}).get('AvgMemAvailablePercent', {}).get('var', {}).get('threshold', 10) %}
{%- set mem_avail_threshold = monitoring.free_memory_percentage.warn|float %}
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_threshold }} if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_threshold }}
{% raw %} {% raw %}
labels: labels:
summary: 'Free memory too low on {{ $labels.host }}' summary: 'Free memory too low on {{ $labels.host }}'
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})' description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})'
SystemLoad5: SystemLoad5:
if: system_load5 / system_n_cpus > {{ prometheus_server.get('alert', {}).get('SystemLoad5', {}).get('var', {}).get('threshold', 3) }}
if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
{% raw %} {% raw %}
labels: labels:
severity: warning severity: warning
description: 'High system load (5m) on node {{ $labels.host }}' description: 'High system load (5m) on node {{ $labels.host }}'
{% endraw %} {% endraw %}
NetworkRxPacketsDropped: NetworkRxPacketsDropped:
{%- set net_rx_dropped_threshold = prometheus_server.get('alert', {}).get('NetworkRxPacketsDropped', {}).get('var', {}).get('threshold', 100) %}
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
if: avg_over_time(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} if: avg_over_time(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
{% raw %} {% raw %}
labels: labels:
summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}' summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_rx_dropped_threshold }})' description: 'The average number of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_rx_dropped_threshold }})'
NetworkTxPacketsDropped: NetworkTxPacketsDropped:
{%- set net_tx_dropped_threshold = prometheus_server.get('alert', {}).get('NetworkTxPacketsDropped', {}).get('var', {}).get('threshold', 100) %}
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
if: avg_over_time(net_drop_out[1m]) > {{ net_tx_dropped_threshold }} if: avg_over_time(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
{% raw %} {% raw %}
labels: labels:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}' summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})' description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
SystemSwapUsed: SystemSwapUsed:
{%- set swap_used_threshold = prometheus_server.get('alert', {}).get('SystemSwapUsed', {}).get('var', {}).get('threshold', 80) %}
{%- set swap_used_threshold = monitoring.swap.warn.strip('%')|float %}
if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }} if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
{% raw %} {% raw %}
labels: labels:
summary: 'Swap usage too high on {{ $labels.host }}' summary: 'Swap usage too high on {{ $labels.host }}'
description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }})' description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }})'
SystemSwapIn: SystemSwapIn:
{%- set swap_in_threshold = prometheus_server.get('alert', {}).get('SystemSwapIn', {}).get('var', {}).get('threshold', 1024 * 1024) %}
{%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
if: rate(swap_in[2m]) > {{ swap_in_threshold }} if: rate(swap_in[2m]) > {{ swap_in_threshold }}
{% raw %} {% raw %}
labels: labels:
summary: 'Swap input throughput too high on {{ $labels.host }}' summary: 'Swap input throughput too high on {{ $labels.host }}'
description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }})' description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }})'
SystemSwapOut: SystemSwapOut:
{%- set swap_out_threshold = prometheus_server.get('alert', {}).get('SystemSwapOut', {}).get('var', {}).get('threshold', 1024 * 1024) %}
{%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
if: rate(swap_out[2m]) > {{ swap_out_threshold }} if: rate(swap_out[2m]) > {{ swap_out_threshold }}
{% raw %} {% raw %}
labels: labels:

Loading…
Cancel
Save