소스 검색

Fix linux/meta/prometheus.yml for the CI

Change-Id: Idc73c152a0e71d5ac2a8c10f46c955755d8e77ae
add-del-users-in-grups
Olivier Bourdon 7 년 전
부모
커밋
0723131ffd
2개의 변경된 파일, 30개의 추가 및 8개의 삭제
  1. +21
    -0
      linux/map.jinja
  2. +9
    -8
      linux/meta/prometheus.yml

+ 21
- 0
linux/map.jinja 파일 보기

@@ -212,5 +212,26 @@
'warn': '15%',
'crit': '5%',
},
'cpu_idle_percentage': {
'warn': 10.0,
},
'free_memory_percentage': {
'warn': 10.0,
},
'load_5': {
'warn': 3,
},
'rx_packets_dropped_rate': {
'warn': 100,
},
'tx_packets_dropped_rate': {
'warn': 100,
},
'swap_in_rate': {
'warn': 1024 * 1024,
},
'swap_out_rate': {
'warn': 1024 * 1024,
},
},
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}

+ 9
- 8
linux/meta/prometheus.yml 파일 보기

@@ -1,7 +1,8 @@
{%- from "linux/map.jinja" import monitoring with context %}
server:
alert:
AvgCPUUsageIdle:
{%- set cpu_idle_threshold = prometheus_server.get('alert', {}).get('AvgCPUUsageIdle', {}).get('var', {}).get('threshold', 10) %}
{%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
{% raw %}
labels:
@@ -31,7 +32,7 @@ server:
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}'
{% endraw %}
AvgMemAvailablePercent:
{%- set mem_avail_threshold = prometheus_server.get('alert', {}).get('AvgMemAvailablePercent', {}).get('var', {}).get('threshold', 10) %}
{%- set mem_avail_threshold = monitoring.free_memory_percentage.warn|float %}
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_threshold }}
{% raw %}
labels:
@@ -41,7 +42,7 @@ server:
summary: 'Free memory too low on {{ $labels.host }}'
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})'
SystemLoad5:
if: system_load5 / system_n_cpus > {{ prometheus_server.get('alert', {}).get('SystemLoad5', {}).get('var', {}).get('threshold', 3) }}
if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
{% raw %}
labels:
severity: warning
@@ -51,7 +52,7 @@ server:
description: 'High system load (5m) on node {{ $labels.host }}'
{% endraw %}
NetworkRxPacketsDropped:
{%- set net_rx_dropped_threshold = prometheus_server.get('alert', {}).get('NetworkRxPacketsDropped', {}).get('var', {}).get('threshold', 100) %}
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
if: avg_over_time(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
{% raw %}
labels:
@@ -61,7 +62,7 @@ server:
summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
# The alert's label set is referenced as $labels (as for .host below); $label is not a defined template variable.
description: 'The average number of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_rx_dropped_threshold }})'
NetworkTxPacketsDropped:
{%- set net_tx_dropped_threshold = prometheus_server.get('alert', {}).get('NetworkTxPacketsDropped', {}).get('var', {}).get('threshold', 100) %}
{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
if: avg_over_time(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
{% raw %}
labels:
@@ -71,7 +72,7 @@ server:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
# The alert's label set is referenced as $labels (as for .host below); $label is not a defined template variable.
description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
SystemSwapUsed:
{%- set swap_used_threshold = prometheus_server.get('alert', {}).get('SystemSwapUsed', {}).get('var', {}).get('threshold', 80) %}
{%- set swap_used_threshold = monitoring.swap.warn.strip('%')|float %}
if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
{% raw %}
labels:
@@ -81,7 +82,7 @@ server:
summary: 'Swap usage too high on {{ $labels.host }}'
description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }})'
SystemSwapIn:
{%- set swap_in_threshold = prometheus_server.get('alert', {}).get('SystemSwapIn', {}).get('var', {}).get('threshold', 1024 * 1024) %}
{%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
if: rate(swap_in[2m]) > {{ swap_in_threshold }}
{% raw %}
labels:
@@ -91,7 +92,7 @@ server:
summary: 'Swap input throughput too high on {{ $labels.host }}'
description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }})'
SystemSwapOut:
{%- set swap_out_threshold = prometheus_server.get('alert', {}).get('SystemSwapOut', {}).get('var', {}).get('threshold', 1024 * 1024) %}
{%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
if: rate(swap_out[2m]) > {{ swap_out_threshold }}
{% raw %}
labels:

Loading…
취소
저장