|
|
@@ -1,15 +1,15 @@ |
|
|
|
server: |
|
|
|
alert: |
|
|
|
AvgCPUUsageIdle: |
|
|
|
if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ prometheus_server.get('alert', {}).get('AvgCPUUsageIdle', {}).get('var', {}).get('threshold', 10) }} |
|
|
|
{%- set cpu_idle_threshold = prometheus_server.get('alert', {}).get('AvgCPUUsageIdle', {}).get('var', {}).get('threshold', 10) %} |
|
|
|
if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }} |
|
|
|
{% raw %} |
|
|
|
labels: |
|
|
|
severity: warning |
|
|
|
service: system |
|
|
|
annotations: |
|
|
|
summary: 'Average CPU usage (idle) for node {{ $labels.host }} is low' |
|
|
|
description: 'Average CPU usage (idle) for node {{ $labels.host }} is low {{ $value }}' |
|
|
|
{% endraw %} |
|
|
|
summary: 'Idle CPU usage too low on {{ $labels.host }}' |
|
|
|
description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ cpu_idle_threshold }})' |
|
|
|
PredictLinearDiskFree: |
|
|
|
if: 'predict_linear(disk_free[1h], 8*3600) < 0' |
|
|
|
{% raw %} |
|
|
@@ -17,8 +17,8 @@ server: |
|
|
|
severity: warning |
|
|
|
service: system |
|
|
|
annotations: |
|
|
|
summary: 'Disk space ({{ $labels.path }}) is filling on {{ $labels.host }}' |
|
|
|
description: 'Disk space ({{ $labels.path }}) will be full in 8h on {{ $labels.host }}' |
|
|
|
summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}' |
|
|
|
description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}' |
|
|
|
{% endraw %} |
|
|
|
PredictLinearDiskInodesFree: |
|
|
|
if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0' |
|
|
@@ -27,19 +27,19 @@ server: |
|
|
|
severity: warning |
|
|
|
service: system |
|
|
|
annotations: |
|
|
|
summary: 'Disk inodes ({{ $labels.path }}) are filling on {{ $labels.host }}' |
|
|
|
description: 'Disk inodes ({{ $labels.path }}) will be full in 8h on {{ $labels.host }}' |
|
|
|
summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}' |
|
|
|
description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}' |
|
|
|
{% endraw %} |
|
|
|
AvgMemAvailablePercent: |
|
|
|
if: avg_over_time(mem_available_percent[5m]) < {{ prometheus_server.get('alert', {}).get('AvgMemAvailablePercent', {}).get('var', {}).get('threshold', 10) }} |
|
|
|
{%- set mem_avail_threshold = prometheus_server.get('alert', {}).get('AvgMemAvailablePercent', {}).get('var', {}).get('threshold', 10) %} |
|
|
|
if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_threshold }} |
|
|
|
{% raw %} |
|
|
|
labels: |
|
|
|
severity: warning |
|
|
|
service: system |
|
|
|
annotations: |
|
|
|
summary: 'Free memory is low on {{ $labels.host }}' |
|
|
|
description: 'Free memory percent for node {{ $labels.host }} is low {{ $value }}' |
|
|
|
{% endraw %} |
|
|
|
summary: 'Free memory too low on {{ $labels.host }}' |
|
|
|
description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ mem_avail_threshold }})' |
|
|
|
SystemLoad5: |
|
|
|
if: system_load5 / system_n_cpus > {{ prometheus_server.get('alert', {}).get('SystemLoad5', {}).get('var', {}).get('threshold', 3) }} |
|
|
|
{% raw %} |