Sfoglia il codice sorgente

Add monitoring of the swap usage

This change adds the Telegraf configuration to collect swap metrics, the
associated Prometheus alarms and graphs to the Grafana dashboard.

Change-Id: I3595fd0b8cab06215c620642da69dd29c398396a
pull/114/head
Simon Pasquier 7 anni fa
parent
commit
9083abf8a3
3 ha cambiato i file con 313 aggiunte e 5 eliminazioni
  1. +282
    -5
      linux/files/grafana_dashboards/system_prometheus.json
  2. +30
    -0
      linux/meta/prometheus.yml
  3. +1
    -0
      linux/meta/telegraf.yml

+ 282
- 5
linux/files/grafana_dashboards/system_prometheus.json Vedi File

@@ -98,6 +98,7 @@
"dashes": false,
"datasource": null,
"fill": 1,
"height": "",
"id": 1,
"legend": {
"avg": false,
@@ -352,7 +353,19 @@
"show": true
}
]
},
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "General",
"titleSize": "h6"
},
{
"collapse": false,
"height": 250,
"panels": [
{
"aliasColors": {},
"bars": false,
@@ -652,7 +665,7 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "General",
"title": "Processes",
"titleSize": "h6"
},
{
@@ -675,7 +688,7 @@
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"thresholdMarkers": false
},
"hideTimeOverride": false,
"id": 11,
@@ -753,7 +766,7 @@
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"thresholdMarkers": false
},
"hideTimeOverride": false,
"id": 12,
@@ -1535,9 +1548,269 @@
"showTitle": true,
"title": "Network",
"titleSize": "h6"
},
{
"collapse": false,
"height": 250,
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": null,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": false
},
"id": 18,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "swap_used_percent{host=\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
"step": 60
}
],
"thresholds": "",
"title": "Used",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"decimals": null,
"fill": 0,
"id": 17,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 5,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "swap_used{host=\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "used",
"refId": "A",
"step": 10
},
{
"expr": "swap_free{host=\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "free",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 0,
"id": 19,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 5,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "irate(swap_in{host=\"$host\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "in",
"refId": "A",
"step": 10
},
{
"expr": "irate(swap_out{host=\"$host\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "I/O",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Swap",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
@@ -1554,6 +1827,7 @@
"options": [],
"query": "label_values(cpu_usage_idle,host)",
"refresh": 1,
"refresh_on_load": true,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
@@ -1574,6 +1848,7 @@
"options": [],
"query": "query_result(diskio_read_bytes{host=\"$host\"})",
"refresh": 1,
"refresh_on_load": true,
"regex": "/name=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1594,6 +1869,7 @@
"options": [],
"query": "query_result(disk_free{host=\"$host\"})",
"refresh": 1,
"refresh_on_load": true,
"regex": "/path=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1614,6 +1890,7 @@
"options": [],
"query": "query_result(net_bytes_recv{host=\"$host\"})",
"refresh": 1,
"refresh_on_load": true,
"regex": "/interface=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1655,5 +1932,5 @@
},
"timezone": "browser",
"title": "System",
"version": 31
"version": 32
}

+ 30
- 0
linux/meta/prometheus.yml Vedi File

@@ -70,3 +70,33 @@ server:
annotations:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
# Warn when the 1-minute average of swap_used_percent exceeds the threshold.
# The threshold is a percentage taken from the SystemSwapUsed alert's
# var.threshold setting in prometheus_server, defaulting to 80.
SystemSwapUsed:
  {%- set swap_used_threshold = prometheus_server.get('alert', {}).get('SystemSwapUsed', {}).get('var', {}).get('threshold', 80) %}
  if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
  {% raw %}
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Swap usage too high on {{ $labels.host }}'
    # The threshold is rendered by Jinja (outside the raw section) and carries
    # the same percent unit as the current value.
    description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }}%)'
# Warn when the per-second rate of swap_in over the last 2 minutes exceeds the
# threshold. The threshold is taken from the SystemSwapIn alert's
# var.threshold setting in prometheus_server, defaulting to 1024 * 1024.
SystemSwapIn:
  {%- set swap_in_threshold = prometheus_server.get('alert', {}).get('SystemSwapIn', {}).get('var', {}).get('threshold', 1024 * 1024) %}
  if: rate(swap_in[2m]) > {{ swap_in_threshold }}
  {% raw %}
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Swap input throughput too high on {{ $labels.host }}'
    # The rate is described as bytes, so the unit is written "B/s" (a lowercase
    # "b/s" reads as bits per second); the threshold carries the same unit.
    description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}B/s, threshold={% endraw %}{{ swap_in_threshold }}B/s)'
# Warn when the per-second rate of swap_out over the last 2 minutes exceeds the
# threshold. The threshold is taken from the SystemSwapOut alert's
# var.threshold setting in prometheus_server, defaulting to 1024 * 1024.
SystemSwapOut:
  {%- set swap_out_threshold = prometheus_server.get('alert', {}).get('SystemSwapOut', {}).get('var', {}).get('threshold', 1024 * 1024) %}
  if: rate(swap_out[2m]) > {{ swap_out_threshold }}
  {% raw %}
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Swap output throughput too high on {{ $labels.host }}'
    # The rate is described as bytes, so the unit is written "B/s" (a lowercase
    # "b/s" reads as bits per second); the threshold carries the same unit.
    description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}B/s, threshold={% endraw %}{{ swap_out_threshold }}B/s)'

+ 1
- 0
linux/meta/telegraf.yml Vedi File

@@ -9,4 +9,5 @@ agent:
net:
mem:
processes:
swap:
system:

Loading…
Annulla
Salva