Browse Source

Add monitoring of the swap usage

This change adds the Telegraf configuration to collect swap metrics and
the associated Prometheus alerts, and adds swap graphs to the Grafana
dashboard.

Change-Id: I3595fd0b8cab06215c620642da69dd29c398396a
pull/114/head
Simon Pasquier 7 years ago
parent
commit
9083abf8a3
3 changed files with 313 additions and 5 deletions
  1. +282
    -5
      linux/files/grafana_dashboards/system_prometheus.json
  2. +30
    -0
      linux/meta/prometheus.yml
  3. +1
    -0
      linux/meta/telegraf.yml

+ 282
- 5
linux/files/grafana_dashboards/system_prometheus.json View File

@@ -98,6 +98,7 @@
"dashes": false,
"datasource": null,
"fill": 1,
"height": "",
"id": 1,
"legend": {
"avg": false,
@@ -352,7 +353,19 @@
"show": true
}
]
},
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "General",
"titleSize": "h6"
},
{
"collapse": false,
"height": 250,
"panels": [
{
"aliasColors": {},
"bars": false,
@@ -652,7 +665,7 @@
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "General",
"title": "Processes",
"titleSize": "h6"
},
{
@@ -675,7 +688,7 @@
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"thresholdMarkers": false
},
"hideTimeOverride": false,
"id": 11,
@@ -753,7 +766,7 @@
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
"thresholdMarkers": false
},
"hideTimeOverride": false,
"id": 12,
@@ -1535,9 +1548,269 @@
"showTitle": true,
"title": "Network",
"titleSize": "h6"
},
{
"collapse": false,
"height": 250,
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": false,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"datasource": null,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": false
},
"id": 18,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "swap_used_percent{host=\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
"step": 60
}
],
"thresholds": "",
"title": "Used",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"decimals": null,
"fill": 0,
"id": 17,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 5,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "swap_used{host=\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "used",
"refId": "A",
"step": 10
},
{
"expr": "swap_free{host=\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "free",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Usage",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 0,
"id": 19,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"span": 5,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "irate(swap_in{host=\"$host\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "in",
"refId": "A",
"step": 10
},
{
"expr": "irate(swap_out{host=\"$host\"}[5m])",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out",
"refId": "B",
"step": 10
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "I/O",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Swap",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"sharedCrosshair": true,
"style": "dark",
"tags": [],
"templating": {
@@ -1554,6 +1827,7 @@
"options": [],
"query": "label_values(cpu_usage_idle,host)",
"refresh": 1,
"refresh_on_load": true,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
@@ -1574,6 +1848,7 @@
"options": [],
"query": "query_result(diskio_read_bytes{host=\"$host\"})",
"refresh": 1,
"refresh_on_load": true,
"regex": "/name=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1594,6 +1869,7 @@
"options": [],
"query": "query_result(disk_free{host=\"$host\"})",
"refresh": 1,
"refresh_on_load": true,
"regex": "/path=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1614,6 +1890,7 @@
"options": [],
"query": "query_result(net_bytes_recv{host=\"$host\"})",
"refresh": 1,
"refresh_on_load": true,
"regex": "/interface=\"([^\"]+)/",
"sort": 1,
"tagValuesQuery": "",
@@ -1655,5 +1932,5 @@
},
"timezone": "browser",
"title": "System",
"version": 31
"version": 32
}

+ 30
- 0
linux/meta/prometheus.yml View File

@@ -70,3 +70,33 @@ server:
annotations:
summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
description: 'The average number of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}, threshold={% endraw %}{{ net_tx_dropped_threshold }})'
SystemSwapUsed:
  # Warn when the 1-minute average of swap_used_percent goes above the
  # threshold. The threshold defaults to 80 and can be overridden through
  # the alert / SystemSwapUsed / var / threshold key of prometheus_server.
  {%- set swap_used_threshold = prometheus_server.get('alert', {}).get('SystemSwapUsed', {}).get('var', {}).get('threshold', 80) %}
  if: avg_over_time(swap_used_percent[1m]) > {{ swap_used_threshold }}
  {% raw %}
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Swap usage too high on {{ $labels.host }}'
    # The raw section is closed right before the threshold so that Jinja
    # renders it while the Prometheus placeholders are left untouched.
    # Both the current value and the threshold are shown as percentages.
    description: 'The average percentage of used swap is too high on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ swap_used_threshold }}%)'
SystemSwapIn:
  # Warn when the per-second rate of swap_in, computed over 2 minutes,
  # goes above the threshold. The threshold defaults to 1024 * 1024 and
  # can be overridden through the alert / SystemSwapIn / var / threshold
  # key of prometheus_server.
  {%- set swap_in_threshold = prometheus_server.get('alert', {}).get('SystemSwapIn', {}).get('var', {}).get('threshold', 1024 * 1024) %}
  if: rate(swap_in[2m]) > {{ swap_in_threshold }}
  {% raw %}
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Swap input throughput too high on {{ $labels.host }}'
    # The raw section is closed right before the threshold so that Jinja
    # renders it while the Prometheus placeholders are left untouched.
    # The message talks about bytes, hence the B/s unit on both values.
    description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}B/s, threshold={% endraw %}{{ swap_in_threshold }}B/s)'
SystemSwapOut:
  # Warn when the per-second rate of swap_out, computed over 2 minutes,
  # goes above the threshold. The threshold defaults to 1024 * 1024 and
  # can be overridden through the alert / SystemSwapOut / var / threshold
  # key of prometheus_server.
  {%- set swap_out_threshold = prometheus_server.get('alert', {}).get('SystemSwapOut', {}).get('var', {}).get('threshold', 1024 * 1024) %}
  if: rate(swap_out[2m]) > {{ swap_out_threshold }}
  {% raw %}
  labels:
    severity: warning
    service: system
  annotations:
    summary: 'Swap output throughput too high on {{ $labels.host }}'
    # The raw section is closed right before the threshold so that Jinja
    # renders it while the Prometheus placeholders are left untouched.
    # The message talks about bytes, hence the B/s unit on both values.
    description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}B/s, threshold={% endraw %}{{ swap_out_threshold }}B/s)'

+ 1
- 0
linux/meta/telegraf.yml View File

@@ -9,4 +9,5 @@ agent:
net:
mem:
processes:
swap:
system:

Loading…
Cancel
Save