Procházet zdrojové kódy

Enable nstat input plugin for softnet_stat data

Since the nstat Telegraf plugin can now collect data from
`/proc/net/softnet_stat` about dropped packets and
net_rx_action (a.k.a. time squeeze), we need to enable
it globally on all hosts.

Also update the Grafana dashboard to include new graphs and add four
new Prometheus alerts.

Related-Bug: PROD-21090

Change-Id: I9dfe87bdc8b677a51e3f305dd3c75c7d4cc4e0d4
pull/162/head
Mateusz Matuszkowiak před 6 roky
rodič
revize
ee7c76af8b
4 změnil soubory, kde provedl 612 přidání a 6 odebrání
  1. +556
    -6
      linux/files/grafana_dashboards/system_network_prometheus.json
  2. +8
    -0
      linux/map.jinja
  3. +44
    -0
      linux/meta/prometheus.yml
  4. +4
    -0
      linux/meta/telegraf.yml

+ 556
- 6
linux/files/grafana_dashboards/system_network_prometheus.json Zobrazit soubor

@@ -19,7 +19,7 @@
"gnetId": null,
"graphTooltip": 1,
"id": null,
"iteration": 1529498668709,
"iteration": 1532690906484,
"links": [],
"panels": [
{
@@ -429,13 +429,562 @@
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"decimals": null,
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 15
},
"id": 49,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideZero": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "nstat_time_squeeze{host=~\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{host}} @{{cpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Net RX action@$host per CPU",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": 0,
"format": "none",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": null,
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 15
},
"id": 45,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "nstat_packet_drop{host=~\"$host\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{host}} @{{cpu}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Dropped packets@$host per CPU",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"columns": [],
"datasource": null,
"fontSize": "100%",
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 22
},
"hideTimeOverride": false,
"id": 51,
"links": [],
"pageSize": null,
"scroll": true,
"showHeader": true,
"sort": {
"col": 2,
"desc": true
},
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "Increase since 1h",
"colorMode": "cell",
"colors": [
"transparent",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 0,
"mappingType": 1,
"pattern": "Value #A",
"preserveFormat": false,
"thresholds": [
"2",
"4"
],
"type": "string",
"unit": "none",
"valueMaps": [
{
"text": "-",
"value": "-1"
}
]
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "job",
"thresholds": [],
"type": "hidden",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "instance",
"thresholds": [],
"type": "hidden",
"unit": "short"
},
{
"alias": "Increase since 4h",
"colorMode": "cell",
"colors": [
"transparent",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "Value #B",
"thresholds": [
"8",
"16"
],
"type": "string",
"unit": "short",
"valueMaps": [
{
"text": "-",
"value": "-1"
}
]
},
{
"alias": "Increase since 24h",
"colorMode": "cell",
"colors": [
"transparent",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "Value #C",
"thresholds": [
"50",
"100"
],
"type": "string",
"unit": "short",
"valueMaps": [
{
"text": "-",
"value": "-1"
}
]
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"decimals": 2,
"pattern": "/.*/",
"thresholds": [],
"type": "number",
"unit": "short"
}
],
"targets": [
{
"expr": "floor(increase(nstat_time_squeeze{host=~\"$host\"}[1h])) > 0 or increase(nstat_time_squeeze{host=~\"$host\"}[1h]) - 1",
"format": "table",
"instant": false,
"intervalFactor": 2,
"refId": "A"
},
{
"expr": "floor(increase(nstat_time_squeeze{host=~\"$host\"}[4h])) > 0 or increase(nstat_time_squeeze{host=~\"$host\"}[4h]) - 1",
"format": "table",
"intervalFactor": 1,
"refId": "B"
},
{
"expr": "floor(increase(nstat_time_squeeze{host=~\"$host\"}[24h])) > 0 or increase(nstat_time_squeeze{host=~\"$host\"}[24h]) - 1",
"format": "table",
"intervalFactor": 1,
"refId": "C"
}
],
"timeFrom": "1s",
"title": "Net RX action@$host per CPU - increased",
"transform": "table",
"transparent": false,
"type": "table"
},
{
"columns": [],
"datasource": null,
"fontSize": "100%",
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 22
},
"hideTimeOverride": false,
"id": 47,
"links": [],
"pageSize": null,
"scroll": true,
"showHeader": true,
"sort": {
"col": 2,
"desc": true
},
"styles": [
{
"alias": "Time",
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"pattern": "Time",
"type": "hidden"
},
{
"alias": "Increase since 1h",
"colorMode": "cell",
"colors": [
"transparent",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 0,
"link": false,
"mappingType": 1,
"pattern": "Value #A",
"preserveFormat": false,
"rangeMaps": [
{
"from": "0",
"text": "asas",
"to": "0"
}
],
"thresholds": [
"2",
"4"
],
"type": "string",
"unit": "none",
"valueMaps": [
{
"text": "-",
"value": "-1"
}
]
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "job",
"thresholds": [],
"type": "hidden",
"unit": "short"
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 2,
"mappingType": 1,
"pattern": "instance",
"thresholds": [],
"type": "hidden",
"unit": "short"
},
{
"alias": "Increase since 4h",
"colorMode": "cell",
"colors": [
"transparent",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 0,
"mappingType": 1,
"pattern": "Value #B",
"thresholds": [
"8",
"16"
],
"type": "string",
"unit": "none",
"valueMaps": [
{
"text": "-",
"value": "-1"
}
]
},
{
"alias": "Increase since 24h",
"colorMode": "cell",
"colors": [
"transparent",
"rgba(237, 129, 40, 0.89)",
"rgba(245, 54, 54, 0.9)"
],
"dateFormat": "YYYY-MM-DD HH:mm:ss",
"decimals": 0,
"mappingType": 1,
"pattern": "Value #C",
"thresholds": [
"50",
"100"
],
"type": "string",
"unit": "short",
"valueMaps": [
{
"text": "-",
"value": "-1"
}
]
},
{
"alias": "",
"colorMode": null,
"colors": [
"rgba(245, 54, 54, 0.9)",
"rgba(237, 129, 40, 0.89)",
"rgba(50, 172, 45, 0.97)"
],
"decimals": 2,
"pattern": "/.*/",
"thresholds": [],
"type": "number",
"unit": "short"
}
],
"targets": [
{
"expr": "floor(increase(nstat_packet_drop{host=~\"$host\"}[1h])) > 0 or increase(nstat_packet_drop{host=~\"$host\"}[1h]) - 1",
"format": "table",
"hide": false,
"instant": false,
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
},
{
"expr": "floor(increase(nstat_packet_drop{host=~\"$host\"}[4h])) > 0 or increase(nstat_packet_drop{host=~\"$host\"}[4h]) -1",
"format": "table",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "B"
},
{
"expr": "floor(increase(nstat_packet_drop{host=~\"$host\"}[24h])) > 0 or increase(nstat_packet_drop{host=~\"$host\"}[24h]) -1",
"format": "table",
"hide": false,
"instant": false,
"intervalFactor": 2,
"legendFormat": "",
"refId": "C"
}
],
"timeFrom": "1s",
"title": "Dropped packets@$host per CPU - increased",
"transform": "table",
"transparent": false,
"type": "table"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 15
"y": 28
},
"id": 24,
"panels": [],
@@ -454,7 +1003,7 @@
"h": 7,
"w": 8,
"x": 0,
"y": 16
"y": 29
},
"id": 8,
"legend": {
@@ -568,7 +1117,7 @@
"h": 7,
"w": 8,
"x": 8,
"y": 16
"y": 29
},
"id": 15,
"legend": {
@@ -681,7 +1230,7 @@
"h": 7,
"w": 8,
"x": 16,
"y": 16
"y": 29
},
"id": 13,
"legend": {
@@ -841,6 +1390,7 @@
]
},
"datasource": "prometheus",

"hide": 0,
"includeAll": true,
"label": null,
@@ -933,6 +1483,6 @@
"timezone": "browser",
"title": "System - Networking",
"uid": null,
"version": 22
"version": 1
}
{% endraw %}

+ 8
- 0
linux/map.jinja Zobrazit soubor

@@ -348,5 +348,13 @@
'failed_auths_threshold': {
'warn': 5,
},
'net_rx_action_per_cpu_threshold': {
'warning': '0',
'minor': '100'
},
'packets_dropped_per_cpu_threshold': {
'minor': '0',
'major': '100'
}
},
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}

+ 44
- 0
linux/meta/prometheus.yml Zobrazit soubor

@@ -208,6 +208,50 @@ server:
annotations:
summary: "{{ threshold }}{%- raw %} failed SSH logins"
description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
PacketsDroppedByCpuMinor:
{%- endraw %}
{%- set packets_dropped_minor_threshold = monitoring.packets_dropped_per_cpu_threshold.minor %}
if: >-
floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_minor_threshold }}
labels:
severity: minor
service: system
annotations:
summary: "CPU dropped {{ packets_dropped_minor_threshold }}{%- raw %} packets"
description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
PacketsDroppedByCpuMajor:
{%- endraw %}
{%- set packets_dropped_major_threshold = monitoring.packets_dropped_per_cpu_threshold.major %}
if: >-
floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_major_threshold }}
labels:
severity: major
service: system
annotations:
summary: "CPU dropped {{ packets_dropped_major_threshold }}{%- raw %} packets"
description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
NetRxActionByCpuWarning:
{%- endraw %}
{%- set net_rx_action_warning_threshold = monitoring.net_rx_action_per_cpu_threshold.warning %}
if: >-
floor(increase(nstat_time_squeeze[24h])) > {{ net_rx_action_warning_threshold }}
labels:
severity: warning
service: system
annotations:
summary: "CPU terminated {{ net_rx_action_warning_threshold }}{%- raw %} net_rx_action loops"
description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours."
NetRxActionByCpuMinor:
{%- endraw %}
{%- set net_rx_action_minor_threshold = monitoring.net_rx_action_per_cpu_threshold.minor %}
if: >-
floor(increase(nstat_time_squeeze[24h])) > {{ net_rx_action_minor_threshold }}
labels:
severity: minor
service: system
annotations:
summary: "CPU terminated {{ net_rx_action_minor_threshold }}{%- raw %} net_rx_action loops"
description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours."
{%- endraw %}
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
{%- raw %}

+ 4
- 0
linux/meta/telegraf.yml Zobrazit soubor

@@ -20,6 +20,10 @@ agent:
kernel:
net:
mem:
nstat:
fieldpass:
- packet_drop
- time_squeeze
processes:
swap:
system:

Načítá se…
Zrušit
Uložit