瀏覽代碼

Generate metrics from logs

Change-Id: I5a8ccb235d36c1b4115794904f373a5704c2296d
pull/139/head
Bartosz Kupidura 7 年之前
父節點
當前提交
6616077674
共有 2 個檔案被更改，包括 78 行新增和 3 行刪除
  1. +68
    -3
      linux/meta/fluentd.yml
  2. +10
    -0
      linux/meta/prometheus.yml

+ 68
- 3
linux/meta/fluentd.yml 查看文件

@@ -5,6 +5,49 @@ agent:
gem: ['fluent-plugin-systemd']
config:
label:
default_metric:
filter:
# Filter entry of type `prometheus` bound to the tag metric.failed_user.
# Declares one counter, failed_logins_total, and a `host` label whose value
# is the ${Hostname} placeholder.
# NOTE(review): no `key` is given for the counter, so it presumably goes up
# by one per record carrying this tag - confirm against the fluentd formula
# template that renders this entry.
metric_failed_user:
tag: metric.failed_user
type: prometheus
metric:
- name: failed_logins_total
type: counter
desc: The total number of failed logins.
label:
- name: host
value: ${Hostname}
# Filter entry of type `prometheus` bound to the tag metric.out_of_memory.
# Declares one counter, out_of_memory_total, and a `host` label whose value
# is the ${Hostname} placeholder.
# NOTE(review): presumably counts one per tagged record, like the other
# counters in this file - confirm against the rendering template.
metric_out_of_memory:
tag: metric.out_of_memory
type: prometheus
metric:
- name: out_of_memory_total
type: counter
desc: The total number of OOM.
label:
- name: host
value: ${Hostname}
# Filter entry of type `parser` bound to the tag metric.hdd_errors.
# Applies a regexp to the Payload field; the named group `device` captures
# an `sd`/`vd` prefix followed by one or more lowercase letters and optional
# trailing digits (for example sda or vdb1).
# NOTE(review): the pattern is unanchored, so any substring of that shape in
# Payload will be captured - confirm this is acceptable.
metric_hdd_errors_parse:
tag: metric.hdd_errors
type: parser
key_name: Payload
parser:
type: regexp
format: '/(?<device>[sv]d[a-z]+\d*)/'
# Filter entry of type `prometheus` bound to the tag metric.hdd_errors.
# Lists metric_hdd_errors_parse under `require`, the entry whose regexp
# defines the `device` group referenced below as ${device}.
# Declares one counter, hdd_errors_total, with two labels: `host`
# (${Hostname}) and `device` (${device}).
# NOTE(review): `require` presumably orders this entry after the parser in
# the rendered config - confirm against the rendering template.
metric_hdd_errors:
tag: metric.hdd_errors
require:
- metric_hdd_errors_parse
type: prometheus
metric:
- name: hdd_errors_total
type: counter
desc: The total number of hdd errors.
label:
- name: host
value: ${Hostname}
- name: device
value: ${device}
systemd:
input:
systemd:
@@ -37,11 +80,33 @@ agent:
tag: systemd.source
type: rewrite_tag_filter
rule:
- name: service
regexp: '^(.*)\.(.*)$'
- name: ident
regexp: '^(.*)$'
result: __TAG__.$1
# Output entry of type `copy` for records tagged systemd.source.*.
# It lists two stores: a relabel to default_output, and a rewrite_tag_filter
# that re-tags records by matching their Payload field. The result tags
# (metric.failed_user, metric.out_of_memory, metric.hdd_errors) are the tags
# the metric_* filter entries are bound to.
push_to_default:
  tag: 'systemd.source.*'
  type: copy
  store:
    - type: relabel
      label: default_output
    - type: rewrite_tag_filter
      rule:
        - name: Payload
          regexp: '^Invalid user'
          result: metric.failed_user
        - name: Payload
          regexp: '^Out of memory'
          result: metric.out_of_memory
        # The two disk rules cover both word orders: "error ... <device>"
        # and "<device> ... error". They are written as plain single-quoted
        # scalars like the rules above; inside a folded (>-) block scalar
        # the quote characters would become part of the pattern itself.
        - name: Payload
          regexp: 'error.+[sv]d[a-z]+\d*'
          result: metric.hdd_errors
        - name: Payload
          regexp: '[sv]d[a-z]+\d*.+error'
          result: metric.hdd_errors
# Output entry of type `relabel` for every tag under metric.**.
# NOTE(review): two `label` keys follow. The change summary reports 3
# removed lines, so `default_output` is presumably the removed side of the
# diff shown without its marker and `default_metric` the value that remains
# - confirm. If both were really present, most YAML parsers would silently
# keep only the last one.
push_to_metric:
tag: 'metric.**'
type: relabel
label: default_output
label: default_metric
{%- endif %}

+ 10
- 0
linux/meta/prometheus.yml 查看文件

@@ -32,6 +32,16 @@ server:
summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
{% endraw %}
# Alert rule: the condition is true when hdd_errors_total has increased
# over a 5 minute window (increase(...[5m]) > 0).
# Carries the labels severity=critical and service=system; the summary and
# description annotations interpolate the `device` and `host` labels.
# The labels/annotations part sits inside a Jinja raw section, so the
# double-brace placeholders are not evaluated by Jinja.
SystemDiskErrors:
if: 'increase(hdd_errors_total[5m]) > 0'
{% raw %}
labels:
severity: critical
service: system
annotations:
summary: 'Disk {{ $labels.device }} is failing'
description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
{% endraw %}
SystemDiskSpaceFull:
if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
{% raw %}

Loading…
取消
儲存