Browse Source

Add support of mcelog service

Change-Id: I32c83d63e7f359704ab6cc77dec07a1617880fbb
Prod-Related: PROD-20137
atp-proxy
Oleksii Chupryn 6 years ago
parent
commit
144432b0f7
5 changed files with 251 additions and 0 deletions
  1. +12
    -0
      README.rst
  2. +199
    -0
      linux/files/mcelog.conf
  3. +3
    -0
      linux/system/init.sls
  4. +32
    -0
      linux/system/mcelog.sls
  5. +5
    -0
      tests/pillar/system.sls

+ 12
- 0
README.rst View File

@@ -918,6 +918,18 @@ Linux with atop service:
logpath: "/var/log/atop"
outfile: "/var/log/atop/daily.log"

Linux with mcelog service:

.. code-block:: yaml

linux:
system:
mcelog:
enabled: true
logging:
syslog: true
syslog_error: true

RHEL / CentOS
^^^^^^^^^^^^^


+ 199
- 0
linux/files/mcelog.conf View File

@@ -0,0 +1,199 @@
{%- from "linux/map.jinja" import system with context %}
{%- set mcelog = system.mcelog %}
#
# Example config file for mcelog
# mcelog is the user space backend that decodes and processes machine check events
# (CPU hardware errors) reported by the CPU to the kernel.
#

# general format
#optionname = value
# white space is not allowed in value currently, except at the end where it is dropped
#

# In general all command line options that are not commands work here.
# See man mcelog or mcelog --help for a list.
# e.g. to enable the --no-syslog option use
#no-syslog = yes (or no to disable)
# when the option has an argument
#logfile = /tmp/logfile
# below are the options which are not command line options.

# Set CPU type for which mcelog decodes events:
#cpu = type
# For valid values for type please see mcelog --help.
# If this value is set incorrectly the decoded output will be likely incorrect.
# By default, when this parameter is not set, mcelog uses the CPU it is running on.
# On very new kernels the mcelog events reported by the kernel also carry
# the CPU type, which is used too when available and not overridden.

# Enable daemon mode:
#daemon = yes
# By default mcelog just processes the currently pending events and exits.
# In daemon mode it will keep running as a daemon in the background and poll
# the kernel for events and then decode them.

# Filter out known broken events by default.
filter = yes
# Don't log memory errors individually.
# They still get accounted if that is enabled.
#filter-memory-errors = yes

# output in undecoded raw format to be easier machine readable
# (default is decoded).
#raw = yes

# Set CPU MHz to decode uptime from the time stamp counter (output
# unreliable, not needed on new kernels which report the event time
# directly). A lot of systems don't have a linear time stamp clock
# and the output is wrong then.
# Normally mcelog tries to figure out if the TSC is reliable
# and only uses the current frequency then.
# Setting a frequency forces timestamp decoding.
# This setting is obsolete with modern kernels which report the time
# directly.
#cpumhz = 1800.00

# log output options
# Log decoded machine checks in syslog (default stdout or syslog for daemon)
#syslog = yes
# Log decoded machine checks in syslog with error level
#syslog-error = yes
# Never log anything to syslog
#no-syslog = yes
# Append log output to logfile instead of stdout. Only when no syslog logging is active
#logfile = filename

{#- Render only the logging options that are explicitly present in the pillar.
    Boolean pillar values are written as yes/no; logfile is written verbatim. #}
{%- if mcelog.logging is defined %}
{%- set logging = mcelog.logging %}
{%- for pillar_key, option in [('syslog', 'syslog'), ('syslog_error', 'syslog-error'), ('no_syslog', 'no-syslog')] %}
{%- if logging[pillar_key] is defined %}
{{ option }} = {{ 'yes' if logging[pillar_key] else 'no' }}
{%- endif %}
{%- endfor %}
{%- if logging.logfile is defined %}
logfile = {{ logging.logfile }}
{%- endif %}
{%- endif %}
# Use SMBIOS information to decode DIMMs (needs root).
# This function is not recommended to use right now and generally not needed.
# The exception is memdb prepopulation, which is configured separately below.
#dmi = no

# When in daemon mode run as this user after set up.
# Note that the triggers will run as this user too.
# Setting this to non root will mean that triggers cannot take some corrective
# action, like offlining objects.
#run-credentials-user = root

# group to run as daemon with
# default to the group of the run-credentials-user
#run-credentials-group = nobody

[server]
# User allowed to access the client socket.
# When set to * any user matches.
# root is always allowed to access.
# default: root only
client-user = root
# Group allowed to access mcelog.
# When no group is configured any group matches (but the user is still checked).
# When set to * any group matches.
#client-group = root
# Path to the unix socket for client<->server communication.
# When no socket-path is configured the server will not start.
#socket-path = /var/run/mcelog-client
# When mcelog starts it checks if a server is already running. This configures the timeout
# for this check.
#initial-ping-timeout = 2
#
[dimm]
# Is the in-memory DIMM error tracking enabled?
# Only works on supported systems with an integrated memory controller.
# Only takes effect in daemon mode.
dimm-tracking-enabled = yes
# Use DMI information from the BIOS to prepopulate the DIMM database.
# Note this might not work with every BIOS and requires mcelog to run as root.
# The alternative is to let mcelog create DIMM objects on demand.
dmi-prepopulate = yes
#
# Execute these triggers when the rate of corrected or uncorrected
# errors per DIMM exceeds the threshold.
# Note when the hardware does not report DIMMs this might also
# be per channel.
# The default of 10/24h is reasonable for server quality
# DDR3 DIMMs as of 2009/10.
#uc-error-trigger = dimm-error-trigger
uc-error-threshold = 1 / 24h
#ce-error-trigger = dimm-error-trigger
ce-error-threshold = 10 / 24h

[socket]
# Enable memory error accounting per socket.
socket-tracking-enabled = yes

# Threshold and trigger for uncorrected memory errors on a socket.
# mem-uc-error-trigger = socket-memory-error-trigger

mem-uc-error-threshold = 100 / 24h

# Trigger script for corrected memory errors on a socket.
mem-ce-error-trigger = socket-memory-error-trigger

# Threshold on when to trigger a corrected error for the socket.

mem-ce-error-threshold = 100 / 24h

# Log socket error threshold explicitly?
mem-ce-error-log = yes

# Trigger script for uncorrected bus error events.
bus-uc-threshold-trigger = bus-error-trigger

# Trigger script for uncorrected IOMCA errors.
iomca-threshold-trigger = iomca-error-trigger

# Trigger script for other uncategorized errors.
unknown-threshold-trigger = unknown-error-trigger

[cache]
# Processing of cache error thresholds reported by Intel CPUs.
cache-threshold-trigger = cache-error-trigger

# Should cache threshold events be logged explicitly?
cache-threshold-log = yes

[page]
# Memory error accounting per 4K memory page.
# Threshold for the corrected memory errors trigger script.
memory-ce-threshold = 10 / 24h

# Trigger script for corrected errors.
# memory-ce-trigger = page-error-trigger

# Should page threshold events be logged explicitly?
memory-ce-log = yes

# Specify the internal action mcelog takes when a page error threshold is exceeded.
# This is done in addition to executing the trigger script if available.
#   off             - no action
#   account         - only account errors
#   soft            - try to soft-offline page without killing any processes
#                     This requires an up-to-date kernel. Might not be successful.
#   hard            - try to hard-offline page by killing processes
#                     Requires an up-to-date kernel. Might not be successful.
#   soft-then-hard  - first try to soft offline, then try hard offlining
#memory-ce-action = off|account|soft|hard|soft-then-hard
memory-ce-action = soft

[trigger]
# Maximum number of triggers running at the same time.
children-max = 2
# Execute the trigger scripts found in this directory.
directory = /etc/mcelog

+ 3
- 0
linux/system/init.sls View File

@@ -117,3 +117,6 @@ include:
{%- if system.banner is defined %}
- linux.system.banner
{%- endif %}
{%- if system.mcelog is defined %}
- linux.system.mcelog
{%- endif %}

+ 32
- 0
linux/system/mcelog.sls View File

@@ -0,0 +1,32 @@
{%- from "linux/map.jinja" import system with context %}
{%- set mcelog = system.get('mcelog', {}) %}
{%- if system.enabled and mcelog.get('enabled', False) %}

# Install the mcelog package.
mcelog_packages:
  pkg.installed:
  - name: mcelog

# Render the daemon configuration from the Jinja template once the package is present.
mcelog_conf:
  file.managed:
  - name: /etc/mcelog/mcelog.conf
  - source: salt://linux/files/mcelog.conf
  - template: jinja
  - user: root
  - group: root
  - mode: 644
  - require:
    - pkg: mcelog_packages

# Keep the service enabled and running; the watch on the configuration file
# makes Salt act on the service whenever mcelog_conf reports changes.
mce_service:
  service.running:
  - name: mcelog
  - enable: true
  - require:
    - pkg: mcelog_packages
  - watch:
    - file: mcelog_conf

{%- endif %}

+ 5
- 0
tests/pillar/system.sls View File

@@ -369,3 +369,8 @@ linux:
interval: 20
logpath: "/var/mylog/atop"
outfile: "/var/mylog/atop/daily.log"
mcelog:
enabled: true
logging:
syslog: true
syslog_error: true

Loading…
Cancel
Save