|
|
|
|
|
|
|
|
|
|
|
{%- from "linux/map.jinja" import system with context %} |
|
|
|
|
|
{%- set mcelog = system.mcelog %} |
|
|
|
|
|
# |
|
|
|
|
|
# Example config file for mcelog |
|
|
|
|
|
# mcelog is the user space backend that decodes and process machine check events |
|
|
|
|
|
# (cpu hardware errors) reported by the CPU to the kernel |
|
|
|
|
|
# |
|
|
|
|
|
|
|
|
|
|
|
# general format |
|
|
|
|
|
#optionname = value |
|
|
|
|
|
# white space is not allowed in value currently, except at the end where it is dropped |
|
|
|
|
|
# |
|
|
|
|
|
|
|
|
|
|
|
# In general all command line options that are not commands work here. |
|
|
|
|
|
# See man mcelog or mcelog --help for a list. |
|
|
|
|
|
# e.g. to enable the --no-syslog option use |
|
|
|
|
|
#no-syslog = yes (or no to disable) |
|
|
|
|
|
# when the option has a argument |
|
|
|
|
|
#logfile = /tmp/logfile |
|
|
|
|
|
# below are the options which are not command line options. |
|
|
|
|
|
|
|
|
|
|
|
# Set CPU type for which mcelog decodes events: |
|
|
|
|
|
#cpu = type |
|
|
|
|
|
# For valid values for type please see mcelog --help. |
|
|
|
|
|
# If this value is set incorrectly the decoded output will be likely incorrect. |
|
|
|
|
|
# By default when this parameter is not set mcelog uses the CPU it is running on |
|
|
|
|
|
# on very new kernels the mcelog events reported by the kernel also carry |
|
|
|
|
|
# the CPU type which is used too when available and not overriden. |
|
|
|
|
|
|
|
|
|
|
|
# Enable daemon mode: |
|
|
|
|
|
#daemon = yes |
|
|
|
|
|
# By default mcelog just processes the currently pending events and exits. |
|
|
|
|
|
# In daemon mode it will keep running as a daemon in the background and poll |
|
|
|
|
|
# the kernel for events and then decode them. |
|
|
|
|
|
|
|
|
|
|
|
# Filter out known broken events by default. |
|
|
|
|
|
filter = yes |
|
|
|
|
|
# Don't log memory errors individually. |
|
|
|
|
|
# They still get accounted if that is enabled. |
|
|
|
|
|
#filter-memory-errors = yes |
|
|
|
|
|
|
|
|
|
|
|
# output in undecoded raw format to be easier machine readable |
|
|
|
|
|
# (default is decoded). |
|
|
|
|
|
#raw = yes |
|
|
|
|
|
|
|
|
|
|
|
# Set CPU Mhz to decode uptime from time stamp counter (output |
|
|
|
|
|
# unreliable, not needed on new kernels which report the event time |
|
|
|
|
|
# directly. A lot of systems don't have a linear time stamp clock |
|
|
|
|
|
# and the output is wrong then. |
|
|
|
|
|
# Normally mcelog tries to figure out if it the TSC is reliable |
|
|
|
|
|
# and only uses the current frequency then. |
|
|
|
|
|
# Setting a frequency forces timestamp decoding. |
|
|
|
|
|
# This setting is obsolete with modern kernels which report the time |
|
|
|
|
|
# directly. |
|
|
|
|
|
#cpumhz = 1800.00 |
|
|
|
|
|
|
|
|
|
|
|
# log output options |
|
|
|
|
|
# Log decoded machine checks in syslog (default stdout or syslog for daemon) |
|
|
|
|
|
#syslog = yes |
|
|
|
|
|
# Log decoded machine checks in syslog with error level |
|
|
|
|
|
#syslog-error = yes |
|
|
|
|
|
# Never log anything to syslog |
|
|
|
|
|
#no-syslog = yes |
|
|
|
|
|
# Append log output to logfile instead of stdout. Only when no syslog logging is active |
|
|
|
|
|
#logfile = filename |
|
|
|
|
|
|
|
|
|
|
|
{%- if mcelog.logging is defined %} |
|
|
|
|
|
|
|
|
|
|
|
{%- if mcelog.logging.syslog is defined %} |
|
|
|
|
|
syslog = {{ 'yes' if mcelog.logging.syslog else 'no' }} |
|
|
|
|
|
{%- endif %} |
|
|
|
|
|
{%- if mcelog.logging.syslog_error is defined %} |
|
|
|
|
|
syslog-error = {{ 'yes' if mcelog.logging.syslog_error else 'no' }} |
|
|
|
|
|
{%- endif %} |
|
|
|
|
|
{%- if mcelog.logging.no_syslog is defined %} |
|
|
|
|
|
no-syslog = {{ 'yes' if mcelog.logging.no_syslog else 'no' }} |
|
|
|
|
|
{%- endif %} |
|
|
|
|
|
{%- if mcelog.logging.logfile is defined %} |
|
|
|
|
|
logfile = {{ mcelog.logging.logfile }} |
|
|
|
|
|
{%- endif %} |
|
|
|
|
|
|
|
|
|
|
|
{%- endif %} |
|
|
|
|
|
# Use SMBIOS information to decode DIMMs (needs root). |
|
|
|
|
|
# This function is not recommended to use right now and generally not needed. |
|
|
|
|
|
# The exception is memdb prepopulation, which is configured separately below. |
|
|
|
|
|
#dmi = no |
|
|
|
|
|
|
|
|
|
|
|
# When in daemon mode run as this user after set up. |
|
|
|
|
|
# Note that the triggers will run as this user too. |
|
|
|
|
|
# Setting this to non root will mean that triggers cannot take some corrective |
|
|
|
|
|
# action, like offlining objects. |
|
|
|
|
|
#run-credentials-user = root |
|
|
|
|
|
|
|
|
|
|
|
# group to run as daemon with |
|
|
|
|
|
# default to the group of the run-credentials-user |
|
|
|
|
|
#run-credentials-group = nobody |
|
|
|
|
|
|
|
|
|
|
|
[server] |
|
|
|
|
|
# user allowed to access client socket. |
|
|
|
|
|
# when set to * match any |
|
|
|
|
|
# root is always allowed to access. |
|
|
|
|
|
# default: root only |
|
|
|
|
|
client-user = root |
|
|
|
|
|
# group allowed to access mcelog |
|
|
|
|
|
# When no group is configured any group matches (but still user checking). |
|
|
|
|
|
# when set to * match any |
|
|
|
|
|
#client-group = root |
|
|
|
|
|
# Path to the unix socket for client<->server communication. |
|
|
|
|
|
# When no socket-path is configured the server will not start |
|
|
|
|
|
#socket-path = /var/run/mcelog-client |
|
|
|
|
|
# When mcelog starts it checks if a server is already running. This configures the timeout |
|
|
|
|
|
# for this check. |
|
|
|
|
|
#initial-ping-timeout = 2 |
|
|
|
|
|
# |
|
|
|
|
|
[dimm] |
|
|
|
|
|
# Is the in memory DIMM error tracking enabled? |
|
|
|
|
|
# Only works on systems with integrated memory controller and |
|
|
|
|
|
# which are supported. |
|
|
|
|
|
# Only takes effect in daemon mode. |
|
|
|
|
|
dimm-tracking-enabled = yes |
|
|
|
|
|
# Use DMI information from the BIOS to prepopulate DIMM database. |
|
|
|
|
|
# Note this might not work with all BIOS and requires mcelog to run as root. |
|
|
|
|
|
# Alternative is to let mcelog create DIMM objects on demand. |
|
|
|
|
|
dmi-prepopulate = yes |
|
|
|
|
|
# |
|
|
|
|
|
# Execute these triggers when the rate of corrected or uncorrected |
|
|
|
|
|
# Errors per DIMM exceeds the threshold. |
|
|
|
|
|
# Note when the hardware does not report DIMMs this might also |
|
|
|
|
|
# be per channel. |
|
|
|
|
|
# The default of 10/24h is reasonable for server quality |
|
|
|
|
|
# DDR3 DIMMs as of 2009/10. |
|
|
|
|
|
#uc-error-trigger = dimm-error-trigger |
|
|
|
|
|
uc-error-threshold = 1 / 24h |
|
|
|
|
|
#ce-error-trigger = dimm-error-trigger |
|
|
|
|
|
ce-error-threshold = 10 / 24h |
|
|
|
|
|
|
|
|
|
|
|
[socket] |
|
|
|
|
|
# Enable memory error accounting per socket. |
|
|
|
|
|
socket-tracking-enabled = yes |
|
|
|
|
|
|
|
|
|
|
|
# Threshold and trigger for uncorrected memory errors on a socket. |
|
|
|
|
|
# mem-uc-error-trigger = socket-memory-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
mem-uc-error-threshold = 100 / 24h |
|
|
|
|
|
|
|
|
|
|
|
# Trigger script for corrected memory errors on a socket. |
|
|
|
|
|
mem-ce-error-trigger = socket-memory-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
# Threshold on when to trigger a correct error for the socket. |
|
|
|
|
|
|
|
|
|
|
|
mem-ce-error-threshold = 100 / 24h |
|
|
|
|
|
|
|
|
|
|
|
# Log socket error threshold explicitely? |
|
|
|
|
|
mem-ce-error-log = yes |
|
|
|
|
|
|
|
|
|
|
|
# Trigger script for uncorrected bus error events |
|
|
|
|
|
bus-uc-threshold-trigger = bus-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
# Trigger script for uncorrected IOMCA erors |
|
|
|
|
|
iomca-threshold-trigger = iomca-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
# Trigger script for other uncategorized errors |
|
|
|
|
|
unknown-threshold-trigger = unknown-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
[cache] |
|
|
|
|
|
# Processing of cache error thresholds reported by Intel CPUs. |
|
|
|
|
|
cache-threshold-trigger = cache-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
# Should cache threshold events be logged explicitely? |
|
|
|
|
|
cache-threshold-log = yes |
|
|
|
|
|
|
|
|
|
|
|
[page] |
|
|
|
|
|
# Memory error accouting per 4K memory page. |
|
|
|
|
|
# Threshold for the correct memory errors trigger script. |
|
|
|
|
|
memory-ce-threshold = 10 / 24h |
|
|
|
|
|
|
|
|
|
|
|
# Trigger script for corrected errors. |
|
|
|
|
|
# memory-ce-trigger = page-error-trigger |
|
|
|
|
|
|
|
|
|
|
|
# Should page threshold events be logged explicitely? |
|
|
|
|
|
memory-ce-log = yes |
|
|
|
|
|
|
|
|
|
|
|
# specify the internal action in mcelog to exceeding a page error threshold |
|
|
|
|
|
# this is done in addition to executing the trigger script if available |
|
|
|
|
|
# off no action |
|
|
|
|
|
# account only account errors |
|
|
|
|
|
# soft try to soft-offline page without killing any processes |
|
|
|
|
|
# This requires an uptodate kernel. Might not be successfull. |
|
|
|
|
|
# hard try to hard-offline page by killing processes |
|
|
|
|
|
# Requires an uptodate kernel. Might not be successfull. |
|
|
|
|
|
# soft-then-hard First try to soft offline, then try hard offlining |
|
|
|
|
|
#memory-ce-action = off|account|soft|hard|soft-then-hard |
|
|
|
|
|
memory-ce-action = soft |
|
|
|
|
|
|
|
|
|
|
|
[trigger] |
|
|
|
|
|
# Maximum number of running triggers |
|
|
|
|
|
children-max = 2 |
|
|
|
|
|
# execute triggers in this directory |
|
|
|
|
|
directory = /etc/mcelog |