# watchdogd.conf

# /etc/watchdogd.conf sample
# Commented out values are program defaults.
#
# The checker/monitor `warning` and `critical` levels are 0.00-1.00,
# i.e. 0-100%, except for load average which can vary a lot between
# systems and use-cases, not just because of the number of CPU cores.
# Use the `script = ...` setting to call a script when the `warning` or
# `critical` level is reached for a monitor.  Without a script, reaching
# `critical` triggers an unconditional reboot.
#
# NOTE: `critical` is optional, omitting it disables the reboot action.
#

### Watchdogs ##########################################################
# Global settings that can be overridden per watchdog

# Do not set WDT timeout and kick interval too low, the daemon runs at
# SCHED_OTHER level with all other tasks, unless the process supervisor
# is enabled.  The monitor plugins (below) need CPU time as well.
#timeout   = 20
#interval  = 10

# With safe-exit enabled (true) the daemon will ask the driver to disable
# the WDT before exiting (SIGINT).  However, some WDT drivers (or HW)
# may not support this.
#safe-exit = true

# Multiple watchdogs can be kicked.  The default, even if no .conf file
# is found or no device node is given on the command line, is /dev/watchdog
device /dev/watchdog {
#    Per-device settings for the default watchdog; each one overrides
#    the global setting of the same name for this device only.
#      timeout:   WDT timeout (presumably in seconds -- confirm).
#      interval:  how often the daemon kicks the WDT (presumably in
#                 seconds -- confirm).
#      safe-exit: ask the driver to disable the WDT before the daemon
#                 exits (SIGINT); some drivers/HW may not support it.
    timeout    = 20
    interval   = 10
    safe-exit  = true
}

#device /dev/watchdog2 {
#    timeout    = 20
#    interval   = 10
#    safe-exit  = true
#}

### Supervisor #########################################################
# Instrumented processes can have their main loop supervised.  Processes
# subscribe to this service using the libwdog API, see the docs for more
# on this.  When the supervisor is enabled and the priority is set to a
# value > 0, watchdogd runs as a SCHED_RR process with elevated realtime
# priority.  When disabled, or the priority is set to zero (0), it runs
# as a regular SCHED_OTHER process, this is the default.
#
# When a supervised process fails to meet its deadline, the daemon will
# perform an unconditional reset having saved the reset reason.  If a
# script is provided in this section it will be called instead.  The
# script is called as:
#
#    script.sh supervisor CODE PID LABEL
#
# Available CODEs for the reset reason are listed in wdog.h
#
supervisor {
#    !!!REMEMBER TO ENABLE reset-reason (below) AS WELL!!!
#
#    With the supervisor enabled and priority > 0, watchdogd runs as a
#    SCHED_RR process with elevated realtime priority.  When disabled,
#    or with priority 0, it runs as a regular SCHED_OTHER process.
#    enabled  = true
#    priority = 98
#
#    Leave script unset to keep the default action, an unconditional
#    reset, when a supervised process misses its deadline.  When set,
#    the script is called INSTEAD of the reset, so only uncomment this
#    after replacing the placeholder with the path to a real script.
#    script = "/path/to/supervisor-script.sh"
}

### Reset reason #######################################################
# The following section controls if/how the reset reason & reset counter
# are tracked.  By default this is disabled, since not all systems allow
# writing to disk, e.g. embedded systems using MTD devices with limited
# number of write cycles.
#
# The default file setting is a non-volatile path, according to the FHS.
# It can be changed to another location, but make sure that location is
# writable first.
reset-reason {
#    enabled: turns on tracking of the reset reason and reset counter.
#             This sample enables it; the built-in default is disabled.
#    file:    where the state is stored.  If changed, make sure the new
#             location is writable first.
    enabled = true
    file    = "/var/lib/misc/watchdogd.state"
}

### Checkers/Monitors ##################################################
#
# Script or command to run instead of reboot when a monitor plugin
# reaches its critical or warning level.  Setting this will disable
# the built-in reboot on critical, it is therefore up to the script
# to perform the reboot, if needed.  The script is called as:
#
#    script.sh {filenr, fsmon, loadavg, meminfo} {crit, warn} VALUE
#
#script = "/path/to/reboot-action.sh"

# Monitors file descriptor leaks based on /proc/sys/fs/file-nr
filenr {
#    interval: time between checks (presumably in seconds -- confirm).
#    logmark:  NOTE(review): appears to toggle periodic logging of the
#              measured level -- confirm against the watchdogd docs.
#    warning/critical: levels in the range 0.00-1.00, i.e. 0-100%.
#              critical is optional, omitting it disables the reboot
#              action for this monitor.
#    enabled = true
    interval = 300
    logmark  = false
    warning  = 0.9
    critical = 1.0
#    script = "/path/to/alt-reboot-action.sh"
}

# Monitors a file system's block and inode usage against watermarks.
# The script is called with fsmon as the first argument and there
# are two environment variables FSMON_NAME, for the monitored path,
# and FSMON_TYPE indicating either 'blocks' or 'inodes'.
#fsmon /var {
#    enabled = true
#    interval = 300
#    logmark  = false
#    warning  = 0.95
#    critical = 1.0
#    script = "/path/to/alt-reboot-action.sh"
#}

# Monitors load average based on sysinfo() from /proc/loadavg
# The level is composed from the average of the 1 and 5 min marks.
loadavg {
#    interval: time between checks (presumably in seconds -- confirm).
#    logmark:  NOTE(review): appears to toggle periodic logging of the
#              measured level -- confirm against the watchdogd docs.
#    warning/critical: unlike the other monitors these are NOT limited
#              to 0.00-1.00; they are load average values compared with
#              the average of the 1 and 5 min marks.  Tune them for the
#              system and use-case, not just the number of CPU cores.
#              critical is optional, omitting it disables the reboot
#              action for this monitor.
#    enabled = true
    interval = 300
    logmark  = false
    warning  = 1.0
    critical = 2.0
#    script = "/path/to/alt-reboot-action.sh"
}

# Monitors free RAM based on data from /proc/meminfo
meminfo {
#    interval: time between checks (presumably in seconds -- confirm).
#    logmark:  NOTE(review): appears to toggle periodic logging of the
#              measured level -- confirm against the watchdogd docs.
#    warning/critical: levels in the range 0.00-1.00, i.e. 0-100%
#              (presumably the fraction of RAM in use -- confirm).
#              critical is optional, omitting it disables the reboot
#              action for this monitor.
#    enabled = true
    interval = 300
    logmark  = false
    warning  = 0.9
    critical = 0.95
#    script = "/path/to/alt-reboot-action.sh"
}

# Monitor temperature.  The critical value is unset by default, so no
# action is taken at that watermark (by default).  Both the critical and
# warning watermarks are relative to the trip/critical/max value from
# sysfs.  The warning default is 0.9, i.e., 90% of critical.  Use script
# to reset the fan controller or poweroff(8) the system.
#
# Each temp monitor caches the last 10 values, calculates the mean, and
# compares that to the warning and critical levels.  Logging is only
# done every 10 x interval (if enabled).
#tempmon /sys/class/hwmon/hwmon0/temp1_input {
#    enabled  = true
#    interval = 30
#    warning  = 0.9
#    critical = 0.95
#    logmark  = true
#    script   = "/script/to/log/and/poweroff.sh"
#}

# Monitor a generic script, executes 'monitor-script' every 'interval'
# seconds, with a max runtime of 'timeout' seconds.  When the exit code
# of the monitor script is above the critical level watchdogd either
# starts the reboot, or calls the alternate 'script' to determine the
# next course of action.
#generic /path/to/monitor-script.sh {
#    enabled = true
#    interval = 300
#    timeout = 60
#    warning  = 1
#    critical = 10
#    script = "/path/to/alt-reboot-action.sh"
#}