-
-
Notifications
You must be signed in to change notification settings - Fork 40
/
watchdogd.conf
165 lines (151 loc) · 5.66 KB
/
watchdogd.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# /etc/watchdogd.conf sample
# Commented out values are program defaults.
#
# The checker/monitor `warning` and `critical` levels are 0.00-1.00,
# i.e. 0-100%, except for load average which can vary a lot between
# systems and use-cases, not just because of the number of CPU cores.
# Use the `script = ...` setting to call script when `warning` and
# `critical` are reached for a monitor. In `critical` the monitor
# otherwise triggers an unconditional reboot.
#
# NOTE: `critical` is optional, omitting it disables the reboot action.
#
### Watchdogs ##########################################################
# Global settings that can be overridden per watchdog
# Do not set WDT timeout and kick interval too low, the daemon runs at
# SCHED_OTHER level with all other tasks, unless the process supervisor
# is enabled. The monitor plugins (below) need CPU time as well.
#timeout = 20
#interval = 10
# With safe-exit enabled (true) the daemon will ask the driver disable
# the WDT before exiting (SIGINT). However, some WDT drivers (or HW)
# may not support this.
#safe-exit = true
# Multiple watchdogs can be kicked, the default, even if no .conf file
# is found or device node given on the command line, is /dev/watchdog
device /dev/watchdog {
timeout = 20
interval = 10
safe-exit = true
}
#device /dev/watchdog2 {
# timeout = 20
# interval = 10
# safe-exit = true
#}
### Supervisor #########################################################
# Instrumented processes can have their main loop supervised. Processes
# subscribe to this service using the libwdog API, see the docs for more
# on this. When the supervisor is enabled and the priority is set to a
# value > 0, watchdogd runs as a SCHED_RR process with elevated realtime
# priority. When disabled, or the priority is set to zero (0), it runs
# as a regular SCHED_OTHER process, this is the default.
#
# When a supervised process fails to meet its deadline, the daemon will
# perform an unconditional reset having saved the reset reason. If a
# script is provided in this section it will be called instead. The
# script is called as:
#
# script.sh supervisor CODE PID LABEL
#
# Availabel CODEs for the reset reason are avilable in wdog.h
#
supervisor {
# !!!REMEMBER TO ENABLE reset-reason (below) AS WELL!!!
# enabled = true
# priority = 98
script = "/path/to/supervisor-script.sh"
}
### Reset reason #######################################################
# The following section controls if/how the reset reason & reset counter
# is tracked. By default this is disabled, since not all systems allow
# writing to disk, e.g. embedded systems using MTD devices with limited
# number of write cycles.
#
# The default file setting is a non-volatile path, according to the FHS.
# It can be changed to another location, but make sure that location is
# writable first.
reset-reason {
enabled = true
file = "/var/lib/misc/watchdogd.state"
}
### Checkers/Monitors ##################################################
#
# Script or command to run instead of reboot when a monitor plugin
# reaches any of its critical or warning level. Setting this will
# disable the built-in reboot on critical, it is therefore up to the
# script to perform reboot, if needed. The script is called as:
#
# script.sh {filenr, fsmon, loadavg, meminfo} {crit, warn} VALUE
#
#script = "/path/to/reboot-action.sh"
# Monitors file descriptor leaks based on /proc/sys/fs/file-nr
filenr {
# enabled = true
interval = 300
logmark = false
warning = 0.9
critical = 1.0
# script = "/path/to/alt-reboot-action.sh"
}
# Monitors a file system, blocks and inode usage against watermarks
# The script is called with fsmon as the first argument and there
# are two environment variables FSMON_NAME, for the monitored path,
# and FSMON_TYPE indicating either 'blocks' or 'inodes'.
#fsmon /var {
# enabled = true
# interval = 300
# logmark = false
# warning = 0.95
# critical = 1.0
# script = "/path/to/alt-reboot-action.sh"
#}
# Monitors load average based on sysinfo() from /proc/loadavg
# The level is composed from the average of the 1 and 5 min marks.
loadavg {
# enabled = true
interval = 300
logmark = false
warning = 1.0
critical = 2.0
# script = "/path/to/alt-reboot-action.sh"
}
# Monitors free RAM based on data from /proc/meminfo
meminfo {
# enabled = true
interval = 300
logmark = false
warning = 0.9
critical = 0.95
# script = "/path/to/alt-reboot-action.sh"
}
# Monitor temperature. The critical value is unset by default, so no
# action is taken at that watermark (by default). Both the critical and
# warning watermarks are relative to the trip/critical/max value from
# sysfs. The warning is default 0.9, i.e., 90% of critical. Use script
# to to reset the fan controller or poweroff(8) the system.
#
# Each temp monitor caches the last 10 values, calculates the mean, and
# compares that to the warning and critical levels. Logging is only
# done every 10 x interval (if enabled).
#tempmon /sys/class/hwmon/hwmon0/temp1_input {
# enabled = true
# interval = 30
# warning = 0.9
# critical = 0.95
# logmark = true
# script = "/script/to/log/and/poweroff.sh"
#}
# Monitor a generic script, executes 'monitor-script' every 'interval'
# seconds, with a max runtime of 'timeout' seconds. When the exit code
# of the monitor script is above the critical level watchdogd either
# starts the reboot, or calls the alternate 'script' to determin the
# next cause of action.
#generic /path/to/monitor-script.sh {
# enabled = true
# interval = 300
# timeout = 60
# warning = 1
# critical = 10
# script = "/path/to/alt-reboot-action.sh"
#}