diff --git a/go.mod b/go.mod index fb837b15666..0810d7e7e08 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/buger/goterm v0.0.0-20200322175922-2f3e71b85129 github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e github.com/cockroachdb/pebble v0.0.0-20201001221639-879f3bfeef07 - github.com/coreos/go-systemd/v22 v22.0.0 + github.com/coreos/go-systemd/v22 v22.1.0 github.com/detailyang/go-fallocate v0.0.0-20180908115635-432fa640bd2e github.com/dgraph-io/badger/v2 v2.2007.2 github.com/docker/go-units v0.4.0 @@ -124,7 +124,7 @@ require ( github.com/polydawn/refmt v0.0.0-20190809202753-05966cbd336a github.com/prometheus/client_golang v1.6.0 github.com/raulk/clock v1.1.0 - github.com/raulk/go-watchdog v0.0.1 + github.com/raulk/go-watchdog v1.0.1 github.com/stretchr/testify v1.6.1 github.com/supranational/blst v0.1.1 github.com/syndtr/goleveldb v1.0.0 diff --git a/go.sum b/go.sum index 6cef9defd8b..7f60cef4ae9 100644 --- a/go.sum +++ b/go.sum @@ -129,6 +129,7 @@ github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5O github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= github.com/clbanning/x2j v0.0.0-20191024224557-825249438eec/go.mod h1:jMjuTZXRI4dUb/I5gc9Hdhagfvm9+RyrPryS/auMzxE= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= @@ -145,6 +146,8 @@ github.com/cockroachdb/redact v0.0.0-20200622112456-cd282804bbd3 h1:2+dpIJzYMSbL github.com/cockroachdb/redact v0.0.0-20200622112456-cd282804bbd3/go.mod h1:BVNblN9mBWFyMyqK1k3AAiSxhvhfK2oOZZ2lK+dpvRg= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd h1:qMd81Ts1T2OTKmB4acZcyKaMtRnY5Y44NuXGX2GFJ1w= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= +github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327 h1:7grrpcfCtbZLsjtB0DgMuzs1umsJmpzaHMZ6cO6iAWw= +github.com/containerd/cgroups v0.0.0-20201119153540-4cbc285b3327/go.mod h1:ZJeTFisyysqgcCdecO57Dj79RfL0LNeGiFUqLYQRYLE= github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk= github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= @@ -155,8 +158,8 @@ github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7 github.com/coreos/go-systemd v0.0.0-20181012123002-c6f51f82210d/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU= github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= -github.com/coreos/go-systemd/v22 v22.0.0 h1:XJIw/+VlJ+87J+doOxznsAWIdmWuViOVhkQamW5YV28= -github.com/coreos/go-systemd/v22 v22.0.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= +github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= github.com/cpuguy83/go-md2man v1.0.10 h1:BSKMNlYxDvnunlTymqtgONjNnaRV1sTpcovwwjF22jk= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= @@ -394,6 +397,8 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0 h1:/QaMHBdZ26BB3SSst0Iwl10Epc+xhTquomWX0oZEB6w= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -1236,6 +1241,8 @@ github.com/onsi/gomega v1.9.0/go.mod h1:Ho0h+IUsWyvy1OpqCwxlQ/21gkhVunqlU8fDGcoT github.com/onsi/gomega v1.10.1 h1:o0+MgICZLuZ7xjH7Vx6zS/zcu93/BEp1VwkIW1mEXCE= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/op/go-logging v0.0.0-20160315200505-970db520ece7/go.mod h1:HzydrMdWErDVzsI23lYNej1Htcns9BCg93Dk0bBINWk= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opentracing-contrib/go-grpc v0.0.0-20180928155321-4b5a12d3ff02/go.mod h1:JNdpVEzCpXBgIiv4ds+TzhN1hrtxq6ClLrTlT9OQRSc= github.com/opentracing-contrib/go-grpc v0.0.0-20191001143057-db30781987df h1:vdYtBU6zvL7v+Tr+0xFM/qhahw/EvY8DMMunZHKH6eE= github.com/opentracing-contrib/go-grpc v0.0.0-20191001143057-db30781987df/go.mod h1:DYR5Eij8rJl8h7gblRrOZ8g0kW1umSpKqYIBTgeDtLo= @@ -1313,8 +1320,8 @@ github.com/prometheus/procfs v0.1.0 h1:jhMy6QXfi3y2HEzFoyuCj40z4OZIIHHPtFyCMftmv github.com/prometheus/procfs v0.1.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/raulk/clock v1.1.0 h1:dpb29+UKMbLqiU/jqIJptgLR1nn23HLgMY0sTCDza5Y= github.com/raulk/clock v1.1.0/go.mod h1:3MpVxdZ/ODBQDxbN+kzshf5OSZwPjtMDx6BBXBmOeY0= -github.com/raulk/go-watchdog v0.0.1 h1:q0ad0fanW8uaLRTvxQ0RfdADBiKa6CL6NMByhB0vpBs= -github.com/raulk/go-watchdog v0.0.1/go.mod h1:dIvQcKy0laxuHGda1ms8/2T9wE3ZJRbz9bxEO7c0q1M= +github.com/raulk/go-watchdog v1.0.1 h1:qgm3DIJAeb+2byneLrQJ7kvmDLGxN2vy3apXyGaDKN4= +github.com/raulk/go-watchdog v1.0.1/go.mod h1:lzSbAl5sh4rtI8tYHU01BWIDzgzqaQLj6RcA1i4mlqI= github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 h1:OdAsTTz6OkFY5QxjkYwrChwuRruF69c169dPK26NUlk= github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= @@ -1366,6 +1373,8 @@ github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHeiJApdr3r4w= @@ -1426,6 +1435,8 @@ github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljT github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= github.com/urfave/cli v1.22.1 h1:+mkCCcOFKPnCmVYVcURKps1Xe+3zP90gSYGNfRkjoIY= github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= +github.com/urfave/cli v1.22.2 h1:gsqYFH8bb9ekPA12kRo0hfjngWQjkJPlN9R0N78BoUo= +github.com/urfave/cli v1.22.2/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/urfave/cli/v2 v2.0.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ= github.com/urfave/cli/v2 v2.2.0 h1:JTTnM6wKzdA0Jqodd966MVj4vWbbquZykeX1sKbe2C4= github.com/urfave/cli/v2 v2.2.0/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ= diff --git a/node/modules/core.go b/node/modules/core.go index 794a9dafe89..f8b248572ad 100644 --- a/node/modules/core.go +++ b/node/modules/core.go @@ -7,6 +7,7 @@ import ( "io" "io/ioutil" "os" + "path/filepath" "time" "github.com/gbrlsnchs/jwt/v3" @@ -14,6 +15,7 @@ import ( "github.com/libp2p/go-libp2p-core/peer" "github.com/libp2p/go-libp2p-core/peerstore" record "github.com/libp2p/go-libp2p-record" + "github.com/raulk/go-watchdog" "go.uber.org/fx" "golang.org/x/xerrors" @@ -28,7 +30,6 @@ import ( "github.com/filecoin-project/lotus/node/modules/dtypes" "github.com/filecoin-project/lotus/node/repo" "github.com/filecoin-project/lotus/system" - "github.com/raulk/go-watchdog" ) const ( @@ -69,45 +70,71 @@ func MemoryConstraints() system.MemoryConstraints { // MemoryWatchdog starts the memory watchdog, applying the computed resource // constraints. -func MemoryWatchdog(lc fx.Lifecycle, constraints system.MemoryConstraints) { +func MemoryWatchdog(lr repo.LockedRepo, lc fx.Lifecycle, constraints system.MemoryConstraints) { if os.Getenv(EnvWatchdogDisabled) == "1" { log.Infof("memory watchdog is disabled via %s", EnvWatchdogDisabled) return } - cfg := watchdog.MemConfig{ - Resolution: 5 * time.Second, - Policy: &watchdog.WatermarkPolicy{ - Watermarks: []float64{0.50, 0.60, 0.70, 0.85, 0.90, 0.925, 0.95}, - EmergencyWatermark: 0.95, - }, - Logger: logWatchdog, + // configure heap profile capture so that one is captured per episode where + // utilization climbs over 90% of the limit. A maximum of 10 heapdumps + // will be captured during life of this process. + watchdog.HeapProfileDir = filepath.Join(lr.Path(), "heapprof") + watchdog.HeapProfileMaxCaptures = 10 + watchdog.HeapProfileThreshold = 0.1 + watchdog.Logger = logWatchdog + + policy := watchdog.NewWatermarkPolicy(0.50, 0.60, 0.70, 0.85, 0.90, 0.925, 0.95) + + // Try to initialize a watchdog in the following order of precedence: + // 1. If a max heap limit has been provided, initialize a heap-driven watchdog. + // 2. Else, try to initialize a cgroup-driven watchdog. + // 3. Else, try to initialize a system-driven watchdog. + // 4. Else, log a warning that the system is flying solo, and return. + + addStopHook := func(stopFn func()) { + lc.Append(fx.Hook{ + OnStop: func(ctx context.Context) error { + stopFn() + return nil + }, + }) } - // if user has set max heap limit, apply it. Otherwise, fall back to total - // system memory constraint. + // 1. If user has set max heap limit, apply it. if maxHeap := constraints.MaxHeapMem; maxHeap != 0 { - log.Infof("memory watchdog will apply max heap constraint: %d bytes", maxHeap) - cfg.Limit = maxHeap - cfg.Scope = watchdog.ScopeHeap - } else { - log.Infof("max heap size not provided; memory watchdog will apply total system memory constraint: %d bytes", constraints.TotalSystemMem) - cfg.Limit = constraints.TotalSystemMem - cfg.Scope = watchdog.ScopeSystem + const minGOGC = 10 + err, stopFn := watchdog.HeapDriven(maxHeap, minGOGC, policy) + if err == nil { + log.Infof("initialized heap-driven watchdog; max heap: %d bytes", maxHeap) + addStopHook(stopFn) + return + } + log.Warnf("failed to initialize heap-driven watchdog; err: %s", err) + log.Warnf("trying a cgroup-driven watchdog") } - err, stop := watchdog.Memory(cfg) - if err != nil { - log.Warnf("failed to instantiate memory watchdog: %s", err) + // 2. cgroup-driven watchdog. + err, stopFn := watchdog.CgroupDriven(5*time.Second, policy) + if err == nil { + log.Infof("initialized cgroup-driven watchdog") + addStopHook(stopFn) + return + } + log.Warnf("failed to initialize cgroup-driven watchdog; err: %s", err) + log.Warnf("trying a system-driven watchdog") + + // 3. system-driven watchdog. + err, stopFn = watchdog.SystemDriven(0, 5*time.Second, policy) // 0 calculates the limit automatically. + if err == nil { + log.Infof("initialized system-driven watchdog") + addStopHook(stopFn) return } - lc.Append(fx.Hook{ - OnStop: func(ctx context.Context) error { - stop() - return nil - }, - }) + // 4. log the failure + log.Warnf("failed to initialize system-driven watchdog; err: %s", err) + log.Warnf("system running without a memory watchdog") } type JwtPayload struct { @@ -166,7 +193,7 @@ func BuiltinBootstrap() (dtypes.BootstrapPeers, error) { func DrandBootstrap(ds dtypes.DrandSchedule) (dtypes.DrandBootstrap, error) { // TODO: retry resolving, don't fail if at least one resolve succeeds - res := []peer.AddrInfo{} + var res []peer.AddrInfo for _, d := range ds { addrs, err := addrutil.ParseAddresses(context.TODO(), d.Config.Relays) if err != nil {