-
Notifications
You must be signed in to change notification settings - Fork 3.8k
/
Copy pathliveness.go
197 lines (180 loc) · 8.08 KB
/
liveness.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
package kvserverpb
import (
"fmt"
"time"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
)
// IsLive reports whether the liveness record has not yet expired as of now,
// i.e. whether now is strictly before the record's expiration.
//
// NOTE: To decide whether the Liveness is valid currently, the timestamp
// passed in should be the known high-water mark of all the clocks of the nodes
// in the cluster. For example, if the liveness expires at ts 100 and our
// physical clock is at 90, but we know that another node's clock is at 110,
// then it's preferable (more consistent across nodes) for the liveness to be
// considered expired. For that purpose, prefer clock.Now().GoTime() over
// clock.PhysicalNow(): the former takes clock signals from other nodes into
// consideration, the latter doesn't.
func (l *Liveness) IsLive(now time.Time) bool {
	return now.Before(timeutil.Unix(0, l.Expiration.WallTime))
}
// IsDead reports whether, as of now, at least threshold has elapsed since the
// liveness record expired.
//
// Note that, because of threshold, IsDead() is not the inverse of IsLive(): a
// record that has expired less than threshold ago is neither live nor dead.
func (l *Liveness) IsDead(now time.Time, threshold time.Duration) bool {
	// The record counts as dead from expiration+threshold onwards (inclusive).
	deadline := timeutil.Unix(0, l.Expiration.WallTime).Add(threshold)
	return !deadline.After(now)
}
// String renders the liveness record as
// "liveness(nid:<node> epo:<epoch> exp:<expiration>)". The draining flag and
// the commission status are appended only when the node is draining,
// decommissioning, or decommissioned.
func (l *Liveness) String() string {
	status := l.CommissionStatus
	suffix := ""
	if verbose := l.Draining || status.Decommissioning() || status.Decommissioned(); verbose {
		suffix = fmt.Sprintf(" drain:%t comm:%s", l.Draining, status.String())
	}
	return fmt.Sprintf("liveness(nid:%d epo:%d exp:%s%s)", l.NodeID, l.Epoch, l.Expiration, suffix)
}
// EnsureCompatible is typically called before transmitting/after receiving
// Liveness objects from over the wire. The representation for a given node's
// 'commission status' was changed in v20.2. In v20.1, we used a boolean
// representation to indicate that a node was undergoing a decommissioning
// process. Since it was only a boolean, we couldn't disambiguate between a
// node currently undergoing decommissioning and a fully decommissioned node.
// In v20.2 we introduced a dedicated enum to be able to disambiguate between
// the two. That being said, v20.2 nodes need to be able to operate in mixed
// clusters with v20.1 nodes, which only know how to interpret the boolean
// representation.
//
// EnsureCompatible reconciles the two representations by mutating the receiver
// such that it's understood by both v20.1 and v20.2 nodes (see AssertValid for
// what this entails). If the enum is unknown (the record came from a v20.1
// node, or is an empty Liveness), the deprecated boolean is treated as
// authoritative and the enum is derived from it. Otherwise the enum is
// authoritative and the boolean is backfilled from it.
//
// TODO(irfansharif): Remove this once v20.2 is cut.
func (l *Liveness) EnsureCompatible() {
	if !l.CommissionStatus.Unknown() {
		// Liveness is from a node running v20.2; backfill the deprecated
		// boolean state from the enum.
		status := l.CommissionStatus
		l.DeprecatedDecommissioning = status.Decommissioning() || status.Decommissioned()
		return
	}
	// Liveness is from a node running v20.1, or is an empty
	// kvserverpb.Liveness; fill in the commission status from the boolean.
	l.CommissionStatus = CommissionStatusFromBooleanForm(l.DeprecatedDecommissioning)
}
// AssertValid checks that the liveness record is internally consistent (i.e.
// its deprecated v20.1 boolean decommissioning representation is consistent
// with the v20.2 enum representation).
//
// It panics if the commission status is unknown, or if the deprecated boolean
// disagrees with the enum: the boolean must be set exactly when the status is
// decommissioning or decommissioned.
func (l *Liveness) AssertValid() {
	status := l.CommissionStatus
	if status.Unknown() {
		panic("invalid commission status")
	}

	leaving := status.Decommissioning() || status.Decommissioned()
	if leaving != l.DeprecatedDecommissioning {
		// The message is built only on failure so that the (common) valid
		// path does not pay for formatting the record.
		panic(fmt.Sprintf("inconsistent liveness representation: %v", l.String()))
	}
}
// CommissionStatusFromBooleanForm converts the deprecated boolean
// decommissioning state used in the v20.1 liveness proto definition to the new
// CommissionStatus enum: false maps to CommissionStatus_COMMISSIONED and true
// maps to CommissionStatus_DECOMMISSIONING.
//
// TODO(irfansharif): Remove this once v20.2 is cut, as we no longer need to be
// compatible with the deprecated boolean decommissioning representation used by
// v20.1 nodes.
func CommissionStatusFromBooleanForm(decommissioning bool) CommissionStatus {
	if !decommissioning {
		// We take the optimistic route here and assume the node is fully
		// commissioned (we don't have a way of representing a node in the
		// 'recommissioning' state, see comment on CommissionStatus for why
		// that is).
		return CommissionStatus_COMMISSIONED
	}
	// We take the conservative opinion and assume the node to be
	// decommissioning, not fully decommissioned (after all, that's all one
	// can infer from a boolean decommissioning state). If operators
	// decommissioned nodes in a cluster running v20.1 and v20.2 nodes, they
	// may have to decommission the nodes again once fully onto v20.2 in order
	// to durably mark said nodes as decommissioned.
	return CommissionStatus_DECOMMISSIONING
}
// Unknown reports whether c is CommissionStatus_UNKNOWN.
func (c CommissionStatus) Unknown() bool {
	return c == CommissionStatus_UNKNOWN
}
// Decommissioning reports whether c is CommissionStatus_DECOMMISSIONING.
func (c CommissionStatus) Decommissioning() bool {
	return c == CommissionStatus_DECOMMISSIONING
}
// Decommissioned reports whether c is CommissionStatus_DECOMMISSIONED.
func (c CommissionStatus) Decommissioned() bool {
	return c == CommissionStatus_DECOMMISSIONED
}
// Commissioned reports whether c is CommissionStatus_COMMISSIONED.
func (c CommissionStatus) Commissioned() bool {
	return c == CommissionStatus_COMMISSIONED
}
// String returns the lower-case name of the commission status: "unknown",
// "commissioned", "decommissioning" or "decommissioned". It panics if c is
// not one of those four values.
func (c CommissionStatus) String() string {
	var name string
	switch c {
	case CommissionStatus_UNKNOWN:
		name = "unknown"
	case CommissionStatus_COMMISSIONED:
		name = "commissioned"
	case CommissionStatus_DECOMMISSIONING:
		name = "decommissioning"
	case CommissionStatus_DECOMMISSIONED:
		name = "decommissioned"
	default:
		panic("unknown commission status, expected one of [unknown,commissioned,decommissioning,decommissioned]")
	}
	return name
}
// ErrIllegalCommissionStatusTransition is the sentinel error for illegal
// commission status transition attempts. ValidateTransition prefixes the
// errors it returns for rejected transitions with this error's message.
var ErrIllegalCommissionStatusTransition = errors.New("illegal commission status transition")
// ValidateTransition validates transitions of the liveness record, returning an
// error if the proposed transition is invalid. Ignoring no-ops, the valid state
// transitions for CommissionStatus are as follows:
//
//	Decommissioning => Commissioned
//	Commissioned    => Decommissioning
//	Decommissioning => Decommissioned
//
// See diagram above the CommissionStatus type for more details.
//
// If old is the zero Liveness (i.e. there is no previous record), every new
// state is accepted. Errors returned for rejected transitions are prefixed
// with the message of ErrIllegalCommissionStatusTransition and are marked
// with it, so callers can detect them using
// errors.Is(err, ErrIllegalCommissionStatusTransition).
//
// ValidateTransition panics if new's commission status is unknown.
func ValidateTransition(old, new Liveness) error {
	if new.CommissionStatus.Unknown() {
		panic("illegal usage: liveness commission status is unknown")
	}
	if old == (Liveness{}) {
		// No previous liveness present, all states are considered valid.
		return nil
	}
	if old.CommissionStatus == new.CommissionStatus {
		// No-op.
		return nil
	}

	switch {
	case new.CommissionStatus.Commissioned() && !old.CommissionStatus.Decommissioning():
		return illegalTransitionError(errors.Newf(
			"can only recommission a decommissioning node; n%d found to be %s",
			new.NodeID, old.CommissionStatus.String()))

	case new.CommissionStatus.Decommissioning() && !old.CommissionStatus.Commissioned():
		// NB: With no-ops already filtered out above, this is reached when
		// old is not commissioned and not decommissioning either, e.g. when
		// attempting to move a fully decommissioned node back to
		// decommissioning.
		return illegalTransitionError(errors.Newf(
			"can only decommission a commissioned node, found %s",
			old.CommissionStatus.String()))

	case new.CommissionStatus.Decommissioned() && !old.CommissionStatus.Decommissioning():
		return illegalTransitionError(errors.Newf(
			"can only fully decommission a decommissioning node, found %s",
			old.CommissionStatus.String()))
	}
	return nil
}

// illegalTransitionError prefixes cause with the message of
// ErrIllegalCommissionStatusTransition and marks the result with that
// sentinel so that it can be matched with errors.Is.
func illegalTransitionError(cause error) error {
	// errors.Wrap (rather than Wrapf) is used because the prefix is a plain
	// message, not a format string.
	prefixed := errors.Wrap(cause, ErrIllegalCommissionStatusTransition.Error())
	return errors.Mark(prefixed, ErrIllegalCommissionStatusTransition)
}