Skip to content

Commit

Permalink
Add a specialized function to recompute the health of primary tablets.
Browse files Browse the repository at this point in the history
This ensures that we can never end up in a situation where there is more
than one healthy primary tablet for a given shard.

Signed-off-by: Arthur Schreiber <[email protected]>
  • Loading branch information
arthurschreiber authored and danieljoos committed Jul 12, 2024
1 parent e2b37fa commit 2a810d1
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 10 deletions.
23 changes: 21 additions & 2 deletions go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"encoding/json"
"fmt"
"hash/crc32"
"math"
"net/http"
"sort"
"strings"
Expand Down Expand Up @@ -448,10 +449,16 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
continue
}
delete(ths, tabletAlias)
// delete from healthy list

healthy, ok := hc.healthy[key]
if ok && len(healthy) > 0 {
hc.recomputeHealthy(key)
if tabletType == topodata.TabletType_PRIMARY {
// If tablet type is primary, we should only have one tablet in the healthy list.
hc.recomputeHealthyPrimary(key)
} else {
// Simply recompute the list of healthy tablets for all other tablet types.
hc.recomputeHealthy(key)
}
}
}
}()
Expand Down Expand Up @@ -593,6 +600,18 @@ func (hc *HealthCheckImpl) recomputeHealthy(key KeyspaceShardTabletType) {
hc.healthy[key] = FilterStatsByReplicationLag(allArray)
}

// Recompute the healthy primary tablet for the given key.
func (hc *HealthCheckImpl) recomputeHealthyPrimary(key KeyspaceShardTabletType) {
highestPrimaryTermStartTime := int64(math.MinInt64)

for _, s := range hc.healthData[key] {
if s.PrimaryTermStartTime >= highestPrimaryTermStartTime {
highestPrimaryTermStartTime = s.PrimaryTermStartTime
hc.healthy[key][0] = s
}
}
}

// Subscribe adds a listener. Used by vtgate buffer to learn about primary changes.
func (hc *HealthCheckImpl) Subscribe() chan *TabletHealth {
hc.subMu.Lock()
Expand Down
26 changes: 18 additions & 8 deletions go/vt/discovery/healthcheck_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,14 @@ func TestRemoveTablet(t *testing.T) {
assert.Empty(t, a, "wrong result, expected empty list")
}

// When an external primary failover is performed,
// the demoted primary will advertise itself as a `PRIMARY`
// tablet until it recognizes that it was demoted,
// and until all in-flight operations have either finished
// (successfully or unsuccessfully, see `--shutdown_grace_period` flag).
//
// During this time, operations like `RemoveTablet` should not lead
// to multiple tablets becoming valid targets for `PRIMARY`.
func TestTabletRemoveDuringExternalReparenting(t *testing.T) {
ctx := utils.LeakCheckContext(t)

Check failure on line 861 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (mysql57)

undefined: utils.LeakCheckContext

Check failure on line 861 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

undefined: utils.LeakCheckContext

Check failure on line 861 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (Race)

undefined: utils.LeakCheckContext

Check failure on line 861 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (mysql80)

undefined: utils.LeakCheckContext

Expand Down Expand Up @@ -890,15 +898,15 @@ func TestTabletRemoveDuringExternalReparenting(t *testing.T) {
<-resultChan
<-resultChan

firstTabletPromotionTime := time.Now().Unix() - 10
firstTabletPrimaryTermStartTimestamp := time.Now().Unix() - 10

firstTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: firstTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,

PrimaryTermStartTimestamp: firstTabletPromotionTime,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
PrimaryTermStartTimestamp: firstTabletPrimaryTermStartTimestamp,

Check failure on line 908 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (mysql57)

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse

Check failure on line 908 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse

Check failure on line 908 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (Race)

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse

Check failure on line 908 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (mysql80)

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
}

secondTabletHealthStream <- &querypb.StreamHealthResponse{
Expand All @@ -923,23 +931,25 @@ func TestTabletRemoveDuringExternalReparenting(t *testing.T) {
<-resultChan
<-resultChan

secondTabletPrimaryTermStartTimestamp := time.Now().Unix()

// Simulate a failover
firstTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: firstTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,

PrimaryTermStartTimestamp: 0,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
PrimaryTermStartTimestamp: firstTabletPrimaryTermStartTimestamp,

Check failure on line 942 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (mysql57)

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse

Check failure on line 942 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Static Code Checks Etc

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse

Check failure on line 942 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (Race)

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse

Check failure on line 942 in go/vt/discovery/healthcheck_test.go

View workflow job for this annotation

GitHub Actions / Unit Test (mysql80)

unknown field PrimaryTermStartTimestamp in struct literal of type query.StreamHealthResponse
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
}

secondTabletHealthStream <- &querypb.StreamHealthResponse{
TabletAlias: secondTablet.Alias,
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,

PrimaryTermStartTimestamp: time.Now().Unix(),
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
PrimaryTermStartTimestamp: secondTabletPrimaryTermStartTimestamp,
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
}

<-resultChan
Expand All @@ -953,7 +963,7 @@ func TestTabletRemoveDuringExternalReparenting(t *testing.T) {
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
Serving: true,
Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
PrimaryTermStartTime: 10,
PrimaryTermStartTime: secondTabletPrimaryTermStartTimestamp,
}}

actualTabletStats := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
Expand Down

0 comments on commit 2a810d1

Please sign in to comment.