Skip to content

Commit

Permalink
VTOrc running PRS when database_instance empty bug fix. (#12019) (#1475)
Browse files Browse the repository at this point in the history
* feat: convert join with database_instance to a left join and prevent fixes from running if the information from database_instance is unavailable

Signed-off-by: Manan Gupta <[email protected]>

* test: add tests to verify the fix works

Signed-off-by: Manan Gupta <[email protected]>

Signed-off-by: Manan Gupta <[email protected]>

Signed-off-by: Manan Gupta <[email protected]>
  • Loading branch information
GuptaManan100 authored Jan 11, 2023
1 parent bf253b0 commit 0598783
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 2 deletions.
9 changes: 9 additions & 0 deletions go/vt/vtorc/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,15 @@ func deployStatements(db *sql.DB, queries []string) error {
return nil
}

// ClearVTOrcDatabase is used to clear the VTOrc database. This function is meant to be used by tests to clear the
// database to get a clean slate without starting a new one.
func ClearVTOrcDatabase() {
db, _, _ := sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile)
if db != nil {
_ = initVTOrcDB(db)
}
}

// initVTOrcDB attempts to create/upgrade the vtorc backend database. It is created once in the
// application's lifetime.
func initVTOrcDB(db *sql.DB) error {
Expand Down
7 changes: 6 additions & 1 deletion go/vt/vtorc/inst/analysis_dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
vitess_keyspace.keyspace_type AS keyspace_type,
vitess_keyspace.durability_policy AS durability_policy,
primary_instance.read_only AS read_only,
MIN(primary_instance.hostname) IS NULL AS is_invalid,
MIN(primary_instance.data_center) AS data_center,
MIN(primary_instance.region) AS region,
MIN(primary_instance.physical_environment) AS physical_environment,
Expand Down Expand Up @@ -288,7 +289,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
JOIN vitess_keyspace ON (
vitess_tablet.keyspace = vitess_keyspace.keyspace
)
JOIN database_instance primary_instance ON (
LEFT JOIN database_instance primary_instance ON (
vitess_tablet.hostname = primary_instance.hostname
AND vitess_tablet.port = primary_instance.port
)
Expand Down Expand Up @@ -468,6 +469,10 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
// We failed to load the durability policy, so we shouldn't run any analysis
return nil
}
isInvalid := m.GetBool("is_invalid")
if isInvalid {
return nil
}
if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 {
a.Analysis = DeadPrimaryWithoutReplicas
a.Description = "Primary cannot be reached by vtorc and has no replica"
Expand Down
140 changes: 139 additions & 1 deletion go/vt/vtorc/inst/analysis_dao_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ import (
"vitess.io/vitess/go/vt/vtorc/test"
)

func TestGetReplicationAnalysis(t *testing.T) {
// TestGetReplicationAnalysisDecision tests the code of GetReplicationAnalysis decision-making. It doesn't check the SQL query
// run by it. It only checks the analysis part after the rows have been read. This tests fakes the db and explicitly returns the
// rows that are specified in the test.
func TestGetReplicationAnalysisDecision(t *testing.T) {
tests := []struct {
name string
info []*test.InfoForRecoveryAnalysis
Expand Down Expand Up @@ -519,10 +522,54 @@ func TestGetReplicationAnalysis(t *testing.T) {
keyspaceWanted: "ks",
shardWanted: "0",
codeWanted: NoProblem,
}, {
// If the database_instance table for a tablet is empty (discovery of MySQL information hasn't happened yet or failed)
// then we shouldn't run a failure fix on it until the discovery succeeds
name: "Empty database_instance table",
info: []*test.InfoForRecoveryAnalysis{{
TabletInfo: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 101},
Hostname: "localhost",
Keyspace: "ks",
Shard: "0",
Type: topodatapb.TabletType_PRIMARY,
MysqlHostname: "localhost",
MysqlPort: 6708,
},
DurabilityPolicy: "semi_sync",
LastCheckValid: 1,
CountReplicas: 4,
CountValidReplicas: 4,
CountValidReplicatingReplicas: 3,
CountValidOracleGTIDReplicas: 4,
CountLoggingReplicas: 2,
IsPrimary: 1,
SemiSyncPrimaryEnabled: 1,
}, {
TabletInfo: &topodatapb.Tablet{
Alias: &topodatapb.TabletAlias{Cell: "zon1", Uid: 100},
Hostname: "localhost",
Keyspace: "ks",
Shard: "0",
Type: topodatapb.TabletType_REPLICA,
MysqlHostname: "localhost",
MysqlPort: 6709,
},
IsInvalid: 1,
DurabilityPolicy: "semi_sync",
}},
keyspaceWanted: "ks",
shardWanted: "0",
codeWanted: NoProblem,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
oldDB := db.Db
defer func() {
db.Db = oldDB
}()

var rowMaps []sqlutils.RowMap
for _, analysis := range tt.info {
analysis.SetValuesFromTabletInfo()
Expand All @@ -547,3 +594,94 @@ func TestGetReplicationAnalysis(t *testing.T) {
})
}
}

// TestGetReplicationAnalysis tests the entire GetReplicationAnalysis. It inserts data into the database and runs the function.
// The database is not faked. This is intended to give more test coverage. This test is more comprehensive but more expensive than TestGetReplicationAnalysisDecision.
// This test is somewhere between a unit test, and an end-to-end test. It is specifically useful for testing situations which are hard to come by in end-to-end test, but require
// real-world data to test specifically.
func TestGetReplicationAnalysis(t *testing.T) {
// The initialSQL is a set of insert commands copied from a dump of an actual running VTOrc instances. The relevant insert commands are here.
// This is a dump taken from a test running 4 tablets, zone1-101 is the primary, zone1-100 is a replica, zone1-112 is a rdonly and zone2-200 is a cross-cell replica.
initialSQL := []string{
`INSERT INTO database_instance VALUES('localhost',6747,'2022-12-28 07:26:04','2022-12-28 07:26:04',213696377,'8.0.31','ROW',1,1,'vt-0000000112-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000112-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-9240-92a06c3be3c2','2022-12-28 07:26:04','',1,0,0,'zone1-0000000112','Homebrew','8.0','FULL',10816929,0,0,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-9240-92a06c3be3c2',1,1,'',1000000000000000000,1,0,0,0,'',0,'','','[]','',0);`,
`INSERT INTO database_instance VALUES('localhost',6711,'2022-12-28 07:26:04','2022-12-28 07:26:04',1094500338,'8.0.31','ROW',1,1,'vt-0000000100-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000100-relay-bin.000002',15815,0,1,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a5138-8680-11ed-acf8-d6b0ef9f4eaa','2022-12-28 07:26:04','',1,0,0,'zone1-0000000100','Homebrew','8.0','FULL',10103920,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a5138-8680-11ed-acf8-d6b0ef9f4eaa',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`,
`INSERT INTO database_instance VALUES('localhost',6714,'2022-12-28 07:26:04','2022-12-28 07:26:04',390954723,'8.0.31','ROW',1,1,'vt-0000000101-bin.000001',15583,'',0,0,0,'',0,'',0,NULL,NULL,0,'','',0,0,'',0,0,0,0,'zone1','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a4cc4-8680-11ed-a104-47706090afbd','2022-12-28 07:26:04','',0,0,0,'zone1-0000000101','Homebrew','8.0','FULL',11366095,1,1,'ON',1,'','','729a4cc4-8680-11ed-a104-47706090afbd',-1,-1,'',1000000000000000000,1,1,0,2,'',0,'','','[]','',0);`,
`INSERT INTO database_instance VALUES('localhost',6756,'2022-12-28 07:26:05','2022-12-28 07:26:05',444286571,'8.0.31','ROW',1,1,'vt-0000000200-bin.000001',15963,'localhost',6714,1,1,'vt-0000000101-bin.000001',15583,'vt-0000000101-bin.000001',15583,0,0,1,'','',1,0,'vt-0000000200-relay-bin.000002',15815,0,1,0,'zone2','',0,0,0,1,'729a4cc4-8680-11ed-a104-47706090afbd:1-54','729a497c-8680-11ed-8ad4-3f51d747db75','2022-12-28 07:26:05','',1,0,0,'zone2-0000000200','Homebrew','8.0','FULL',10443112,0,1,'ON',1,'729a4cc4-8680-11ed-a104-47706090afbd','','729a4cc4-8680-11ed-a104-47706090afbd,729a497c-8680-11ed-8ad4-3f51d747db75',1,1,'',1000000000000000000,1,0,1,0,'',0,'','','[]','',0);`,
`INSERT INTO vitess_tablet VALUES('localhost',6711,'ks','0','zone1',2,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731307d20706f72745f6d61703a7b6b65793a227674222076616c75653a363730397d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363731312064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`,
`INSERT INTO vitess_tablet VALUES('localhost',6714,'ks','0','zone1',1,'2022-12-28 07:23:25.129898+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3130317d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363731337d20706f72745f6d61703a7b6b65793a227674222076616c75653a363731327d206b657973706163653a226b73222073686172643a22302220747970653a5052494d415259206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a36373134207072696d6172795f7465726d5f73746172745f74696d653a7b7365636f6e64733a31363732323132323035206e616e6f7365636f6e64733a3132393839383030307d2064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`,
`INSERT INTO vitess_tablet VALUES('localhost',6747,'ks','0','zone1',3,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653122207569643a3131327d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363734367d20706f72745f6d61703a7b6b65793a227674222076616c75653a363734357d206b657973706163653a226b73222073686172643a22302220747970653a52444f4e4c59206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363734372064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`,
`INSERT INTO vitess_tablet VALUES('localhost',6756,'ks','0','zone2',2,'0001-01-01 00:00:00+00:00',X'616c6961733a7b63656c6c3a227a6f6e653222207569643a3230307d20686f73746e616d653a226c6f63616c686f73742220706f72745f6d61703a7b6b65793a2267727063222076616c75653a363735357d20706f72745f6d61703a7b6b65793a227674222076616c75653a363735347d206b657973706163653a226b73222073686172643a22302220747970653a5245504c494341206d7973716c5f686f73746e616d653a226c6f63616c686f737422206d7973716c5f706f72743a363735362064625f7365727665725f76657273696f6e3a22382e302e3331222064656661756c745f636f6e6e5f636f6c6c6174696f6e3a3435');`,
`INSERT INTO vitess_keyspace VALUES('ks',0,'semi_sync');`,
}

// The test is intended to be used as follows. The initial data is stored into the database. Following this, some specific queries are run that each individual test specifies to get the desired state.
tests := []struct {
name string
sql []string
codeWanted AnalysisCode
shardWanted string
keyspaceWanted string
}{
{
name: "No additions",
sql: nil,
codeWanted: NoProblem,
}, {
name: "Removing Primary Tablet's Vitess record",
sql: []string{
// This query removes the primary tablet's vitess_tablet record
`delete from vitess_tablet where port = 6714`,
},
codeWanted: ClusterHasNoPrimary,
keyspaceWanted: "ks",
shardWanted: "0",
}, {
name: "Removing Primary Tablet's MySQL record",
sql: []string{
// This query removes the primary tablet's database_instance record
`delete from database_instance where port = 6714`,
},
// As long as we have the vitess record stating that this tablet is the primary
// It would be incorrect to run a PRS.
// This situation only happens when we haven't been able to read the MySQL information even once for this tablet.
// So it is likely a new tablet.
codeWanted: NoProblem,
}, {
name: "Removing Replica Tablet's MySQL record",
sql: []string{
// This query removes the replica tablet's database_instance record
`delete from database_instance where port = 6711`,
},
// As long as we don't have the MySQL information, we shouldn't do anything.
// We should wait for the MySQL information to be refreshed once.
// This situation only happens when we haven't been able to read the MySQL information even once for this tablet.
// So it is likely a new tablet.
codeWanted: NoProblem,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Each test should clear the database. The easiest way to do that is to run all the initialization commands again
defer func() {
db.ClearVTOrcDatabase()
}()

for _, query := range append(initialSQL, tt.sql...) {
_, err := db.ExecVTOrc(query)
require.NoError(t, err)
}

got, err := GetReplicationAnalysis("", "", &ReplicationAnalysisHints{})
require.NoError(t, err)
if tt.codeWanted == NoProblem {
require.Len(t, got, 0)
return
}
require.Len(t, got, 1)
require.Equal(t, tt.codeWanted, got[0].Analysis)
require.Equal(t, tt.keyspaceWanted, got[0].AnalyzedKeyspace)
require.Equal(t, tt.shardWanted, got[0].AnalyzedShard)
})
}
}
2 changes: 2 additions & 0 deletions go/vt/vtorc/test/recovery_analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type InfoForRecoveryAnalysis struct {
Shard string
KeyspaceType int
DurabilityPolicy string
IsInvalid int
IsPrimary int
IsCoPrimary int
Hostname string
Expand Down Expand Up @@ -118,6 +119,7 @@ func (info *InfoForRecoveryAnalysis) ConvertToRowMap() sqlutils.RowMap {
rowMap["is_co_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsCoPrimary), Valid: true}
rowMap["is_downtimed"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsDowntimed), Valid: true}
rowMap["is_failing_to_connect_to_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsFailingToConnectToPrimary), Valid: true}
rowMap["is_invalid"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsInvalid), Valid: true}
rowMap["is_last_check_valid"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.LastCheckValid), Valid: true}
rowMap["is_primary"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsPrimary), Valid: true}
rowMap["is_stale_binlog_coordinates"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsStaleBinlogCoordinates), Valid: true}
Expand Down

0 comments on commit 0598783

Please sign in to comment.