From f5011c9491ef86161dc6be165c907cf78d118382 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Fri, 11 Aug 2023 12:07:49 -0700 Subject: [PATCH] importer: only check import *atomicity* in TestImportWorkerFailure Five years ago, in #26881, we changed import to retry on worker failures, which made imports much more resilient to transient failures like nodes going down. As part of this work we created `TestImportWorkerFailure` which shuts down one node during an import, and checks that the import succeeded. Unfortunately, this test was checked-in skipped, because though imports were much more resilient to node failures, they were not completely resilient in every possible scenario, making the test flakey. Two months ago, in #105712, we unskipped this test and discovered that in some cases the import statement succeeded but only imported a partial dataset. This non-atomicity seems like a bigger issue than whether the import is able to succeed in every possible transient failure scenario, and is tracked separately in #108547. This PR changes `TestImportWorkerFailure` to remove successful import as a necessary condition for test success. Instead, the test now only checks whether the import was atomic; that is, whether a successful import imported all data or a failed import imported none. This is more in line with what we can guarantee about imports today. Fixes: #102839 Release note: None --- pkg/sql/importer/import_stmt_test.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pkg/sql/importer/import_stmt_test.go b/pkg/sql/importer/import_stmt_test.go index d8b898dcf05a..d08a7ed1e7d4 100644 --- a/pkg/sql/importer/import_stmt_test.go +++ b/pkg/sql/importer/import_stmt_test.go @@ -5386,6 +5386,10 @@ func TestImportWorkerFailure(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + skip.UnderStressWithIssue(t, 108547, "flaky test") + skip.UnderDeadlockWithIssue(t, 108547, "flaky test") + skip.UnderRaceWithIssue(t, 108547, "flaky test") + allowResponse := make(chan struct{}) params := base.TestClusterArgs{} params.ServerArgs.Knobs.JobsTestingKnobs = jobs.NewTestingKnobsWithShortIntervals() @@ -5433,10 +5437,15 @@ func TestImportWorkerFailure(t *testing.T) { tc.StopServer(1) close(allowResponse) - // We expect the statement to retry since it should have encountered a - // retryable error. + // We expect the IMPORT statement to usually retry since it should have + // encountered a retryable error. We don't currently catch all such retryable + // errors, however, so in some cases the IMPORT statement will return the + // error to the client. In this case we verify that the import was completely + // rolled back. if err := <-errCh; err != nil { - t.Fatal(err) + t.Logf("%s failed, checking that imported data was completely removed: %q", query, err) + sqlDB.CheckQueryResults(t, `SELECT * FROM t ORDER BY i`, [][]string{}) + return } // But the job should be restarted and succeed eventually.