Skip to content

Commit

Permalink
Merge #26615
Browse files Browse the repository at this point in the history
26615: release-2.0: sql,compactor: rate limit clear range requests r=bdarnell,petermattis a=benesch

Backports #26449.

I'm running a clearrange roachtest with this diff applied tonight. If it passes we're good to go.

```diff
diff --git a/pkg/cmd/roachtest/clearrange.go b/pkg/cmd/roachtest/clearrange.go
index ea5bcdff8..2b244af6d 100644
--- a/pkg/cmd/roachtest/clearrange.go
+++ b/pkg/cmd/roachtest/clearrange.go
@@ -30,19 +30,9 @@ func registerClearRange(r *registry) {
 		// thoroughly brick the cluster.
 		Stable: false,
 		Run: func(ctx context.Context, t *test, c *cluster) {
-			t.Status(`downloading store dumps`)
-			// Created via:
-			// roachtest --cockroach cockroach-v2.0.1 store-gen --stores=10 bank \
-			//           --payload-bytes=10240 --ranges=0 --rows=65104166
-			fixtureURL := `gs://cockroach-fixtures/workload/bank/version=1.0.0,payload-bytes=10240,ranges=0,rows=65104166,seed=1`
-			location := storeDirURL(fixtureURL, c.nodes, "2.0")
+			t.Status(`waiting for compactions to disappear`)
+			time.Sleep(90 * time.Minute)
 
-			// Download this store dump, which measures around 2TB (across all nodes).
-			if err := downloadStoreDumps(ctx, c, location, c.nodes); err != nil {
-				t.Fatal(err)
-			}
-
-			c.Put(ctx, cockroach, "./cockroach")
 			c.Start(ctx)
 
 			// Also restore a much smaller table. We'll use it to run queries against
@@ -81,7 +71,7 @@ func registerClearRange(r *registry) {
 				// above didn't brick the cluster.
 				//
 				// Don't lower this number, or the test may pass erroneously.
-				const minutes = 60
+				const minutes = 120
 				t.WorkerStatus("repeatedly running COUNT(*) on small table")
 				for i := 0; i < minutes; i++ {
 					after := time.After(time.Minute)

```

---

Now that DBCompactRange no longer attempts to compact the entire
database (#26355), sending ClearRange requests in sequential batches of
50is enough to prevent a large DROP TABLE from bricking a cluster.
They're slow enough that the compaction queue can keep up and purge
range deletion tombstones before enough pile up to wedge the cluster.

This is a partial fix for #24029.

Release note (bug fix): The performance impact of dropping a large table
has been substantially reduced.

Co-authored-by: Nikhil Benesch <[email protected]>
  • Loading branch information
craig[bot] and benesch committed Jun 13, 2018
2 parents c9c7a93 + 8e2b677 commit 91715a9
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 39 deletions.
85 changes: 83 additions & 2 deletions pkg/sql/schema_changer.go
Original file line number Diff line number Diff line change
Expand Up @@ -330,13 +330,94 @@ func DropTableDesc(
})
}

// truncateTable deletes all of the data in the specified table.
func (sc *SchemaChanger) truncateTable(
ctx context.Context,
lease *sqlbase.TableDescriptor_SchemaChangeLease,
table *sqlbase.TableDescriptor,
evalCtx *extendedEvalContext,
) error {
// If DropTime isn't set, assume this drop request is from a version
// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
if table.DropTime == 0 {
return truncateTableInChunks(ctx, table, sc.db, false /* traceKV */)
}

tableKey := roachpb.RKey(keys.MakeTablePrefix(uint32(table.ID)))
tableSpan := roachpb.RSpan{Key: tableKey, EndKey: tableKey.PrefixEnd()}

// ClearRange requests lays down RocksDB range deletion tombstones that have
// serious performance implications (#24029). It is crucial that a single
// store never has more than a few dozen tombstones. The logic below attempts
// to bound the number of tombstones in one store by sending the ClearRange
// requests to each range in the table in small, sequential batches every 5s
// rather than letting DistSender send them all in parallel, to hopefully give
// the compaction queue time to compact the range tombstones away in between
// requests.
//
// As written, this approach has several deficiencies. It does not actually
// wait for the compaction queue to compact the tombstones away before sending
// the next request. It is likely insufficient if multiple DROP TABLEs are in
// flight at once. It does not save its progress in case the coordinator goes
// down. These deficiences could be addressed, but this code is only a
// stopgap: we expect that RocksDB tombstones can be made cheap enough that we
// won't need to rate limit ClearRange commands in the near future.

// These numbers were chosen empirically for the clearrange roachtest and
// could certainly use more tuning.
const batchSize = 20
const waitTime = time.Second

var n int
lastKey := tableSpan.Key
ri := kv.NewRangeIterator(sc.execCfg.DistSender)
for ri.Seek(ctx, tableSpan.Key, kv.Ascending); ; ri.Next(ctx) {
if !ri.Valid() {
return ri.Error().GoError()
}

// This call is a no-op unless the lease is nearly expired.
if err := sc.ExtendLease(ctx, lease); err != nil {
return err
}

if n++; n >= batchSize || !ri.NeedAnother(tableSpan) {
endKey := ri.Desc().EndKey
if tableSpan.EndKey.Less(endKey) {
endKey = tableSpan.EndKey
}
var b client.Batch
b.AddRawRequest(&roachpb.ClearRangeRequest{
Span: roachpb.Span{
Key: lastKey.AsRawKey(),
EndKey: endKey.AsRawKey(),
},
})
log.VEventf(ctx, 2, "ClearRange %s - %s", lastKey, endKey)
if err := sc.db.Run(ctx, &b); err != nil {
return err
}
n = 0
lastKey = endKey
time.Sleep(waitTime)
}

if !ri.NeedAnother(tableSpan) {
break
}
}

return nil
}

// maybe Add/Drop a table depending on the state of a table descriptor.
// This method returns true if the table is deleted.
func (sc *SchemaChanger) maybeAddDrop(
ctx context.Context,
inSession bool,
lease *sqlbase.TableDescriptor_SchemaChangeLease,
table *sqlbase.TableDescriptor,
evalCtx *extendedEvalContext,
) (bool, error) {
if table.Dropped() {
if err := sc.ExtendLease(ctx, lease); err != nil {
Expand Down Expand Up @@ -371,7 +452,7 @@ func (sc *SchemaChanger) maybeAddDrop(
}
}
// Do all the hard work of deleting the table data and the table ID.
if err := truncateTableInChunks(ctx, table, sc.db, false /* traceKV */); err != nil {
if err := sc.truncateTable(ctx, lease, table, evalCtx); err != nil {
return false, err
}

Expand Down Expand Up @@ -501,7 +582,7 @@ func (sc *SchemaChanger) exec(
}
}

if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc); err != nil {
if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc, evalCtx); err != nil {
return err
} else if drop {
needRelease = false
Expand Down
44 changes: 8 additions & 36 deletions pkg/sql/tablewriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -922,11 +922,14 @@ func (td *tableDeleter) deleteAllRows(
return td.deleteAllRowsFast(ctx, resume, limit, autoCommit, traceKV)
}

// deleteAllRowsFast employs a ClearRange KV API call to delete the
// underlying data quickly. Unlike DeleteRange, ClearRange doesn't
// leave tombstone data on individual keys, instead using a more
// efficient ranged tombstone, preventing unnecessary write
// amplification.
// deleteAllRowsFast uses the DelRange KV request to delete data quickly,
// relative to deleteAllRowsScan.
//
// Note that this method leaves a RocksDB deletion tombstone on every key in the
// table, resulting in substantial write amplification. When possible, the
// schema changer avoids using a tableDeleter entirely in favor of the
// ClearRange KV request, which uses RocksDB range deletion tombstones to avoid
// write amplification.
func (td *tableDeleter) deleteAllRowsFast(
ctx context.Context, resume roachpb.Span, limit int64, autoCommit autoCommitOpt, traceKV bool,
) (roachpb.Span, error) {
Expand All @@ -940,38 +943,7 @@ func (td *tableDeleter) deleteAllRowsFast(
EndKey: tablePrefix.PrefixEnd(),
}
}
// If DropTime isn't set, assume this drop request is from a version
// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
if td.tableDesc().DropTime == 0 {
return td.legacyDeleteAllRowsFast(ctx, resume, limit, autoCommit, traceKV)
}

log.VEventf(ctx, 2, "ClearRange %s - %s", resume.Key, resume.EndKey)
// ClearRange cannot be run in a transaction, so create a
// non-transactional batch to send the request.
b := &client.Batch{}
// TODO(tschottdorf): this might need a cluster migration.
b.AddRawRequest(&roachpb.ClearRangeRequest{
Span: roachpb.Span{
Key: resume.Key,
EndKey: resume.EndKey,
},
})
if err := td.txn.DB().Run(ctx, b); err != nil {
return resume, err
}
if _, err := td.finalize(ctx, autoCommit, traceKV); err != nil {
return resume, err
}
return roachpb.Span{}, nil
}

// legacyDeleteAllRowsFast handles cases where no GC deadline is set
// and so deletion must fall back to relying on DeleteRange and the
// eventual range GC cycle.
func (td *tableDeleter) legacyDeleteAllRowsFast(
ctx context.Context, resume roachpb.Span, limit int64, autoCommit autoCommitOpt, traceKV bool,
) (roachpb.Span, error) {
log.VEventf(ctx, 2, "DelRange %s - %s", resume.Key, resume.EndKey)
td.b.DelRange(resume.Key, resume.EndKey, false /* returnKeys */)
td.b.Header.MaxSpanRequestKeys = limit
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/compactor/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ var enabled = settings.RegisterBoolSetting(
var minInterval = settings.RegisterDurationSetting(
"compactor.min_interval",
"minimum time interval to wait before compacting",
2*time.Minute,
15*time.Second,
)

// thresholdBytes is the threshold in bytes of suggested
Expand Down

0 comments on commit 91715a9

Please sign in to comment.