sql,compactor: rate limit clear range requests
Now that DBCompactRange no longer attempts to compact the entire
database (cockroachdb#26355), sending ClearRange requests in sequential batches of
50 is enough to prevent a large DROP TABLE from bricking a cluster.
They're slow enough that the compaction queue can keep up and purge
range deletion tombstones before enough pile up to wedge the cluster.

This is a partial fix for cockroachdb#24029.

Release note (bug fix): The performance impact of dropping a large table
has been substantially reduced.
benesch committed Jun 12, 2018
1 parent 59616c0 commit ea2aea5
Showing 3 changed files with 93 additions and 39 deletions.
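
The commit message describes a pacing pattern: walk the table's ranges in order, flush one ClearRange-style request per small batch of ranges, and sleep between steps so the compaction queue can absorb the resulting tombstones. The snippet below sketches that pattern in isolation; sendClearRange and the key boundaries are hypothetical stand-ins, and the real implementation is the truncateTable method added to pkg/sql/schema_changer.go further down.

package main

import (
	"fmt"
	"time"
)

// sendClearRange stands in for issuing a single ClearRange request over
// [start, end). In CockroachDB this would be a raw KV batch; here it only logs.
func sendClearRange(start, end string) {
	fmt.Printf("ClearRange %s - %s\n", start, end)
}

func main() {
	// Hypothetical range boundaries for a tiny table; a real table may have
	// thousands of ranges.
	boundaries := []string{"/Table/51", "/Table/51/1/100", "/Table/51/1/200", "/Table/51/2"}

	const batchSize = 2                     // ranges covered per request (20 in the real code)
	const waitTime = 100 * time.Millisecond // pause per range (1s in the real code)

	n := 0
	lastKey := boundaries[0]
	for i := 1; i < len(boundaries); i++ {
		lastRange := i == len(boundaries)-1
		if n++; n >= batchSize || lastRange {
			// Flush one request covering everything since the last flush.
			sendClearRange(lastKey, boundaries[i])
			n = 0
			lastKey = boundaries[i]
		}
		if lastRange {
			break
		}
		// Pace the walk so tombstones accumulate slowly enough for the
		// compaction queue to keep up.
		time.Sleep(waitTime)
	}
}
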
86 changes: 84 additions & 2 deletions pkg/sql/schema_changer.go
@@ -330,13 +330,95 @@ func DropTableDesc(
})
}

// truncateTable deletes all of the data in the specified table.
func (sc *SchemaChanger) truncateTable(
	ctx context.Context,
	lease *sqlbase.TableDescriptor_SchemaChangeLease,
	table *sqlbase.TableDescriptor,
	evalCtx *extendedEvalContext,
) error {
	// If DropTime isn't set, assume this drop request is from a version
	// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
	if table.DropTime == 0 {
		return truncateTableInChunks(ctx, table, sc.db, false /* traceKV */)
	}

	tableKey := roachpb.RKey(keys.MakeTablePrefix(uint32(table.ID)))
	tableSpan := roachpb.RSpan{Key: tableKey, EndKey: tableKey.PrefixEnd()}

	// ClearRange requests lay down RocksDB range deletion tombstones, which have
	// serious performance implications (#24029). It is crucial that a single
	// store never has more than a few dozen tombstones. The logic below attempts
	// to bound the number of tombstones in one store by sending the ClearRange
	// requests to the ranges in the table in small, sequential batches, spaced
	// out over time, rather than letting DistSender send them all in parallel,
	// to give the compaction queue time to compact the range tombstones away
	// between requests.
	//
	// As written, this approach has several deficiencies. It does not actually
	// wait for the compaction queue to compact the tombstones away before sending
	// the next request. It is likely insufficient if multiple DROP TABLEs are in
	// flight at once. It does not save its progress in case the coordinator goes
	// down. These deficiencies could be addressed, but this code is only a
	// stopgap: we expect that RocksDB tombstones can be made cheap enough that we
	// won't need to rate limit ClearRange commands in the near future.

	// These numbers were chosen empirically for the clearrange roachtest and
	// could certainly use more tuning.
	const batchSize = 20
	const waitTime = time.Second

	var n int
	lastKey := tableSpan.Key
	ri := kv.NewRangeIterator(sc.execCfg.DistSender)
	for ri.Seek(ctx, tableSpan.Key, kv.Ascending); ; ri.Next(ctx) {
		if !ri.Valid() {
			return ri.Error().GoError()
		}

		// This call is a no-op unless the lease is nearly expired.
		if err := sc.ExtendLease(ctx, lease); err != nil {
			return err
		}

		if n++; n >= batchSize || !ri.NeedAnother(tableSpan) {
			endKey := ri.Desc().EndKey
			if tableSpan.EndKey.Less(endKey) {
				endKey = tableSpan.EndKey
			}
			var b client.Batch
			b.AddRawRequest(&roachpb.ClearRangeRequest{
				Span: roachpb.Span{
					Key:    lastKey.AsRawKey(),
					EndKey: endKey.AsRawKey(),
				},
			})
			log.VEventf(ctx, 2, "ClearRange %s - %s", lastKey, endKey)
			if err := sc.db.Run(ctx, &b); err != nil {
				return err
			}
			n = 0
			lastKey = endKey
		}

		if !ri.NeedAnother(tableSpan) {
			break
		}

		time.Sleep(waitTime)
	}

	return nil
}

// maybe Add/Drop a table depending on the state of a table descriptor.
// This method returns true if the table is deleted.
func (sc *SchemaChanger) maybeAddDrop(
ctx context.Context,
inSession bool,
lease *sqlbase.TableDescriptor_SchemaChangeLease,
table *sqlbase.TableDescriptor,
evalCtx *extendedEvalContext,
) (bool, error) {
if table.Dropped() {
if err := sc.ExtendLease(ctx, lease); err != nil {
@@ -371,7 +453,7 @@ func (sc *SchemaChanger) maybeAddDrop(
}
}
// Do all the hard work of deleting the table data and the table ID.
if err := truncateTableInChunks(ctx, table, sc.db, false /* traceKV */); err != nil {
if err := sc.truncateTable(ctx, lease, table, evalCtx); err != nil {
return false, err
}

@@ -501,7 +583,7 @@ func (sc *SchemaChanger) exec(
}
}

if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc); err != nil {
if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc, evalCtx); err != nil {
return err
} else if drop {
needRelease = false
44 changes: 8 additions & 36 deletions pkg/sql/tablewriter.go
@@ -922,11 +922,14 @@ func (td *tableDeleter) deleteAllRows(
return td.deleteAllRowsFast(ctx, resume, limit, autoCommit, traceKV)
}

// deleteAllRowsFast employs a ClearRange KV API call to delete the
// underlying data quickly. Unlike DeleteRange, ClearRange doesn't
// leave tombstone data on individual keys, instead using a more
// efficient ranged tombstone, preventing unnecessary write
// amplification.
// deleteAllRowsFast uses the DelRange KV request to delete data quickly,
// relative to deleteAllRowsScan.
//
// Note that this method leaves a RocksDB deletion tombstone on every key in the
// table, resulting in substantial write amplification. When possible, the
// schema changer avoids using a tableDeleter entirely in favor of the
// ClearRange KV request, which uses RocksDB range deletion tombstones to avoid
// write amplification.
func (td *tableDeleter) deleteAllRowsFast(
ctx context.Context, resume roachpb.Span, limit int64, autoCommit autoCommitOpt, traceKV bool,
) (roachpb.Span, error) {
@@ -940,38 +943,7 @@ func (td *tableDeleter) deleteAllRowsFast(
EndKey: tablePrefix.PrefixEnd(),
}
}
// If DropTime isn't set, assume this drop request is from a version
// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
if td.tableDesc().DropTime == 0 {
return td.legacyDeleteAllRowsFast(ctx, resume, limit, autoCommit, traceKV)
}

log.VEventf(ctx, 2, "ClearRange %s - %s", resume.Key, resume.EndKey)
// ClearRange cannot be run in a transaction, so create a
// non-transactional batch to send the request.
b := &client.Batch{}
// TODO(tschottdorf): this might need a cluster migration.
b.AddRawRequest(&roachpb.ClearRangeRequest{
Span: roachpb.Span{
Key: resume.Key,
EndKey: resume.EndKey,
},
})
if err := td.txn.DB().Run(ctx, b); err != nil {
return resume, err
}
if _, err := td.finalize(ctx, autoCommit, traceKV); err != nil {
return resume, err
}
return roachpb.Span{}, nil
}

// legacyDeleteAllRowsFast handles cases where no GC deadline is set
// and so deletion must fall back to relying on DeleteRange and the
// eventual range GC cycle.
func (td *tableDeleter) legacyDeleteAllRowsFast(
ctx context.Context, resume roachpb.Span, limit int64, autoCommit autoCommitOpt, traceKV bool,
) (roachpb.Span, error) {
log.VEventf(ctx, 2, "DelRange %s - %s", resume.Key, resume.EndKey)
td.b.DelRange(resume.Key, resume.EndKey, false /* returnKeys */)
td.b.Header.MaxSpanRequestKeys = limit
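
To make the write-amplification point in the new deleteAllRowsFast comment concrete, here is a toy comparison; the key and range counts are made up. DelRange leaves one point deletion tombstone per key, while ClearRange leaves roughly one RocksDB range deletion tombstone per range the request is split across.

package main

import "fmt"

func main() {
	const keysInTable = 5000000 // hypothetical number of keys in the dropped table
	const rangesInTable = 200   // hypothetical number of ranges backing it

	// DelRange (the tableDeleter path) writes a point deletion tombstone for
	// every key it deletes.
	fmt.Printf("DelRange:   ~%d point tombstones\n", keysInTable)

	// ClearRange (the schema changer path) writes roughly one RocksDB range
	// deletion tombstone per range the request touches.
	fmt.Printf("ClearRange: ~%d range tombstones\n", rangesInTable)
}
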
2 changes: 1 addition & 1 deletion pkg/storage/compactor/settings.go
@@ -53,7 +53,7 @@ var enabled = settings.RegisterBoolSetting(
var minInterval = settings.RegisterDurationSetting(
"compactor.min_interval",
"minimum time interval to wait before compacting",
2*time.Minute,
15*time.Second,
)

// thresholdBytes is the threshold in bytes of suggested
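
The settings change ties into the pacing constants above. A rough, illustrative calculation (the 10,000-range table size is assumed, not taken from the commit): with a one-second sleep per range and a flush every 20 ranges, a batch of range tombstones lands roughly every 20 seconds, so a compactor allowed to run every 15 seconds can plausibly clear each batch before the next one arrives.

package main

import (
	"fmt"
	"time"
)

func main() {
	const (
		ranges      = 10000            // hypothetical table size in ranges
		batchSize   = 20               // ranges per ClearRange flush (schema_changer.go)
		waitTime    = time.Second      // sleep per range while walking the table
		minInterval = 15 * time.Second // new compactor.min_interval default
	)

	// The walk sleeps once per range, so the drop is spread over roughly one
	// second per range, with one batch of tombstones every ~batchSize seconds.
	dropDuration := time.Duration(ranges) * waitTime
	batchInterval := time.Duration(batchSize) * waitTime

	fmt.Printf("paced drop takes roughly %s\n", dropDuration)
	fmt.Printf("a batch of tombstones lands every ~%s\n", batchInterval)
	fmt.Printf("the compactor may run every %s, before the next batch arrives\n", minInterval)
}
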
