sql,compactor: rate limit clear range requests
Now that DBCompactRange no longer attempts to compact the entire
database (cockroachdb#26355), sending ClearRange requests in sequential batches of
50 is enough to prevent a large DROP TABLE from bricking a cluster.
They're slow enough that the compaction queue can keep up and purge
range deletion tombstones before enough pile up to wedge the cluster.

This is a partial fix for cockroachdb#24029.

Release note (bug fix): The performance impact of dropping a large table
has been substantially reduced.
benesch committed Jun 12, 2018
1 parent 59616c0 commit ea2aea5
Showing 3 changed files with 93 additions and 39 deletions.
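
The commit message describes a pacing pattern: walk the table's ranges in order, flush one ClearRange-style request per small batch of ranges, and sleep between steps so the compaction queue can absorb the resulting tombstones. The snippet below sketches that pattern in isolation; sendClearRange and the key boundaries are hypothetical stand-ins, and the real implementation is the truncateTable method added to pkg/sql/schema_changer.go further down.

package main

import (
	"fmt"
	"time"
)

// sendClearRange stands in for issuing a single ClearRange request over
// [start, end). In CockroachDB this would be a raw KV batch; here it only logs.
func sendClearRange(start, end string) {
	fmt.Printf("ClearRange %s - %s\n", start, end)
}

func main() {
	// Hypothetical range boundaries for a tiny table; a real table may have
	// thousands of ranges.
	boundaries := []string{"/Table/51", "/Table/51/1/100", "/Table/51/1/200", "/Table/51/2"}

	const batchSize = 2                     // ranges covered per request (20 in the real code)
	const waitTime = 100 * time.Millisecond // pause per range (1s in the real code)

	n := 0
	lastKey := boundaries[0]
	for i := 1; i < len(boundaries); i++ {
		lastRange := i == len(boundaries)-1
		if n++; n >= batchSize || lastRange {
			// Flush one request covering everything since the last flush.
			sendClearRange(lastKey, boundaries[i])
			n = 0
			lastKey = boundaries[i]
		}
		if lastRange {
			break
		}
		// Pace the walk so tombstones accumulate slowly enough for the
		// compaction queue to keep up.
		time.Sleep(waitTime)
	}
}
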
86 changes: 84 additions & 2 deletions pkg/sql/schema_changer.go
@@ -330,13 +330,95 @@ func DropTableDesc(
})
}

// truncateTable deletes all of the data in the specified table.
func (sc *SchemaChanger) truncateTable(
	ctx context.Context,
	lease *sqlbase.TableDescriptor_SchemaChangeLease,
	table *sqlbase.TableDescriptor,
	evalCtx *extendedEvalContext,
) error {
	// If DropTime isn't set, assume this drop request is from a version
	// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
	if table.DropTime == 0 {
		return truncateTableInChunks(ctx, table, sc.db, false /* traceKV */)
	}

	tableKey := roachpb.RKey(keys.MakeTablePrefix(uint32(table.ID)))
	tableSpan := roachpb.RSpan{Key: tableKey, EndKey: tableKey.PrefixEnd()}

	// ClearRange requests lay down RocksDB range deletion tombstones, which have
	// serious performance implications (#24029). It is crucial that a single
	// store never has more than a few dozen tombstones. The logic below attempts
	// to bound the number of tombstones in one store by sending the ClearRange
	// requests to the ranges in the table in small, sequential batches, spaced
	// out over time, rather than letting DistSender send them all in parallel,
	// to give the compaction queue time to compact the range tombstones away
	// between requests.
	//
	// As written, this approach has several deficiencies. It does not actually
	// wait for the compaction queue to compact the tombstones away before sending
	// the next request. It is likely insufficient if multiple DROP TABLEs are in
	// flight at once. It does not save its progress in case the coordinator goes
	// down. These deficiencies could be addressed, but this code is only a
	// stopgap: we expect that RocksDB tombstones can be made cheap enough that we
	// won't need to rate limit ClearRange commands in the near future.

	// These numbers were chosen empirically for the clearrange roachtest and
	// could certainly use more tuning.
	const batchSize = 20
	const waitTime = time.Second

	var n int
	lastKey := tableSpan.Key
	ri := kv.NewRangeIterator(sc.execCfg.DistSender)
	for ri.Seek(ctx, tableSpan.Key, kv.Ascending); ; ri.Next(ctx) {
		if !ri.Valid() {
			return ri.Error().GoError()
		}

		// This call is a no-op unless the lease is nearly expired.
		if err := sc.ExtendLease(ctx, lease); err != nil {
			return err
		}

		if n++; n >= batchSize || !ri.NeedAnother(tableSpan) {
			endKey := ri.Desc().EndKey
			if tableSpan.EndKey.Less(endKey) {
				endKey = tableSpan.EndKey
			}
			var b client.Batch
			b.AddRawRequest(&roachpb.ClearRangeRequest{
				Span: roachpb.Span{
					Key:    lastKey.AsRawKey(),
					EndKey: endKey.AsRawKey(),
				},
			})
			log.VEventf(ctx, 2, "ClearRange %s - %s", lastKey, endKey)
			if err := sc.db.Run(ctx, &b); err != nil {
				return err
			}
			n = 0
			lastKey = endKey
		}

		if !ri.NeedAnother(tableSpan) {
			break
		}

		time.Sleep(waitTime)
	}

	return nil
}

// maybe Add/Drop a table depending on the state of a table descriptor.
// This method returns true if the table is deleted.
func (sc *SchemaChanger) maybeAddDrop(
ctx context.Context,
inSession bool,
lease *sqlbase.TableDescriptor_SchemaChangeLease,
table *sqlbase.TableDescriptor,
evalCtx *extendedEvalContext,
) (bool, error) {
if table.Dropped() {
if err := sc.ExtendLease(ctx, lease); err != nil {
@@ -371,7 +453,7 @@ func (sc *SchemaChanger) maybeAddDrop(
}
}
// Do all the hard work of deleting the table data and the table ID.
if err := truncateTableInChunks(ctx, table, sc.db, false /* traceKV */); err != nil {
if err := sc.truncateTable(ctx, lease, table, evalCtx); err != nil {
return false, err
}

@@ -501,7 +583,7 @@ func (sc *SchemaChanger) exec(
}
}

if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc); err != nil {
if drop, err := sc.maybeAddDrop(ctx, inSession, &lease, tableDesc, evalCtx); err != nil {
return err
} else if drop {
needRelease = false
44 changes: 8 additions & 36 deletions pkg/sql/tablewriter.go
@@ -922,11 +922,14 @@ func (td *tableDeleter) deleteAllRows(
return td.deleteAllRowsFast(ctx, resume, limit, autoCommit, traceKV)
}

// deleteAllRowsFast employs a ClearRange KV API call to delete the
// underlying data quickly. Unlike DeleteRange, ClearRange doesn't
// leave tombstone data on individual keys, instead using a more
// efficient ranged tombstone, preventing unnecessary write
// amplification.
// deleteAllRowsFast uses the DelRange KV request to delete data quickly,
// relative to deleteAllRowsScan.
//
// Note that this method leaves a RocksDB deletion tombstone on every key in the
// table, resulting in substantial write amplification. When possible, the
// schema changer avoids using a tableDeleter entirely in favor of the
// ClearRange KV request, which uses RocksDB range deletion tombstones to avoid
// write amplification.
func (td *tableDeleter) deleteAllRowsFast(
ctx context.Context, resume roachpb.Span, limit int64, autoCommit autoCommitOpt, traceKV bool,
) (roachpb.Span, error) {
@@ -940,38 +943,7 @@ func (td *tableDeleter) deleteAllRowsFast(
EndKey: tablePrefix.PrefixEnd(),
}
}
// If DropTime isn't set, assume this drop request is from a version
// 1.1 server and invoke legacy code that uses DeleteRange and range GC.
if td.tableDesc().DropTime == 0 {
return td.legacyDeleteAllRowsFast(ctx, resume, limit, autoCommit, traceKV)
}

log.VEventf(ctx, 2, "ClearRange %s - %s", resume.Key, resume.EndKey)
// ClearRange cannot be run in a transaction, so create a
// non-transactional batch to send the request.
b := &client.Batch{}
// TODO(tschottdorf): this might need a cluster migration.
b.AddRawRequest(&roachpb.ClearRangeRequest{
Span: roachpb.Span{
Key: resume.Key,
EndKey: resume.EndKey,
},
})
if err := td.txn.DB().Run(ctx, b); err != nil {
return resume, err
}
if _, err := td.finalize(ctx, autoCommit, traceKV); err != nil {
return resume, err
}
return roachpb.Span{}, nil
}

// legacyDeleteAllRowsFast handles cases where no GC deadline is set
// and so deletion must fall back to relying on DeleteRange and the
// eventual range GC cycle.
func (td *tableDeleter) legacyDeleteAllRowsFast(
ctx context.Context, resume roachpb.Span, limit int64, autoCommit autoCommitOpt, traceKV bool,
) (roachpb.Span, error) {
log.VEventf(ctx, 2, "DelRange %s - %s", resume.Key, resume.EndKey)
td.b.DelRange(resume.Key, resume.EndKey, false /* returnKeys */)
td.b.Header.MaxSpanRequestKeys = limit
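
To make the write-amplification point in the new deleteAllRowsFast comment concrete, here is a toy comparison; the key and range counts are made up. DelRange leaves one point deletion tombstone per key, while ClearRange leaves roughly one RocksDB range deletion tombstone per range the request is split across.

package main

import "fmt"

func main() {
	const keysInTable = 5000000 // hypothetical number of keys in the dropped table
	const rangesInTable = 200   // hypothetical number of ranges backing it

	// DelRange (the tableDeleter path) writes a point deletion tombstone for
	// every key it deletes.
	fmt.Printf("DelRange:   ~%d point tombstones\n", keysInTable)

	// ClearRange (the schema changer path) writes roughly one RocksDB range
	// deletion tombstone per range the request touches.
	fmt.Printf("ClearRange: ~%d range tombstones\n", rangesInTable)
}
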
2 changes: 1 addition & 1 deletion pkg/storage/compactor/settings.go
@@ -53,7 +53,7 @@ var enabled = settings.RegisterBoolSetting(
var minInterval = settings.RegisterDurationSetting(
"compactor.min_interval",
"minimum time interval to wait before compacting",
2*time.Minute,
15*time.Second,
)

// thresholdBytes is the threshold in bytes of suggested
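
The settings change ties into the pacing constants above. A rough, illustrative calculation (the 10,000-range table size is assumed, not taken from the commit): with a one-second sleep per range and a flush every 20 ranges, a batch of range tombstones lands roughly every 20 seconds, so a compactor allowed to run every 15 seconds can plausibly clear each batch before the next one arrives.

package main

import (
	"fmt"
	"time"
)

func main() {
	const (
		ranges      = 10000            // hypothetical table size in ranges
		batchSize   = 20               // ranges per ClearRange flush (schema_changer.go)
		waitTime    = time.Second      // sleep per range while walking the table
		minInterval = 15 * time.Second // new compactor.min_interval default
	)

	// The walk sleeps once per range, so the drop is spread over roughly one
	// second per range, with one batch of tombstones every ~batchSize seconds.
	dropDuration := time.Duration(ranges) * waitTime
	batchInterval := time.Duration(batchSize) * waitTime

	fmt.Printf("paced drop takes roughly %s\n", dropDuration)
	fmt.Printf("a batch of tombstones lands every ~%s\n", batchInterval)
	fmt.Printf("the compactor may run every %s, before the next batch arrives\n", minInterval)
}
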
