Skip to content

Commit

Permalink
lease: fix lease retention bug for tables taken offline
Browse files Browse the repository at this point in the history
Tables can temporarily be taken offline, either permanently or
temporarily. For example, executing a DROP TABLE statement will take
a table offline permanently, and an IMPORT INTO will take a table
offline only for the duration of the import and will bring it online
again afterward.

Previously, the lease manager would not distinguish between those two
cases and would behave as if the table was gone forever. As a result any
lease acquired after the table came back online would always be dropped
upon dereferencing. Although this behavior is valid, it is inefficient,
hence this patch.

Fixes cockroachdb#57834.

Release note: None
  • Loading branch information
Marius Posta committed Feb 25, 2021
1 parent 928adc7 commit 54ec551
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 14 deletions.
30 changes: 18 additions & 12 deletions pkg/sql/catalog/lease/lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -550,10 +550,15 @@ type descriptorState struct {
// entry is created with the expiration time of the new lease and
// the older entry is removed.
active descriptorSet
// Indicates that the has been dropped, or is being dropped.

// Indicates that the descriptor has been, or is being, dropped or taken
// offline.
// If set, leases are released from the store as soon as their
// refcount drops to 0, as opposed to waiting until they expire.
dropped bool
// This flag will be unset by any subsequent lease acquisition, which can
// happen after the table came back online again after having been taken
// offline temporarily (as opposed to dropped).
takenOffline bool

// acquisitionsInProgress indicates that at least one caller is currently
// in the process of performing an acquisition. This tracking is critical
Expand Down Expand Up @@ -864,6 +869,7 @@ func acquireNodeLease(ctx context.Context, m *Manager, id descpb.ID) (bool, erro
}
t := m.findDescriptorState(id, false /* create */)
t.mu.Lock()
t.mu.takenOffline = false
defer t.mu.Unlock()
toRelease, err = t.upsertLocked(newCtx, desc)
if err != nil {
Expand Down Expand Up @@ -905,9 +911,9 @@ func (t *descriptorState) release(
// when the refcount drops to 0). If so, we'll need to mark the lease as
// invalid.
removeOnceDereferenced = removeOnceDereferenced ||
// Release from the store if the descriptor has been dropped; no leases
// can be acquired any more.
t.mu.dropped ||
// Release from the store if the descriptor has been dropped or taken
// offline.
t.mu.takenOffline ||
// Release from the store if the lease is not for the latest
// version; only leases for the latest version can be acquired.
s != t.mu.active.findNewest()
Expand Down Expand Up @@ -983,9 +989,9 @@ func purgeOldVersions(
return nil
}

removeInactives := func(drop bool) {
removeInactives := func(takenOffline bool) {
t.mu.Lock()
t.mu.dropped = drop
t.mu.takenOffline = takenOffline
leases := t.removeInactiveVersions()
t.mu.Unlock()
for _, l := range leases {
Expand Down Expand Up @@ -2037,9 +2043,9 @@ func (m *Manager) refreshSomeLeases(ctx context.Context) {
break
}
desc.mu.Lock()
dropped := desc.mu.dropped
takenOffline := desc.mu.takenOffline
desc.mu.Unlock()
if !dropped {
if !takenOffline {
ids = append(ids, k)
}
}
Expand Down Expand Up @@ -2140,7 +2146,7 @@ func (m *Manager) Codec() keys.SQLCodec {
// TODO(ajwerner): consider refactoring the function to take a struct, maybe
// called LeaseInfo.
func (m *Manager) VisitLeases(
f func(desc catalog.Descriptor, dropped bool, refCount int, expiration tree.DTimestamp) (wantMore bool),
f func(desc catalog.Descriptor, takenOffline bool, refCount int, expiration tree.DTimestamp) (wantMore bool),
) {
m.mu.Lock()
defer m.mu.Unlock()
Expand All @@ -2149,7 +2155,7 @@ func (m *Manager) VisitLeases(
ts.mu.Lock()
defer ts.mu.Unlock()

dropped := ts.mu.dropped
takenOffline := ts.mu.takenOffline

for _, state := range ts.mu.active.data {
state.mu.Lock()
Expand All @@ -2161,7 +2167,7 @@ func (m *Manager) VisitLeases(
continue
}

if !f(state.Descriptor, dropped, refCount, lease.expiration) {
if !f(state.Descriptor, takenOffline, refCount, lease.expiration) {
return false
}
}
Expand Down
110 changes: 110 additions & 0 deletions pkg/sql/catalog/lease/lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkeys"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sqltestutils"
"github.com/cockroachdb/cockroach/pkg/sql/tests"
"github.com/cockroachdb/cockroach/pkg/testutils"
Expand Down Expand Up @@ -2447,3 +2449,111 @@ func TestBackoffOnRangefeedFailure(t *testing.T) {
}
require.Greater(t, totalBackoff.Nanoseconds(), (3 * minimumBackoff).Nanoseconds())
}

// TestLeaseWithOfflineTables checks that leases on tables which had
// previously gone offline at some point are not gratuitously dropped.
// See #57834.
func TestLeaseWithOfflineTables(t *testing.T) {
defer leaktest.AfterTest(t)()

var descID uint32
testTableID := func() descpb.ID {
return descpb.ID(atomic.LoadUint32(&descID))
}

var lmKnobs lease.ManagerTestingKnobs
blockDescRefreshed := make(chan struct{}, 1)
lmKnobs.TestingDescriptorRefreshedEvent = func(desc *descpb.Descriptor) {
t := descpb.TableFromDescriptor(desc, hlc.Timestamp{})
if t != nil && testTableID() == t.ID {
blockDescRefreshed <- struct{}{}
}
}

ctx := context.Background()
params, _ := tests.CreateTestServerParams()
params.Knobs.SQLLeaseManager = &lmKnobs
s, db, kvDB := serverutils.StartServer(t, params)
defer s.Stopper().Stop(ctx)
runner := sqlutils.MakeSQLRunner(db)

// This statement prevents timer issues due to periodic lease refreshing.
_, err := db.Exec(`
SET CLUSTER SETTING sql.tablecache.lease.refresh_limit = 0;
`)
require.NoError(t, err)

_, err = db.Exec(`
CREATE DATABASE t;
CREATE TABLE t.test(s STRING PRIMARY KEY);
`)
require.NoError(t, err)

desc := catalogkv.TestingGetTableDescriptor(kvDB, keys.SystemSQLCodec, "t", "test")
atomic.StoreUint32(&descID, uint32(desc.ID))

// Sets table descriptor state and waits for that change to propagate to the
// lease manager's refresh worker.
setTableState := func(expected descpb.DescriptorState, next descpb.DescriptorState) {
err := descs.Txn(
ctx, s.ClusterSettings(),
s.LeaseManager().(*lease.Manager),
s.InternalExecutor().(*sql.InternalExecutor),
kvDB,
func(ctx context.Context, txn *kv.Txn, descsCol *descs.Collection) error {
desc, err := descsCol.GetMutableTableVersionByID(ctx, testTableID(), txn)
require.NoError(t, err)
require.Equal(t, desc.State, expected)
desc.State = next
return descsCol.WriteDesc(ctx, false /* kvTrace */, desc, txn)
},
)
require.NoError(t, err)
// Wait for the lease manager's refresh worker to have processed the
// descriptor update.
<-blockDescRefreshed
}

// Checks that the lease manager state for `t.test` matches expectations.
checkLeaseState := func(shouldBePresent bool) {
var found bool
var wasTakenOffline bool
fn := func(desc catalog.Descriptor, takenOffline bool, _ int, _ tree.DTimestamp) bool {
if testTableID() != desc.GetID() {
return true
}
wasTakenOffline = takenOffline
found = true
return false
}
s.LeaseManager().(*lease.Manager).VisitLeases(fn)
if found && !wasTakenOffline {
require.Truef(t, shouldBePresent, "lease should not have been present but was")
} else if found {
require.Falsef(t, shouldBePresent, "lease should have been present but was marked as taken offline")
} else {
require.Falsef(t, shouldBePresent, "lease should have been present but wasn't")
}
}

// Check initial state.
checkLeaseState(false /* shouldBePresent */)

// Query the table, this should trigger a lease acquisition.
runner.CheckQueryResults(t, "SELECT s FROM t.test", [][]string{})
checkLeaseState(true /* shouldBePresent */)

// Take the table offline and back online again.
// This should relinquish the lease.
setTableState(descpb.DescriptorState_PUBLIC, descpb.DescriptorState_OFFLINE)
setTableState(descpb.DescriptorState_OFFLINE, descpb.DescriptorState_PUBLIC)
checkLeaseState(false /* shouldBePresent */)

// Query the table, thereby acquiring a lease once again.
runner.CheckQueryResults(t, "SELECT s FROM t.test", [][]string{})
checkLeaseState(true /* shouldBePresent */)

// Do a no-op descriptor update, lease should still be present.
setTableState(descpb.DescriptorState_PUBLIC, descpb.DescriptorState_PUBLIC)
checkLeaseState(true /* shouldBePresent */)
}
4 changes: 2 additions & 2 deletions pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ CREATE TABLE crdb_internal.leases (
ctx context.Context, p *planner, _ *dbdesc.Immutable, addRow func(...tree.Datum) error,
) (err error) {
nodeID, _ := p.execCfg.NodeID.OptionalNodeID() // zero if not available
p.LeaseMgr().VisitLeases(func(desc catalog.Descriptor, dropped bool, _ int, expiration tree.DTimestamp) (wantMore bool) {
p.LeaseMgr().VisitLeases(func(desc catalog.Descriptor, takenOffline bool, _ int, expiration tree.DTimestamp) (wantMore bool) {
if p.CheckAnyPrivilege(ctx, desc) != nil {
// TODO(ajwerner): inspect what type of error got returned.
return true
Expand All @@ -515,7 +515,7 @@ CREATE TABLE crdb_internal.leases (
tree.NewDString(desc.GetName()),
tree.NewDInt(tree.DInt(int64(desc.GetParentID()))),
&expiration,
tree.MakeDBool(tree.DBool(dropped)),
tree.MakeDBool(tree.DBool(takenOffline)),
)
return err == nil
})
Expand Down

0 comments on commit 54ec551

Please sign in to comment.