diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 3d39fddaaef..f238a392ae7 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -569,6 +569,10 @@ ReplicationSlotCleanup(void) if (!s->in_use) continue; + /* unlocked read of active_pid is ok for debugging purposes */ + elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d", + i, s->active_pid); + SpinLockAcquire(&s->mutex); if (s->active_pid == MyProcPid) { @@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) char path[MAXPGPATH]; char tmppath[MAXPGPATH]; + /* temp debugging aid to analyze 019_replslot_limit failures */ + elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name)); + /* * If some other backend ran this code concurrently with us, we might try * to delete a slot with a certain name while someone else was trying to @@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) path, tmppath))); } + elog(DEBUG3, "replication slot drop: %s: removed on-disk", + NameStr(slot->data.name)); + /* * The slot is definitely gone. Lock out concurrent scans of the array * long enough to kill it. It's OK to clear the active PID here without @@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) * a slot while we're still cleaning up the detritus of the old one. */ LWLockRelease(ReplicationSlotAllocationLock); + + elog(DEBUG3, "replication slot drop: %s: done", + NameStr(slot->data.name)); } /* diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 7b0dea4abec..8f7f1b2f7c3 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode) } return false; } + +/* temp debugging aid to analyze 019_replslot_limit failures */ +int +LWLockHeldCount(void) +{ + return num_held_lwlocks; +} diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index a29fa0b3e6a..86d193c89fc 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg) * them explicitly. */ LockReleaseAll(USER_LOCKMETHOD, true); + + /* + * temp debugging aid to analyze 019_replslot_limit failures + * + * If an error were thrown outside of a transaction nothing up to now + * would have released lwlocks. We probably will add an + * LWLockReleaseAll(). But for now make it easier to understand such cases + * by warning if any lwlocks are held. + */ +#ifdef USE_ASSERT_CHECKING + { + int held_lwlocks = LWLockHeldCount(); + if (held_lwlocks) + elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()", + held_lwlocks); + } +#endif } diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 124977cf7e3..c3d5889d7b2 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val); extern void LWLockReleaseAll(void); extern bool LWLockHeldByMe(LWLock *lock); extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode); +extern int LWLockHeldCount(void); extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval); extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value); diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl index 0c9da9bf272..9bb71b62c06 100644 --- a/src/test/recovery/t/019_replslot_limit.pl +++ b/src/test/recovery/t/019_replslot_limit.pl @@ -335,7 +335,23 @@ $node_primary3->wait_for_catchup($node_standby3); my $senderpid = $node_primary3->safe_psql('postgres', "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'"); -like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"); + +# We've seen occasional cases where multiple walsender pids are active. An +# immediate shutdown may hide evidence of a locking bug. So if multiple +# walsenders are observed, shut down in fast mode, and collect some more +# information. +if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid")) +{ + my ($stdout, $stderr); + $node_primary3->psql('postgres', + "\\a\\t\nSELECT * FROM pg_stat_activity", + stdout => \$stdout, stderr => \$stderr); + diag $stdout, $stderr; + $node_primary3->stop('fast'); + $node_standby3->stop('fast'); + die "could not determine walsender pid, can't continue"; +} + my $receiverpid = $node_standby3->safe_psql('postgres', "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'"); like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");