Skip to content

Commit

Permalink
Add further debug info to help debug 019_replslot_limit.pl failures.
Browse files Browse the repository at this point in the history
See also afdeff1. Failures after that commit provided a few more hints,
but not yet enough to understand what's going on.

In 019_replslot_limit.pl, shut down nodes with fast instead of immediate mode
if we observe the failure mode. That should tell us whether the failures we're
observing are just timing issues under high load. PGCTLTIMEOUT should prevent
buildfarm animals from hanging endlessly.

Also adds a bit more logging to replication slot drop and ShutdownPostgres().

Discussion: https://postgr.es/m/[email protected]
  • Loading branch information
anarazel committed Feb 26, 2022
1 parent 638300f commit fe0972e
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 1 deletion.
13 changes: 13 additions & 0 deletions src/backend/replication/slot.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,10 @@ ReplicationSlotCleanup(void)
if (!s->in_use)
continue;

/* unlocked read of active_pid is ok for debugging purposes */
elog(DEBUG3, "temporary replication slot cleanup: %d in use, active_pid: %d",
i, s->active_pid);

SpinLockAcquire(&s->mutex);
if (s->active_pid == MyProcPid)
{
Expand Down Expand Up @@ -629,6 +633,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
char path[MAXPGPATH];
char tmppath[MAXPGPATH];

/* temp debugging aid to analyze 019_replslot_limit failures */
elog(DEBUG3, "replication slot drop: %s: begin", NameStr(slot->data.name));

/*
* If some other backend ran this code concurrently with us, we might try
* to delete a slot with a certain name while someone else was trying to
Expand Down Expand Up @@ -679,6 +686,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
path, tmppath)));
}

elog(DEBUG3, "replication slot drop: %s: removed on-disk",
NameStr(slot->data.name));

/*
* The slot is definitely gone. Lock out concurrent scans of the array
* long enough to kill it. It's OK to clear the active PID here without
Expand Down Expand Up @@ -734,6 +744,9 @@ ReplicationSlotDropPtr(ReplicationSlot *slot)
* a slot while we're still cleaning up the detritus of the old one.
*/
LWLockRelease(ReplicationSlotAllocationLock);

elog(DEBUG3, "replication slot drop: %s: done",
NameStr(slot->data.name));
}

/*
Expand Down
7 changes: 7 additions & 0 deletions src/backend/storage/lmgr/lwlock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1945,3 +1945,10 @@ LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
}
return false;
}

/*
 * Return the current count of held lwlocks, as tracked in
 * num_held_lwlocks.
 *
 * temp debugging aid to analyze 019_replslot_limit failures
 */
int
LWLockHeldCount(void)
{
	int			nheld;

	nheld = num_held_lwlocks;
	return nheld;
}
17 changes: 17 additions & 0 deletions src/backend/utils/init/postinit.c
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,23 @@ ShutdownPostgres(int code, Datum arg)
* them explicitly.
*/
LockReleaseAll(USER_LOCKMETHOD, true);

/*
* temp debugging aid to analyze 019_replslot_limit failures
*
* If an error were thrown outside of a transaction nothing up to now
* would have released lwlocks. We probably will add an
* LWLockReleaseAll(). But for now make it easier to understand such cases
* by warning if any lwlocks are held.
*/
#ifdef USE_ASSERT_CHECKING
{
int held_lwlocks = LWLockHeldCount();
if (held_lwlocks)
elog(WARNING, "holding %d lwlocks at the end of ShutdownPostgres()",
held_lwlocks);
}
#endif
}


Expand Down
1 change: 1 addition & 0 deletions src/include/storage/lwlock.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
extern void LWLockReleaseAll(void);
extern bool LWLockHeldByMe(LWLock *lock);
extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
extern int LWLockHeldCount(void);

extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value);
Expand Down
18 changes: 17 additions & 1 deletion src/test/recovery/t/019_replslot_limit.pl
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,23 @@
$node_primary3->wait_for_catchup($node_standby3);
my $senderpid = $node_primary3->safe_psql('postgres',
"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");

# We've seen occasional cases where multiple walsender pids are active. An
# immediate shutdown may hide evidence of a locking bug. So if multiple
# walsenders are observed, shut down in fast mode, and collect some more
# information.
if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"))
{
my ($stdout, $stderr);
$node_primary3->psql('postgres',
"\\a\\t\nSELECT * FROM pg_stat_activity",
stdout => \$stdout, stderr => \$stderr);
diag $stdout, $stderr;
$node_primary3->stop('fast');
$node_standby3->stop('fast');
die "could not determine walsender pid, can't continue";
}

my $receiverpid = $node_standby3->safe_psql('postgres',
"SELECT pid FROM pg_stat_activity WHERE backend_type = 'walreceiver'");
like($receiverpid, qr/^[0-9]+$/, "have walreceiver pid $receiverpid");
Expand Down

0 comments on commit fe0972e

Please sign in to comment.