Skip to content
This repository has been archived by the owner on Nov 14, 2024. It is now read-only.

[Timelock Partitioning] Part 32: Client aware event recorders #4263

Merged
merged 4 commits into from
Sep 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,10 @@ public static LocalPaxosServices createInstrumentedLocalServices(

AsyncLeadershipObserver leadershipObserver = AsyncLeadershipObserver.create();
PaxosLeadershipEventRecorder leadershipEventRecorder = PaxosLeadershipEventRecorder.create(
metricsManager.getRegistry(), leaderUuid.toString(), leadershipObserver);
metricsManager.getTaggedRegistry(),
leaderUuid.toString(),
leadershipObserver,
ImmutableList.of());

PaxosAcceptor ourAcceptor = AtlasDbMetrics.instrument(metricsManager.getRegistry(),
PaxosAcceptor.class,
Expand Down
6 changes: 6 additions & 0 deletions changelog/@unreleased/pr-4263.v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
type: break
break:
description: '`PaxosLeadershipEventRecorder` now takes in a `TaggedMetricRegistry`
instead of a `MetricRegistry`. The names of the metrics remain the same.'
links:
- https://github.com/palantir/atlasdb/pull/4263
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,18 @@
*/
package com.palantir.leader;

import java.util.List;

import org.apache.commons.lang3.ArrayUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.palantir.logsafe.SafeArg;
import com.palantir.paxos.PaxosRoundFailureException;
import com.palantir.paxos.PaxosValue;
import com.palantir.tritium.metrics.registry.MetricName;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;

import net.jcip.annotations.ThreadSafe;

Expand All @@ -40,61 +44,75 @@ class LeadershipEvents {
private final Meter leaderPingFailure;
private final Meter leaderPingTimeout;
private final Meter leaderPingReturnedFalse;
private final Object[] contextArgs;

LeadershipEvents(MetricRegistry metrics) {
gainedLeadership = metrics.meter("leadership.gained");
lostLeadership = metrics.meter("leadership.lost");
noQuorum = metrics.meter("leadership.no-quorum");
proposedLeadership = metrics.meter("leadership.proposed");
proposalFailure = metrics.meter("leadership.proposed.failure");
leaderPingFailure = metrics.meter("leadership.ping-leader.failure");
leaderPingTimeout = metrics.meter("leadership.ping-leader.timeout");
leaderPingReturnedFalse = metrics.meter("leadership.ping-leader.returned-false");
LeadershipEvents(TaggedMetricRegistry metrics, List<SafeArg<Object>> contextArgs) {
gainedLeadership = metrics.meter(withName("leadership.gained"));
lostLeadership = metrics.meter(withName("leadership.lost"));
noQuorum = metrics.meter(withName("leadership.no-quorum"));
proposedLeadership = metrics.meter(withName("leadership.proposed"));
proposalFailure = metrics.meter(withName("leadership.proposed.failure"));
leaderPingFailure = metrics.meter(withName("leadership.ping-leader.failure"));
leaderPingTimeout = metrics.meter(withName("leadership.ping-leader.timeout"));
leaderPingReturnedFalse = metrics.meter(withName("leadership.ping-leader.returned-false"));
this.contextArgs = contextArgs.toArray(new Object[0]);
}

void proposedLeadershipFor(long round) {
leaderLog.info("Proposing leadership for {}", SafeArg.of("round", round));
leaderLog.info("Proposing leadership for {}", withContextArgs(SafeArg.of("round", round)));
proposedLeadership.mark();
}

void gainedLeadershipFor(PaxosValue value) {
leaderLog.info("Gained leadership for {}", SafeArg.of("value", value));
leaderLog.info("Gained leadership for {}", withContextArgs(SafeArg.of("value", value)));
gainedLeadership.mark();
}

void lostLeadershipFor(PaxosValue value) {
leaderLog.info("Lost leadership for {}", SafeArg.of("value", value));
leaderLog.info("Lost leadership for {}", withContextArgs(SafeArg.of("value", value)));
lostLeadership.mark();
}

void noQuorum(PaxosValue value) {
leaderLog.warn("The most recent known information says this server is the leader,"
+ " but there is no quorum right now. The paxos value is {}",
SafeArg.of("value", value));
withContextArgs(SafeArg.of("value", value)));
noQuorum.mark();
}

void leaderPingFailure(Throwable error) {
leaderLog.warn("Failed to ping the current leader", error);
leaderLog.warn("Failed to ping the current leader", withContextArgs(error));
leaderPingFailure.mark();
}

void leaderPingTimeout() {
leaderLog.warn("Timed out while attempting to ping the current leader");
leaderLog.warn("Timed out while attempting to ping the current leader", contextArgs);
leaderPingTimeout.mark();
}

void leaderPingReturnedFalse() {
leaderLog.info("We contacted the suspected leader, but it reported that it was no longer leading");
leaderLog.info("We contacted the suspected leader, but it reported that it was no longer leading", contextArgs);
leaderPingReturnedFalse.mark();
}

void proposalFailure(PaxosRoundFailureException paxosException) {
leaderLog.warn("Leadership was not gained.\n"
+ "We should recover automatically. If this recurs often, try to \n"
+ " (1) ensure that most other nodes are reachable over the network, and \n"
+ " (2) increase the randomWaitBeforeProposingLeadershipMs timeout in your configuration.",
paxosException);
+ "We should recover automatically. If this recurs often, try to \n"
+ " (1) ensure that most other nodes are reachable over the network, and \n"
+ " (2) increase the randomWaitBeforeProposingLeadershipMs timeout in your configuration.",
withContextArgs(paxosException));
proposalFailure.mark();
}

/**
 * Combines a per-event log argument with the context args supplied at construction into the
 * single argument array handed to the logger.
 *
 * <p>Ordering matters: message placeholders are filled positionally, so a regular event argument
 * is placed first so that the message's {@code {}} binds to it rather than to a context arg. A
 * {@link Throwable} is instead placed last, which is where SLF4J looks for the exception to log
 * with its stack trace.
 *
 * @param arg the event-specific argument: a log arg for a placeholder, or a throwable
 * @return a fresh array containing {@code arg} and every context arg
 */
private Object[] withContextArgs(Object arg) {
    if (contextArgs.length == 0) {
        return new Object[] { arg };
    }
    if (arg instanceof Throwable) {
        // Keep the throwable in the final slot so it is still treated as the logged exception.
        return ArrayUtils.add(contextArgs, arg);
    }
    // Event arg first so it, not a context arg, fills the message's placeholder.
    return ArrayUtils.addAll(new Object[] { arg }, contextArgs);
}
felixdesouza marked this conversation as resolved.
Show resolved Hide resolved

/**
 * Wraps a plain metric name in a {@link MetricName} carrying only that safe name.
 *
 * @param name the metric's safe name, e.g. {@code leadership.gained}
 * @return a metric name built from {@code name} alone
 */
private static MetricName withName(String name) {
    final MetricName metricName = MetricName.builder()
            .safeName(name)
            .build();
    return metricName;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
*/
package com.palantir.leader;

import java.util.List;
import java.util.Optional;

import javax.annotation.concurrent.GuardedBy;

import com.codahale.metrics.MetricRegistry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.palantir.logsafe.SafeArg;
import com.palantir.paxos.PaxosRoundFailureException;
import com.palantir.paxos.PaxosValue;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;

public class PaxosLeadershipEventRecorder implements PaxosKnowledgeEventRecorder, PaxosLeaderElectionEventRecorder {

Expand All @@ -33,14 +36,17 @@ public class PaxosLeadershipEventRecorder implements PaxosKnowledgeEventRecorder
@GuardedBy("this") private PaxosValue currentRound = null;
@GuardedBy("this") private boolean isLeading = false;

public static PaxosLeadershipEventRecorder create(MetricRegistry metrics, String leaderUuid) {
return create(metrics, leaderUuid, null);
public static PaxosLeadershipEventRecorder create(TaggedMetricRegistry metrics, String leaderUuid) {
return create(metrics, leaderUuid, null, ImmutableList.of());
}

public static PaxosLeadershipEventRecorder create(MetricRegistry metrics,
String leaderUuid, LeadershipObserver observer) {
public static PaxosLeadershipEventRecorder create(
TaggedMetricRegistry metrics,
String leaderUuid,
LeadershipObserver observer,
List<SafeArg<Object>> safeArgs) {
return new PaxosLeadershipEventRecorder(
new LeadershipEvents(metrics),
new LeadershipEvents(metrics, safeArgs),
leaderUuid,
Optional.ofNullable(observer));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ private int lockAndUnlockAndCountExceptions(List<LockService> lockServices, int
for (int i = 0; i < numRequestsPerClient; i++) {
int currentTrial = i;
futures.add(executorService.submit(() ->
lockService.lock(CLIENT_2 + String.valueOf(currentTrial), REQUEST_LOCK_WITH_LONG_TIMEOUT))
lockService.lock(CLIENT_2 + currentTrial, REQUEST_LOCK_WITH_LONG_TIMEOUT))
);
}
}
Expand Down Expand Up @@ -412,20 +412,6 @@ public void throwsOnFastForwardWithIncorrectParameter() throws IOException {
assertThat(response.code()).isEqualTo(HttpStatus.BAD_REQUEST_400);
}

@Test
public void leadershipEventsSmokeTest() throws IOException {
MetricsOutput metrics = getMetricsOutput();

metrics.assertContainsMeter("leadership.gained");
metrics.assertContainsMeter("leadership.lost");
metrics.assertContainsMeter("leadership.proposed");
metrics.assertContainsMeter("leadership.no-quorum");
metrics.assertContainsMeter("leadership.proposed.failure");

assertThat(metrics.getMeter("leadership.gained").get("count").intValue()).isEqualTo(1);
assertThat(metrics.getMeter("leadership.proposed").get("count").intValue()).isEqualTo(1);
}

@Test
// TODO(nziebart): test remote service instrumentation - we need a multi-node server config for this
public void instrumentationSmokeTest() throws IOException {
Expand Down