Skip to content

Commit

Permalink
[fix](heartbeat) fix heartbeat editlog no persist hbTime (apache#42653)
Browse files Browse the repository at this point in the history
Backend persists lastUpdateMs, and this value is modified by the heartbeat editlog.
But the heartbeat editlog did not persist hbTime, so hbTime was always 0, which
made the backend's lastUpdateMs = 0 in the bdb image.

fix details:
1. the heartbeat response now persists hbTime;
2. previously, only a BE state change would write an editlog. This is changed:
even if a backend is healthy, a healthy-response editlog is still written every
5 min, in order to keep the backend's lastUpdateMs periodically updated in the bdb
image. Note that this change does not increase the real number of editlogs,
because the heartbeat mgr batches all FE/BE heartbeats into one editlog.
Even when no FE/BE state changes, it still writes an editlog that does not contain
any node's response.
3. for a dead heartbeat response, hbTime is set to the last successful hbTime, so
that the replayer can set the correct lastUpdateMs;
  • Loading branch information
yujun777 authored Oct 31, 2024
1 parent 4494b9c commit ac6a868
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2033,6 +2033,12 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;

/**
* Interval, in seconds, after which a heartbeat editlog is written for a backend even if it is healthy,
* so that the backend's lastUpdateMs in the bdb image keeps being updated.
* The default is 300 seconds. Backend adds a random delay of up to 60 seconds on top of this value
* when it schedules the next forced editlog.
*/
@ConfField(mutable = true, masterOnly = true)
public static int editlog_healthy_heartbeat_seconds = 300;

/**
* Abort transaction time after lost heartbeat.
* The default value is 300s, which means transactions of be will be aborted after lost heartbeat 300s.
Expand Down
14 changes: 14 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
Expand Down Expand Up @@ -153,6 +154,8 @@ public class Backend implements Writable {
// send some queries to this BE, it is not an important problem.
private AtomicBoolean isShutDown = new AtomicBoolean(false);

// Time (as returned by System.currentTimeMillis()) after which the next healthy heartbeat is
// reported as changed, so that an editlog is written even though the backend state did not change.
// The first deadline is offset by a random delay of up to 60 seconds.
// NOTE(review): presumably the random offset keeps backends from all forcing an editlog together - confirm.
private long nextForceEditlogHeartbeatTime = System.currentTimeMillis() + (new SecureRandom()).nextInt(60 * 1000);

public Backend() {
this.host = "";
this.version = "";
Expand Down Expand Up @@ -876,7 +879,18 @@ public boolean handleHbResponse(BackendHbResponse hbResponse, boolean isReplay)

heartbeatErrMsg = "";
this.heartbeatFailureCounter = 0;

// even if nothing changed, periodically write an editlog so that lastUpdateMs in the image gets updated
if (System.currentTimeMillis() >= this.nextForceEditlogHeartbeatTime) {
isChanged = true;
int delaySecond = Config.editlog_healthy_heartbeat_seconds + (new SecureRandom()).nextInt(60);
this.nextForceEditlogHeartbeatTime = System.currentTimeMillis() + delaySecond * 1000L;
}
} else {
// for a bad BackendHbResponse, its hbTime is the last successful hbTime, not the time of this heartbeat
if (hbResponse.getHbTime() > 0) {
this.lastUpdateMs = hbResponse.getHbTime();
}
// Only set backend to dead if the heartbeat failure counter exceed threshold.
// And if it is a replay process, must set backend to dead.
if (isReplay || ++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,12 @@ public BackendHbResponse(long beId, int bePort, int httpPort, int brpcPort, long
this.beMemory = beMemory;
}

/**
* Creates a backend heartbeat response with status BAD that carries only
* the backend id and an error message.
*
* @param beId id of the backend this response belongs to
* @param errMsg error message stored in msg
*/
public BackendHbResponse(long beId, String errMsg) {
super(HeartbeatResponse.Type.BACKEND);
this.status = HbStatus.BAD;
this.beId = beId;
this.msg = errMsg;
}

public BackendHbResponse(long beId, String host, String errMsg) {
/**
* Creates a backend heartbeat response with status BAD.
*
* @param beId id of the backend this response belongs to
* @param host host of the backend
* @param lastHbTime value stored in hbTime; callers pass the backend's last update time
* @param errMsg error message stored in msg
*/
public BackendHbResponse(long beId, String host, long lastHbTime, String errMsg) {
super(HeartbeatResponse.Type.BACKEND);
this.status = HbStatus.BAD;
this.beId = beId;
this.host = host;
// a bad response carries the last successful heartbeat time, not the time of this failed attempt
this.hbTime = lastHbTime;
this.msg = errMsg;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -315,13 +315,13 @@ public HeartbeatResponse call() {
System.currentTimeMillis(), beStartTime, version, nodeRole,
fragmentNum, lastFragmentUpdateTime, isShutDown, arrowFlightSqlPort, beMemory);
} else {
return new BackendHbResponse(backendId, backend.getHost(),
return new BackendHbResponse(backendId, backend.getHost(), backend.getLastUpdateMs(),
result.getStatus().getErrorMsgs().isEmpty()
? "Unknown error" : result.getStatus().getErrorMsgs().get(0));
}
} catch (Exception e) {
LOG.warn("backend heartbeat got exception", e);
return new BackendHbResponse(backendId, backend.getHost(),
return new BackendHbResponse(backendId, backend.getHost(), backend.getLastUpdateMs(),
Strings.isNullOrEmpty(e.getMessage()) ? "got exception" : e.getMessage());
} finally {
if (client != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ public enum HbStatus {
protected boolean isTypeRead = false;

/**
* msg and hbTime are no need to be synchronized to other Frontends,
* msg does not need to be synchronized to other Frontends;
* only the Master Frontend has this info
*/
protected String msg;

// Heartbeat time; serialized under the name "hbTime" so that it is included when the response is persisted.
@SerializedName(value = "hbTime")
protected long hbTime;

public HeartbeatResponse(Type type) {
Expand Down

0 comments on commit ac6a868

Please sign in to comment.