Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

branch-3.0: [fix](heartbeat) fix heartbeat editlog no persist hbTime #42986

Merged
merged 1 commit into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1996,6 +1996,12 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, masterOnly = true)
public static long max_backend_heartbeat_failure_tolerance_count = 1;

/**
* Interval, in seconds, after which a heartbeat editlog is written for a backend even if the backend is
* healthy and nothing else changed, so that the backend's lastUpdateMs in the bdb image gets updated.
* A random extra delay of less than 60 seconds is added each time the next forced write is scheduled.
*/
@ConfField(mutable = true, masterOnly = true)
public static int editlog_healthy_heartbeat_seconds = 300;

/**
* Abort transaction time after lost heartbeat.
* The default value is 300s, which means transactions of be will be aborted after lost heartbeat 300s.
Expand Down
14 changes: 14 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
Expand Down Expand Up @@ -153,6 +154,8 @@ public class Backend implements Writable {
// send some queries to this BE, it is not an important problem.
private AtomicBoolean isShutDown = new AtomicBoolean(false);

// Wall-clock time (ms) at or after which a successful heartbeat is marked as changed even if nothing
// else changed, forcing an editlog write. The initial value is "now" plus a random offset of less than
// 60 seconds.
private long nextForceEditlogHeartbeatTime = System.currentTimeMillis() + (new SecureRandom()).nextInt(60 * 1000);

public Backend() {
this.host = "";
this.version = "";
Expand Down Expand Up @@ -881,7 +884,18 @@ public boolean handleHbResponse(BackendHbResponse hbResponse, boolean isReplay)

heartbeatErrMsg = "";
this.heartbeatFailureCounter = 0;

// Even if nothing changed, periodically write an editlog so that lastUpdateMs in the image gets updated.
if (System.currentTimeMillis() >= this.nextForceEditlogHeartbeatTime) {
isChanged = true;
int delaySecond = Config.editlog_healthy_heartbeat_seconds + (new SecureRandom()).nextInt(60);
this.nextForceEditlogHeartbeatTime = System.currentTimeMillis() + delaySecond * 1000L;
}
} else {
// For a bad BackendHbResponse, hbTime is the time of the last successful heartbeat, not of this one.
if (hbResponse.getHbTime() > 0) {
this.lastUpdateMs = hbResponse.getHbTime();
}
// Only set backend to dead if the heartbeat failure counter exceed threshold.
// And if it is a replay process, must set backend to dead.
if (isReplay || ++this.heartbeatFailureCounter >= Config.max_backend_heartbeat_failure_tolerance_count) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,12 @@ public BackendHbResponse(long beId, int bePort, int httpPort, int brpcPort, long
this.beMemory = beMemory;
}

/**
* Builds a BAD-status backend heartbeat response from a backend id and an error message only.
* This overload assigns neither host nor hbTime.
*
* @param beId id of the backend the response belongs to
* @param errMsg error message stored as the response msg
*/
public BackendHbResponse(long beId, String errMsg) {
super(HeartbeatResponse.Type.BACKEND);
this.status = HbStatus.BAD;
this.beId = beId;
this.msg = errMsg;
}

public BackendHbResponse(long beId, String host, String errMsg) {
public BackendHbResponse(long beId, String host, long lastHbTime, String errMsg) {
super(HeartbeatResponse.Type.BACKEND);
this.status = HbStatus.BAD;
this.beId = beId;
this.host = host;
this.hbTime = lastHbTime;
this.msg = errMsg;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -315,13 +315,13 @@ public HeartbeatResponse call() {
System.currentTimeMillis(), beStartTime, version, nodeRole,
fragmentNum, lastFragmentUpdateTime, isShutDown, arrowFlightSqlPort, beMemory);
} else {
return new BackendHbResponse(backendId, backend.getHost(),
return new BackendHbResponse(backendId, backend.getHost(), backend.getLastUpdateMs(),
result.getStatus().getErrorMsgs().isEmpty()
? "Unknown error" : result.getStatus().getErrorMsgs().get(0));
}
} catch (Exception e) {
LOG.warn("backend heartbeat got exception", e);
return new BackendHbResponse(backendId, backend.getHost(),
return new BackendHbResponse(backendId, backend.getHost(), backend.getLastUpdateMs(),
Strings.isNullOrEmpty(e.getMessage()) ? "got exception" : e.getMessage());
} finally {
if (client != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ public enum HbStatus {
protected boolean isTypeRead = false;

/**
* msg and hbTime are no need to be synchronized to other Frontends,
* msg does not need to be synchronized to other Frontends;
* only the Master Frontend has this info
*/
protected String msg;

@SerializedName(value = "hbTime")
protected long hbTime;

public HeartbeatResponse(Type type) {
Expand Down
Loading