Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
yujun777 committed Oct 30, 2024
1 parent 061fe5d commit 3621967
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2020,7 +2020,7 @@ public class Config extends ConfigBase {
* Even if a backend is healthy, still write a heartbeat editlog periodically
* so that the backend's lastUpdateMs in the bdb image stays up to date.
*/
@ConfField(mutable = true, masterOnly = true)
public static long editlog_healthy_heartbeat_seconds = 300;
public static int editlog_healthy_heartbeat_seconds = 300;

/**
* Abort transaction time after lost heartbeat.
Expand Down
9 changes: 0 additions & 9 deletions fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,6 @@ public class Env {
private AtomicBoolean canRead = new AtomicBoolean(false);
private String toMasterProgress = "";
private BlockingQueue<FrontendNodeType> typeTransferQueue;
private long transferToMasterTime = -1L;

// node name is used for bdbje NodeName.
protected String nodeName;
Expand Down Expand Up @@ -1688,8 +1687,6 @@ private void transferToMaster() {
if (analysisManager != null) {
analysisManager.getStatisticsCache().preHeat();
}

transferToMasterTime = System.currentTimeMillis();
} catch (Throwable e) {
// When failed to transfer to master, we need to exit the process.
// Otherwise, the process will be in an unknown state.
Expand Down Expand Up @@ -1920,8 +1917,6 @@ private void transferToNonMaster(FrontendNodeType newType) {
followerColumnSender = new FollowerColumnSender();
followerColumnSender.start();
}

transferToMasterTime = -1L;
} catch (Throwable e) {
// When failed to transfer to non-master, we need to exit the process.
// Otherwise, the process will be in an unknown state.
Expand All @@ -1930,10 +1925,6 @@ private void transferToNonMaster(FrontendNodeType newType) {
}
}

/**
 * Returns the timestamp recorded when this frontend last transferred to master.
 *
 * @return the {@code System.currentTimeMillis()} value stored at the end of
 *         {@code transferToMaster()}, or {@code -1} (the initial value, also restored
 *         by {@code transferToNonMaster()})
 */
public long getTransferToMasterTime() {
    return this.transferToMasterTime;
}

// Set global variable 'lower_case_table_names' only when the cluster is initialized.
private void initLowerCaseTableNames() {
if (Config.lower_case_table_names > 2 || Config.lower_case_table_names < 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ public void checkDecommissionState(Map<String, List<Long>> clusterToBes) {
private boolean completeRouteInfo() {
List<UpdateCloudReplicaInfo> updateReplicaInfos = new ArrayList<UpdateCloudReplicaInfo>();
long[] assignedErrNum = {0L};
long needRehashDeadTime = System.currentTimeMillis() - Config.rehash_tablet_after_be_dead_seconds * 1000L;
loopCloudReplica((Database db, Table table, Partition partition, MaterializedIndex index, String cluster) -> {
boolean assigned = false;
List<Long> beIds = new ArrayList<Long>();
Expand All @@ -526,7 +527,8 @@ private boolean completeRouteInfo() {

// primary backend is alive, or has not been dead for long
Backend be = replica.getPrimaryBackend(cluster);
if (!needRehashTabletOnPrimaryBackend(be)) {
if (be != null && (be.isQueryAvailable()
|| (!be.isQueryDisabled() && be.getLastUpdateMs() > needRehashDeadTime))) {
beIds.add(be.getId());
tabletIds.add(tablet.getId());
continue;
Expand Down Expand Up @@ -590,40 +592,6 @@ private boolean completeRouteInfo() {
return true;
}

/**
 * Decides whether a tablet should be rehashed away from its primary backend.
 *
 * @param be the primary backend of the replica; may be {@code null}
 * @return {@code true} if the tablet needs to be rehashed, {@code false} if it
 *         can stay on {@code be}
 */
private boolean needRehashTabletOnPrimaryBackend(Backend be) {
    // no primary backend at all: must rehash
    if (be == null) {
        return true;
    }

    // backend is alive and query is not disabled: keep it
    if (be.isQueryAvailable()) {
        return false;
    }

    // query has been disabled on this backend: rehash
    if (be.isQueryDisabled()) {
        return true;
    }

    // A backend's last heartbeat time may be stale, because a heartbeat editlog is not
    // written on every heartbeat. Only a backend state change (alive -> dead, or
    // dead -> alive) writes a heartbeat editlog.
    //
    // Consider the following sequence:
    // 1. be dies at time T1;
    // 2. be becomes alive at time T2; the state changed, so an editlog is written and
    //    be's last update ms = T2;
    // 3. be heartbeat succeeds at time T3, but there is no state change, so no editlog;
    // 4. master fe is killed (kill -9) at time T4;
    // 5. be is killed (kill -9) at time T5;
    // 6. master fe comes back and replays the editlog of step 2, so be's last update
    //    ms = T2. This is incorrect: it should be at least >= T3.
    //
    // Therefore rehashing a primary be requires two conditions:
    // a. be has lost heartbeat for a long time;
    // b. fe has been master for a long time.
    long deadThresholdMs = System.currentTimeMillis() - Config.rehash_tablet_after_be_dead_seconds * 1000L;
    if (be.getLastUpdateMs() >= deadThresholdMs) {
        // condition (a) does not hold
        return false;
    }
    return Env.getCurrentEnv().getTransferToMasterTime() < deadThresholdMs;
}

public void fillBeToTablets(long be, long tableId, long partId, long indexId, Tablet tablet,
Map<Long, List<Tablet>> globalBeToTablets,
Map<Long, Map<Long, List<Tablet>>> beToTabletsInTable,
Expand Down
11 changes: 8 additions & 3 deletions fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;

Expand Down Expand Up @@ -805,7 +807,7 @@ public String toString() {
public String getHealthyStatus() {
return "Backend [id=" + id + ", isDecommission: " + isDecommissioned
+ ", backendStatus: " + backendStatus + ", isAlive: " + isAlive.get() + ", lastUpdateTime: "
+ TimeUtils.longToTimeString(lastUpdateMs) + "]";
+ TimeUtils.longToTimeString(lastUpdateMs);
}

/**
Expand Down Expand Up @@ -886,7 +888,7 @@ public boolean handleHbResponse(BackendHbResponse hbResponse, boolean isReplay)

// even if nothing changed, write an editlog so that lastUpdateMs in the image gets updated
if (System.currentTimeMillis() - this.lastEditlogHeartbeatTime
>= Config.editlog_healthy_heartbeat_seconds) {
>= Config.editlog_healthy_heartbeat_seconds * 1000L) {
isChanged = true;
}
} else {
Expand All @@ -911,7 +913,10 @@ public boolean handleHbResponse(BackendHbResponse hbResponse, boolean isReplay)
}

if (isChanged) {
this.lastEditlogHeartbeatTime = System.currentTimeMillis();
Random random = new SecureRandom();
int second = random.nextInt(Math.max(Config.heartbeat_interval_second,
Config.editlog_healthy_heartbeat_seconds));
this.lastEditlogHeartbeatTime = System.currentTimeMillis() + second * 1000L / 2;
}

return isChanged;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,6 @@ public BackendHbResponse(long beId, int bePort, int httpPort, int brpcPort, long
this.beMemory = beMemory;
}

/**
 * Creates a backend heartbeat response whose status is {@code HbStatus.BAD}.
 *
 * @param beId id of the backend this response belongs to
 * @param lastHbTime value stored as this response's {@code hbTime}
 * @param errMsg value stored as this response's {@code msg}
 */
public BackendHbResponse(long beId, long lastHbTime, String errMsg) {
    super(HeartbeatResponse.Type.BACKEND);
    this.beId = beId;
    this.hbTime = lastHbTime;
    this.msg = errMsg;
    this.status = HbStatus.BAD;
}

public BackendHbResponse(long beId, String host, long lastHbTime, String errMsg) {
super(HeartbeatResponse.Type.BACKEND);
this.status = HbStatus.BAD;
Expand Down

0 comments on commit 3621967

Please sign in to comment.