make use of new hdfs user configuration in namenode and datanode actions
ndimiduk committed Jan 13, 2023
1 parent d2f4f0d commit 7c6e217
Showing 3 changed files with 45 additions and 41 deletions.
@@ -20,12 +20,12 @@
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
import org.slf4j.Logger;
@@ -57,39 +57,47 @@ protected Logger getLogger() {
@Override
public void perform() throws Exception {
getLogger().info("Performing action: Restart active namenode");
Configuration conf = CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
throw new Exception("HA for namenode is not enabled");
}
ZKWatcher zkw = null;
RecoverableZooKeeper rzk = null;

final String hadoopHAZkNode;
String activeNamenode = null;
String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
try {
zkw = new ZKWatcher(conf, "get-active-namenode", null);
rzk = zkw.getRecoverableZooKeeper();
String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
for (String eachEntry : subChildern) {
if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
byte[] data =
rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, null);
ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
activeNamenode = proto.getHostname();
}
int activeNamenodePort = -1;
try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
final Configuration conf = dfs.getConf();
hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
final String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);

if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
getLogger().info("HA for HDFS is not enabled; skipping");
return;
}
} finally {
if (zkw != null) {
zkw.close();
try (final ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
final RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
// If hadoopHAZkNode == '/', pass '' instead because then joinZNode will return '//' as a
// prefix
// which zk doesn't like as a prefix on the path.
final String hadoopHAZkNodePath = ZNodePaths.joinZNode(
(hadoopHAZkNode != null && hadoopHAZkNode.equals("/")) ? "" : hadoopHAZkNode,
nameServiceID);
final List<String> subChildren = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
for (final String eachEntry : subChildren) {
if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
byte[] data = rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME),
false, null);
ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
activeNamenode = proto.getHostname();
activeNamenodePort = proto.getPort();
}
}
}
}

if (activeNamenode == null) {
throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
} else {
getLogger().info("Found Active NameNode host: {}", activeNamenode);
final ServerName activeNNHost = ServerName.valueOf(activeNamenode, activeNamenodePort, -1L);
getLogger().info("Restarting Active NameNode: {}", activeNamenode);
restartNameNode(activeNNHost, this.sleepTime);
}
getLogger().info("Found active namenode host:" + activeNamenode);
ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
getLogger().info("Restarting Active NameNode :" + activeNamenode);
restartNameNode(activeNNHost, sleepTime);
}
}
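The rewritten action above (evidently RestartActiveNameNodeAction, which the factory diff below imports) replaces the CommonFSUtils-derived Configuration with a DistributedFileSystem obtained from HdfsActionUtils.createDfs(getConf()) and closed by try-with-resources. That helper comes from the parent commit and is not shown in this diff; the sketch below is a minimal reconstruction of what such a helper plausibly looks like, assuming the configuration key name, its default, and the UserGroupInformation-based impersonation. The point of the indirection is that the filesystem (and the DFSClient hanging off it) act as a configurable HDFS user instead of whatever user happens to be running the test.

```java
// Hypothetical sketch of HdfsActionUtils.createDfs; the key name, default user,
// and doAs mechanics are assumptions -- only the call site appears in this diff.
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.security.UserGroupInformation;

final class HdfsActionUtilsSketch {
  /** Assumed key: the user the chaos actions impersonate when talking to HDFS. */
  static final String HDFS_USER_CONF_KEY = "org.apache.hadoop.hbase.chaos.actions.hdfs_user";
  static final String DEFAULT_HDFS_USER = "hdfs";

  private HdfsActionUtilsSketch() {
  }

  static DistributedFileSystem createDfs(final Configuration conf) throws IOException {
    final String user = conf.get(HDFS_USER_CONF_KEY, DEFAULT_HDFS_USER);
    final UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
    try {
      return ugi.doAs((PrivilegedExceptionAction<DistributedFileSystem>) () -> {
        final Path rootDir = CommonFSUtils.getRootDir(conf);
        // newInstance rather than get: callers close the returned handle in
        // try-with-resources, so it must not be the shared cached instance.
        return (DistributedFileSystem) FileSystem.newInstance(rootDir.toUri(), conf);
      });
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new IOException(e);
    }
  }
}
```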
@@ -18,14 +18,11 @@
package org.apache.hadoop.hbase.chaos.actions;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Arrays;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -48,18 +48,15 @@ protected Logger getLogger() {
@Override
public void perform() throws Exception {
getLogger().info("Performing action: Restart random data node");
ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
final ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
restartDataNode(server, sleepTime);
}

public ServerName[] getDataNodes() throws IOException {
DistributedFileSystem fs =
(DistributedFileSystem) CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf());
DFSClient dfsClient = fs.getClient();
List<ServerName> hosts = new ArrayList<>();
for (DatanodeInfo dataNode : dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) {
hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
private ServerName[] getDataNodes() throws IOException {
try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
final DFSClient dfsClient = dfs.getClient();
return Arrays.stream(dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE))
.map(dn -> ServerName.valueOf(dn.getHostName(), -1, -1)).toArray(ServerName[]::new);
}
return hosts.toArray(new ServerName[0]);
}
}
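The datanode action gets the same treatment: the cached root-dir filesystem is replaced by a short-lived handle from HdfsActionUtils.createDfs, and the report-to-ServerName loop in getDataNodes() collapses into a stream. Below is a standalone sketch of that enumeration, runnable against a default-configured cluster; the class name and wiring are illustrative, while the DFSClient and datanodeReport calls are the same ones the diff uses.

```java
// Standalone sketch of the enumeration getDataNodes() performs: ask the
// DFSClient for the LIVE datanode report and keep only the hostnames
// (port and startcode are unknown at this point, hence the -1 placeholders).
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;

public class ListLiveDataNodesSketch {
  public static void main(String[] args) throws Exception {
    final Configuration conf = HBaseConfiguration.create();
    // newInstance so close() below does not tear down the shared cached handle.
    try (DistributedFileSystem dfs =
      (DistributedFileSystem) FileSystem.newInstance(FileSystem.getDefaultUri(conf), conf)) {
      final DFSClient client = dfs.getClient();
      final ServerName[] dataNodes =
        Arrays.stream(client.datanodeReport(HdfsConstants.DatanodeReportType.LIVE))
          .map(dn -> ServerName.valueOf(dn.getHostName(), -1, -1))
          .toArray(ServerName[]::new);
      Arrays.stream(dataNodes).forEach(System.out::println);
    }
  }
}
```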
@@ -23,6 +23,7 @@
import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
@@ -56,6 +57,7 @@ public ChaosMonkey build() {
// only allow 2 servers to be dead.
new RollingBatchRestartRsAction(5000, 1.0f, 2, true),
new ForceBalancerAction(),
new RestartActiveNameNodeAction(60000),
new RestartRandomDataNodeAction(60000),
new RestartRandomZKNodeAction(60000),
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
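The final hunk registers RestartActiveNameNodeAction(60000) alongside the existing actions in one of the chaos monkey factories, so active-NameNode restarts now participate in the same rotation as datanode and ZooKeeper node restarts. For orientation, here is a minimal sketch of how a factory of this shape composes its monkey. PolicyBasedChaosMonkey and PeriodicRandomActionPolicy are real classes in the hbase-it chaos framework, but this particular wiring, the class name, and the availability of the inherited properties/util fields are assumptions, not the diffed factory's exact build().

```java
// Illustrative only: a MonkeyFactory-style build() wiring the actions from this
// diff into a monkey that fires one randomly chosen action per minute.
import org.apache.hadoop.hbase.chaos.actions.Action;
import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
import org.apache.hadoop.hbase.chaos.factories.MonkeyFactory;
import org.apache.hadoop.hbase.chaos.monkies.ChaosMonkey;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
import org.apache.hadoop.hbase.chaos.policies.PeriodicRandomActionPolicy;

/** Illustrative factory sketch; not the diffed factory's actual composition. */
public class HdfsKillingMonkeyFactorySketch extends MonkeyFactory {
  @Override
  public ChaosMonkey build() {
    final Action[] actions = new Action[] {
      new RestartActiveNameNodeAction(60000), // registered by this commit
      new RestartRandomDataNodeAction(60000),
    };
    // properties and util are assumed to be inherited MonkeyFactory fields;
    // fire one random action from the pool every 60 seconds.
    return new PolicyBasedChaosMonkey(properties, util,
      new PeriodicRandomActionPolicy(60 * 1000, actions));
  }
}
```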
