HBASE-23693 Split failure may cause region hole and data loss when use zk assign #1071
@@ -44,13 +44,17 @@
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.TableStateManager;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
+import org.apache.hadoop.hbase.client.Mutation;
+import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
+import org.apache.hadoop.hbase.client.Result;
 import org.apache.hadoop.hbase.master.RegionState.State;
 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.ConfigUtil;
 import org.apache.hadoop.hbase.util.FSUtils;
 import org.apache.hadoop.hbase.util.Pair;
+import org.apache.hadoop.hbase.util.PairOfSameType;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
 import org.apache.zookeeper.KeeperException;
@@ -737,11 +741,13 @@ public void regionOffline(
   public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
     // Offline all regions on this server not already in transition.
     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
-    Set<HRegionInfo> regionsToClean = new HashSet<HRegionInfo>();
+    Set<Pair<HRegionInfo, HRegionInfo>> regionsToClean =
+        new HashSet<Pair<HRegionInfo, HRegionInfo>>();
     // Offline regions outside the loop and synchronized block to avoid
     // ConcurrentModificationException and deadlock in case meta is unassigned
     // but RegionState is blocked.
     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
+    Map<String, HRegionInfo> daughter2Parent = new HashMap<>();
     synchronized (this) {
       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
       if (assignedRegions == null) {
@@ -758,8 +764,20 @@ public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final Ser
           // Delete the ZNode if exists
           ZKAssign.deleteNodeFailSilent(watcher, region);
           regionsToOffline.add(region);
+          PairOfSameType<HRegionInfo> daughterRegions =
+              MetaTableAccessor.getDaughterRegionsFromParent(this.server.getConnection(), region);
+          if (daughterRegions != null) {
+            if (daughterRegions.getFirst() != null) {
+              daughter2Parent.put(daughterRegions.getFirst().getEncodedName(), region);
+            }
+            if (daughterRegions.getSecond() != null) {
+              daughter2Parent.put(daughterRegions.getSecond().getEncodedName(), region);
+            }
+          }
         } catch (KeeperException ke) {
           server.abort("Unexpected ZK exception deleting node " + region, ke);
+        } catch (IOException e) {
+          LOG.warn("Failed to get daughter regions from meta for " + region, e);
Comment on lines +767 to +780

Reviewer: Is it really entering this block? My understanding is that the parent region would be in OFFLINE state with SPLIT=true after the createDaughters call completed inside SplitTransactionImpl.execute.

Author: Yes, but at this time (after the createDaughters call completed inside SplitTransactionImpl.execute) the split transaction is still not entirely complete.

Reviewer: And that's why it seems wrong to have SPLIT=true already at that point.

Author: If we update the meta information later, we can only put it after the completion of execute.

Reviewer: Yeah, that's what I think. If you think it's too much work to try fixing the state/split flag updates on this PR, then yeah, we can merge this one for now, then work on the other solution in a different JIRA/PR.

Author: Yeah, I hope to merge this patch first; the other work can be completed in a new JIRA. At least it solves the problem of data loss. In extreme scenarios, the region hole can be temporarily repaired by HBCK.
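For readers following this thread, here is a minimal sketch of the ordering under discussion. It is a toy model, not HBase source: only SplitTransactionImpl.execute and createDaughters are real names from the conversation above; the phase names are simplified assumptions.

public class SplitOrderingSketch {
  enum Phase {
    PREPARE,          // daughter HRegionInfos created in memory only
    CREATE_DAUGHTERS, // PONR: parent written to meta as OFFLINE, SPLIT=true
    OPEN_DAUGHTERS,   // daughters opened on the region server
    COMPLETE          // split transaction fully done
  }

  public static void main(String[] args) {
    Phase crashPoint = Phase.CREATE_DAUGHTERS;
    // A region-server crash after CREATE_DAUGHTERS but before COMPLETE
    // leaves meta claiming the split finished although it did not.
    boolean metaSaysSplit = crashPoint.ordinal() >= Phase.CREATE_DAUGHTERS.ordinal();
    boolean splitActuallyDone = crashPoint == Phase.COMPLETE;
    System.out.println("meta says SPLIT=true: " + metaSaysSplit
        + ", split actually complete: " + splitActuallyDone);
  }
}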
         }
       }
     }
@@ -783,10 +801,20 @@ public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final Ser
         LOG.info("Found region in " + state +
           " to be reassigned by ServerCrashProcedure for " + sn);
         rits.add(hri);
-      } else if(state.isSplittingNew() || state.isMergingNew()) {
-        LOG.info("Offline/Cleanup region if no meta entry exists, hri: " + hri +
-          " state: " + state);
-        regionsToClean.add(state.getRegion());
+      } else if (state.isSplittingNew() || state.isMergingNew()) {
+        LOG.info(
+          "Offline/Cleanup region if no meta entry exists, hri: " + hri + " state: " + state);
+        if (daughter2Parent.containsKey(hri.getEncodedName())) {
+          HRegionInfo parent = daughter2Parent.get(hri.getEncodedName());
+          HRegionInfo info = getHRIFromMeta(parent);
+          if (info != null && info.isSplit() && info.isOffline()) {
+            regionsToClean.add(Pair.newPair(state.getRegion(), info));
Reviewer: Oh, so the parent is OFFLINE and SPLIT=true in meta only, but not in assignedRegions? It sounds wrong that we marked the parent as OFFLINE and SPLIT even when splitting was still not entirely complete.

Author: If the RS machine crashes when SplitTransactionImpl is at a step after the PONR, the master will handle the split rollback. Under normal conditions it cleans up the daughter regions and tries to assign the parent region. If the assign fails, the parent region is in RIT, so CatalogJanitor is blocked; but if the master switches at this time, the new master will delete the parent region because of OFFLINE and SPLIT=true.

Second reviewer: This added daughter2Parent stringency makes sense to me. Nice.

Reviewer: Yeah, but it seems to work only because the Master has different info for the region than what is actually in meta. What if the active Master also crashes before it triggers SCP and the daughters get cleaned up? The other master will load region states as seen in meta, then it can run into the same problem again.

Author: Daughter cleanup is executed in SCP, and in this patch it does three things together: delete the daughter's meta entry, restore the parent's meta entry (clearing the OFFLINE and SPLIT flags), and delete the daughter's directory on HDFS. So if the active Master also crashes before it triggers SCP, the daughters won't be deleted.

Reviewer: Yes, that's my point. We would now have a new active master that sees the parent split as complete, although the split was midway through. It will potentially remove the parent and fail to online the daughters. I wonder if working on correcting the region state and split flag updates would sort out split failures in different scenarios. It also does not seem consistent the way we do these updates in meta only and don't reflect them in the "in-memory" region info the master has.
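Pulled out of the diff for readability, the three cleanup steps the author refers to have roughly this shape. This is a sketch assuming the branch-1 signatures used in the patch; the class, method name, and parameters are stand-ins, and error handling is elided.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.util.FSUtils;

// Sketch only: names mirror the patch; not committed code.
final class FailedSplitCleanupSketch {
  static void cleanup(Connection connection, Configuration configuration,
      HRegionInfo daughter, HRegionInfo parentInfo) throws IOException {
    // 1. Delete the failed daughter's entry from hbase:meta.
    MetaTableAccessor.deleteRegion(connection, daughter);
    // 2. Restore the parent in meta so it can be reassigned (no region hole):
    //    clear the OFFLINE and SPLIT flags on a copy and write it back.
    HRegionInfo copyOfParent = new HRegionInfo(parentInfo);
    copyOfParent.setOffline(false);
    copyOfParent.setSplit(false);
    List<Mutation> mutations = new ArrayList<Mutation>();
    mutations.add(MetaTableAccessor.makePutFromRegionInfo(copyOfParent));
    MetaTableAccessor.mutateMetaTable(connection, mutations);
    // 3. Remove the daughter's directory on HDFS.
    FSUtils.deleteRegionDir(configuration, daughter);
  }
}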
+          } else {
+            regionsToClean.add(Pair.<HRegionInfo, HRegionInfo>newPair(state.getRegion(), null));
+          }
+        } else {
+          regionsToClean.add(Pair.<HRegionInfo, HRegionInfo>newPair(state.getRegion(), null));
+        }
       } else {
         LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
       }
@@ -803,19 +831,34 @@ public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final Ser
     return rits;
   }

+  private HRegionInfo getHRIFromMeta(HRegionInfo parent) {
+    Result result = null;
+    try {
+      result =
+          MetaTableAccessor.getRegionResult(this.server.getConnection(), parent.getRegionName());
+      HRegionInfo info = MetaTableAccessor.getHRegionInfo(result);
+      return info;
+    } catch (IOException e) {
+      LOG.error("Got exception when querying meta for region " + parent.getEncodedName(), e);
+      return null;
+    }
+  }
+
   /**
    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
    * In ZK mode we rollback and hence cleanup daughters/merged region. We also cleanup if
    * meta doesn't have these regions.
    *
    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
    */
-  private void cleanFailedSplitMergeRegions(Set<HRegionInfo> hris) {
+  private void cleanFailedSplitMergeRegions(Set<Pair<HRegionInfo, HRegionInfo>> hris) {
     if (hris.isEmpty()) {
       return;
     }

-    for (HRegionInfo hri : hris) {
+    for (Pair<HRegionInfo, HRegionInfo> hriPair : hris) {
+      HRegionInfo hri = hriPair.getFirst();
+      HRegionInfo parentInfo = hriPair.getSecond();
       // This is RPC to meta table. It is done while we have a synchronize on
       // regionstates. No progress will be made if meta is not available at this time.
       // This is a cleanup task. Not critical.
@@ -829,6 +872,15 @@ private void cleanFailedSplitMergeRegions(Set<HRegionInfo> hris) {
       if (regionPair != null) {
         MetaTableAccessor.deleteRegion(this.server.getConnection(), hri);
       }
+      if (parentInfo != null) {
+        List<Mutation> mutations = new ArrayList<Mutation>();
+        HRegionInfo copyOfParent = new HRegionInfo(parentInfo);
+        copyOfParent.setOffline(false);
+        copyOfParent.setSplit(false);
+        Put putParent = MetaTableAccessor.makePutFromRegionInfo(copyOfParent);
+        mutations.add(putParent);
+        MetaTableAccessor.mutateMetaTable(this.server.getConnection(), mutations);
+      }
       LOG.debug("Cleaning up HDFS since no meta entry exists, hri: " + hri);
       FSUtils.deleteRegionDir(server.getConfiguration(), hri);
     }
Reviewer: Suggest this needs more comments on why we need this accounting in daughter2Parent.
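One possible shape for such a comment, shown against the declaration the patch adds. This is a hypothetical wording suggestion, not code from the PR:

// Hypothetical comment sketch addressing the review request above.

/**
 * Map from a daughter region's encoded name to its parent's HRegionInfo,
 * collected while offlining a dead server's regions. When a region server
 * dies mid-split, its SPLITTING_NEW daughters may only be cleaned up if the
 * parent is already marked SPLIT=true and OFFLINE in hbase:meta; in that
 * case the parent's meta entry must also be restored (SPLIT/OFFLINE cleared)
 * so the parent can be reassigned, avoiding a region hole and data loss.
 */
Map<String, HRegionInfo> daughter2Parent = new HashMap<>();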