diff --git a/hugegraph-pd/hg-pd-core/pom.xml b/hugegraph-pd/hg-pd-core/pom.xml new file mode 100644 index 0000000000..1f23259d21 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/pom.xml @@ -0,0 +1,88 @@ + + + + + 4.0.0 + + + org.apache.hugegraph + hugegraph-pd + ${revision} + ../pom.xml + + + hg-pd-core + + + 0.5.10 + + + + com.alipay.sofa + jraft-core + + 1.3.13 + + + org.rocksdb + rocksdbjni + + + + + org.rocksdb + rocksdbjni + 6.29.5 + + + org.apache.hugegraph + hg-pd-grpc + + + org.springframework + spring-context + 5.3.20 + + + org.apache.hugegraph + hg-pd-common + + + org.springframework.boot + spring-boot + 2.5.14 + + + org.projectlombok + lombok + 1.18.24 + + + org.apache.commons + commons-lang3 + 3.12.0 + + + com.google.code.gson + gson + 2.8.9 + + + diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/ConfigService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/ConfigService.java new file mode 100644 index 0000000000..cc28c1b0a7 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/ConfigService.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hugegraph.pd;
+
+import java.util.List;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Metapb;
+import org.apache.hugegraph.pd.meta.ConfigMetaStore;
+import org.apache.hugegraph.pd.meta.MetadataFactory;
+import org.apache.hugegraph.pd.raft.RaftEngine;
+import org.apache.hugegraph.pd.raft.RaftStateListener;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class ConfigService implements RaftStateListener {
+
+    private final ConfigMetaStore meta;
+    private PDConfig pdConfig;
+
+    public ConfigService(PDConfig config) {
+        this.pdConfig = config;
+        config.setConfigService(this);
+        meta = MetadataFactory.newConfigMeta(config);
+    }
+
+    public Metapb.PDConfig getPDConfig(long version) throws PDException {
+        return this.meta.getPdConfig(version);
+    }
+
+    public Metapb.PDConfig getPDConfig() throws PDException {
+        return this.meta.getPdConfig(0);
+    }
+
+    public Metapb.PDConfig setPDConfig(Metapb.PDConfig mConfig) throws PDException {
+        Metapb.PDConfig oldCfg = getPDConfig();
+        Metapb.PDConfig.Builder builder = oldCfg.toBuilder().mergeFrom(mConfig)
+                                                .setVersion(oldCfg.getVersion() + 1)
+                                                .setTimestamp(System.currentTimeMillis());
+        mConfig = this.meta.setPdConfig(builder.build());
+        log.info("PDConfig has been modified, new PDConfig is {}", mConfig);
+        updatePDConfig(mConfig);
+        return mConfig;
+    }
+
+    public List<Metapb.GraphSpace> getGraphSpace(String graphSpaceName) throws PDException {
+        return this.meta.getGraphSpace(graphSpaceName);
+    }
+
+    public Metapb.GraphSpace setGraphSpace(Metapb.GraphSpace graphSpace) throws PDException {
+        return this.meta.setGraphSpace(graphSpace.toBuilder()
+                                                 .setTimestamp(System.currentTimeMillis())
+                                                 .build());
+    }
+
+    /**
+     * Read the config items from the store and overwrite the global PDConfig object.
+     *
+     * @return the effective PDConfig
+     */
+    public PDConfig loadConfig() {
+        try {
+            Metapb.PDConfig mConfig = this.meta.getPdConfig(0);
+            if (mConfig == null) {
+                mConfig = Metapb.PDConfig.newBuilder()
+                                         .setPartitionCount(pdConfig.getInitialPartitionCount())
+                                         .setShardCount(pdConfig.getPartition().getShardCount())
+                                         .setVersion(1)
+                                         .setTimestamp(System.currentTimeMillis())
+                                         .setMaxShardsPerStore(
+                                                 pdConfig.getPartition().getMaxShardsPerStore())
+                                         .build();
+            }
+            if (RaftEngine.getInstance().isLeader()) {
+                this.meta.setPdConfig(mConfig);
+            }
+            pdConfig = updatePDConfig(mConfig);
+        } catch (Exception e) {
+            log.error("ConfigService loadConfig exception: ", e);
+        }
+        return pdConfig;
+    }
+
+    public synchronized PDConfig updatePDConfig(Metapb.PDConfig mConfig) {
+        log.info("update pd config: mConfig:{}", mConfig);
+        pdConfig.getPartition().setShardCount(mConfig.getShardCount());
+        pdConfig.getPartition().setTotalCount(mConfig.getPartitionCount());
+        pdConfig.getPartition().setMaxShardsPerStore(mConfig.getMaxShardsPerStore());
+        return pdConfig;
+    }
+
+    public synchronized PDConfig setPartitionCount(int count) {
+        Metapb.PDConfig mConfig = null;
+        try {
+            mConfig = getPDConfig();
+            mConfig = mConfig.toBuilder().setPartitionCount(count).build();
+            setPDConfig(mConfig);
+        } catch (PDException e) {
+            log.error("ConfigService setPartitionCount exception: ", e);
+        }
+        return pdConfig;
+    }
+
+    /**
+     * The partition count recorded in the meta store. Prefer this value: the original
+     * partition count is not reliable because it can be changed by partition
+     * splits/merges.
+     *
+     * @return partition count of cluster
+     * @throws PDException when io error
+     */
+    public int getPartitionCount() throws PDException {
+        return getPDConfig().getPartitionCount();
+    }
+
+    @Override
+    public void onRaftLeaderChanged() {
+        loadConfig();
+    }
+}
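A minimal usage sketch of the service above (assuming the generated Metapb classes from hg-pd-grpc and a bootstrapped PDConfig are available; bumpShardCount is a hypothetical helper): setPDConfig merges the incoming message into the stored config and increments the version itself, so a caller only sets the fields it wants to change.

    // Hypothetical helper; ConfigService fills in version/timestamp on merge.
    static Metapb.PDConfig bumpShardCount(ConfigService svc, int shards) throws PDException {
        Metapb.PDConfig delta = Metapb.PDConfig.newBuilder().setShardCount(shards).build();
        return svc.setPDConfig(delta);  // merged with stored config, version += 1
    }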
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/IdService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/IdService.java
new file mode 100644
index 0000000000..0c854d06db
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/IdService.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.meta.IdMetaStore;
+import org.apache.hugegraph.pd.meta.MetadataFactory;
+
+public class IdService {
+
+    private final IdMetaStore meta;
+    private PDConfig pdConfig;
+
+    public IdService(PDConfig config) {
+        this.pdConfig = config;
+        meta = MetadataFactory.newHugeServerMeta(config);
+    }
+
+    public PDConfig getPdConfig() {
+        return pdConfig;
+    }
+
+    public void setPdConfig(PDConfig pdConfig) {
+        this.pdConfig = pdConfig;
+    }
+
+    public long getId(String key, int delta) throws PDException {
+        return meta.getId(key, delta);
+    }
+
+    public void resetId(String key) throws PDException {
+        meta.resetId(key);
+    }
+
+    /**
+     * Get a cyclic, non-repeating auto-increment id. After reaching the upper limit,
+     * the counter wraps around to 0; cids that are still in use are skipped
+     * automatically.
+     *
+     * @param key counter name
+     * @param max upper limit of the cycle
+     * @return the allocated cid
+     * @throws PDException when io error
+     */
+    public long getCId(String key, long max) throws PDException {
+        return meta.getCId(key, max);
+    }
+
+    public long getCId(String key, String name, long max) throws PDException {
+        return meta.getCId(key, name, max);
+    }
+
+    /**
+     * Delete a cyclic auto-increment id.
+     *
+     * @param key counter name
+     * @param cid the cid to release
+     * @return
+     * @throws PDException when io error
+     */
+    public long delCId(String key, long cid) throws PDException {
+        return meta.delCId(key, cid);
+    }
+
+    public long delCIdDelay(String key, String name, long cid) throws PDException {
+        return meta.delCIdDelay(key, name, cid);
+    }
+}
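IdService is a thin facade over IdMetaStore; a minimal usage sketch, assuming a PDConfig wired to the local meta store (the pdConfig instance and the counter key names are illustrative):

    IdService ids = new IdService(pdConfig);
    long start = ids.getId("graph-id", 100);    // advance the "graph-id" counter by 100
    long cid = ids.getCId("pulse-cid", 1024);   // cyclic id below 1024, skips live cids
    ids.delCId("pulse-cid", cid);               // release the cid for reuse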
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/KvService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/KvService.java
new file mode 100644
index 0000000000..f31196f81c
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/KvService.java
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd;
+
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.kv.Kv;
+import org.apache.hugegraph.pd.grpc.kv.V;
+import org.apache.hugegraph.pd.meta.MetadataKeyHelper;
+import org.apache.hugegraph.pd.meta.MetadataRocksDBStore;
+import org.apache.hugegraph.pd.store.KV;
+import org.springframework.stereotype.Service;
+
+import com.google.protobuf.InvalidProtocolBufferException;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+@Service
+public class KvService {
+
+    public static final char KV_DELIMITER = '@';
+    // TODO: add the class name after the main prefix to distinguish entries
+    private static final String TTL_PREFIX = "T";
+    private static final String KV_PREFIX = "K";
+    private static final String LOCK_PREFIX = "L";
+    private static final String KV_PREFIX_DELIMITER = KV_PREFIX + KV_DELIMITER;
+    private static final byte[] EMPTY_VALUE = new byte[0];
+    private final MetadataRocksDBStore meta;
+    private PDConfig pdConfig;
+
+    public KvService(PDConfig config) {
+        this.pdConfig = config;
+        meta = new MetadataRocksDBStore(config);
+    }
+
+    public static String getKey(Object... keys) {
+        StringBuilder builder = MetadataKeyHelper.getStringBuilderHelper();
+        builder.append(KV_PREFIX).append(KV_DELIMITER);
+        for (Object key : keys) {
+            builder.append(key == null ? "" : key).append(KV_DELIMITER);
+        }
+        return builder.substring(0, builder.length() - 1);
+    }
+
+    public static byte[] getKeyBytes(Object... keys) {
+        String key = getKey(keys);
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static String getKeyWithoutPrefix(Object... keys) {
+        StringBuilder builder = MetadataKeyHelper.getStringBuilderHelper();
+        for (Object key : keys) {
+            builder.append(key == null ? "" : key).append(KV_DELIMITER);
+        }
+        return builder.substring(0, builder.length() - 1);
+    }
+
+    public static String getDelimiter() {
+        return String.valueOf(KV_DELIMITER);
+    }
+
+    public PDConfig getPdConfig() {
+        return pdConfig;
+    }
+
+    public void setPdConfig(PDConfig pdConfig) {
+        this.pdConfig = pdConfig;
+    }
+
+    public void put(String key, String value) throws PDException {
+        V storeValue = V.newBuilder().setValue(value).setTtl(0).build();
+        meta.put(getStoreKey(key), storeValue.toByteArray());
+        // log.warn("add key with key-{}:value-{}", key, value);
+    }
+
+    public void put(String key, String value, long ttl) throws PDException {
+        long curTime = System.currentTimeMillis();
+        curTime += ttl;
+        V storeValue = V.newBuilder().setValue(value).setSt(ttl).setTtl(curTime).build();
+        meta.put(getStoreKey(key), storeValue.toByteArray());
+        meta.put(getTTLStoreKey(key, curTime), EMPTY_VALUE);
+        // log.warn("add key with key-{}:value-{}:ttl-{}", key, value, ttl);
+    }
+
+    public String get(String key) throws PDException {
+        byte[] storeKey = getStoreKey(key);
+        return get(storeKey);
+    }
+
+    public String get(byte[] keyBytes) throws PDException {
+        byte[] bytes = meta.getOne(keyBytes);
+        return getValue(keyBytes, bytes);
+    }
+
+    private String getValue(byte[] keyBytes, byte[] valueBytes) throws PDException {
+        if (valueBytes == null || valueBytes.length == 0) {
+            return "";
+        }
+        try {
+            V v = V.parseFrom(valueBytes);
+            if (v.getTtl() == 0 || v.getTtl() >= System.currentTimeMillis()) {
+                return v.getValue();
+            } else {
+                // expired: remove both the data entry and its TTL index entry
+                meta.remove(keyBytes);
+                meta.remove(getTTLStoreKey(new String(keyBytes), v.getTtl()));
+            }
+        } catch (Exception e) {
+            log.error("parse value with error:{}", e.getMessage());
+            throw new PDException(-1, e.getMessage());
+        }
+        return null;
+    }
+
+    public boolean keepAlive(String key) throws PDException {
+        byte[] bytes = meta.getOne(getStoreKey(key));
+        try {
+            if (bytes == null || bytes.length == 0) {
+                return false;
+            }
+            V v = V.parseFrom(bytes);
+            if (v != null) {
+                long ttl = v.getTtl();
+                long st = v.getSt();
+                meta.remove(getTTLStoreKey(key, ttl));
+                put(key, v.getValue(), st);
+                return true;
+            } else {
+                return false;
+            }
+        } catch (InvalidProtocolBufferException e) {
+            throw new PDException(-1, e.getMessage());
+        }
+    }
+
+    public Kv delete(String key) throws PDException {
+        byte[] storeKey = getStoreKey(key);
+        String value = this.get(storeKey);
+        meta.remove(storeKey);
+        Kv.Builder builder = Kv.newBuilder().setKey(key);
+        if (value != null) {
+            builder.setValue(value);
+        }
+        Kv kv = builder.build();
+        // log.warn("delete kv with key :{}", key);
+        return kv;
+    }
+
+    public List<Kv> deleteWithPrefix(String key) throws PDException {
+        byte[] storeKey = getStoreKey(key);
+        // TODO: too many rows for scan
+        List<KV> kvList = meta.scanPrefix(storeKey);
+        LinkedList<Kv> kvs = new LinkedList<>();
+        for (KV kv : kvList) {
+            String kvKey = new String(kv.getKey()).replaceFirst(KV_PREFIX_DELIMITER, "");
+            String kvValue = getValue(kv.getKey(), kv.getValue());
+            if (kvValue != null) {
+                kvs.add(Kv.newBuilder().setKey(kvKey).setValue(kvValue).build());
+            }
+        }
+        meta.removeByPrefix(storeKey);
+        // log.warn("delete kv with key prefix :{}", key);
+        return kvs;
+    }
+
+    /**
+     * Scan the records whose keys range from keyStart to keyEnd.
+     *
+     * @param keyStart range start
+     * @param keyEnd range end
+     * @return Records
+     * @throws PDException when io error
+     */
+    public Map<String, String> scanRange(String keyStart, String keyEnd) throws PDException {
+        List<KV> list = meta.scanRange(getStoreKey(keyStart), getStoreKey(keyEnd));
+        Map<String, String> map = new HashMap<>();
+        for (KV kv : list) {
+            String kvKey = new String(kv.getKey()).replaceFirst(KV_PREFIX_DELIMITER, "");
+            String kvValue = getValue(kv.getKey(), kv.getValue());
+            if (kvValue != null) {
+                map.put(kvKey, kvValue);
+            }
+        }
+        return map;
+    }
+
+    public Map<String, String> scanWithPrefix(String key) throws PDException {
+        List<KV> kvList = meta.scanPrefix(getStoreKey(key));
+        HashMap<String, String> map = new HashMap<>();
+        for (KV kv : kvList) {
+            String kvKey = new String(kv.getKey()).replaceFirst(KV_PREFIX_DELIMITER, "");
+            String kvValue = getValue(kv.getKey(), kv.getValue());
+            if (kvValue != null) {
+                map.put(kvKey, kvValue);
+            }
+        }
+        return map;
+    }
+
+    public boolean locked(String key) throws PDException {
+        String lockKey = KvService.getKeyWithoutPrefix(KvService.LOCK_PREFIX, key);
+        Map<String, String> allLock = scanWithPrefix(lockKey);
+        return allLock != null && !allLock.isEmpty();
+    }
+
+    private boolean owned(String key, long clientId) throws PDException {
+        String lockKey = KvService.getKeyWithoutPrefix(KvService.LOCK_PREFIX, key);
+        Map<String, String> allLock = scanWithPrefix(lockKey);
+        if (allLock.isEmpty()) {
+            return true;
+        }
+        for (Map.Entry<String, String> entry : allLock.entrySet()) {
+            String entryKey = entry.getKey();
+            String[] split = entryKey.split(String.valueOf(KV_DELIMITER));
+            if (Long.valueOf(split[split.length - 1]).equals(clientId)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public boolean lock(String key, long ttl, long clientId) throws PDException {
+        // TODO: lock improvement
+        synchronized (KvService.class) {
+            if (!owned(key, clientId)) {
+                return false;
+            }
+            put(getLockKey(key, clientId), " ", ttl);
+            return true;
+        }
+    }
+
+    public boolean lockWithoutReentrant(String key, long ttl,
+                                        long clientId) throws PDException {
+        synchronized (KvService.class) {
+            if (locked(key)) {
+                return false;
+            }
+            put(getLockKey(key, clientId), " ", ttl);
+            return true;
+        }
+    }
+
+    public boolean unlock(String key, long clientId) throws PDException {
+        synchronized (KvService.class) {
+            if (!owned(key, clientId)) {
+                return false;
+            }
+            delete(getLockKey(key, clientId));
+            return true;
+        }
+    }
+
+    public boolean keepAlive(String key, long clientId) throws PDException {
+        String lockKey = getLockKey(key, clientId);
+        return keepAlive(lockKey);
+    }
+
+    public String getLockKey(String key, long clientId) {
+        return getKeyWithoutPrefix(LOCK_PREFIX, key, clientId);
+    }
+
+    public byte[] getStoreKey(String key) {
+        return getKeyBytes(key);
+    }
+
+    public byte[] getTTLStoreKey(String key, long time) {
+        return getKeyBytes(TTL_PREFIX, time, key);
+    }
+
+    public void clearTTLData() {
+        try {
+            byte[] ttlStartKey = getTTLStoreKey("", 0);
+            byte[] ttlEndKey = getTTLStoreKey("", System.currentTimeMillis());
+            List<KV> kvList = meta.scanRange(ttlStartKey, ttlEndKey);
+            for (KV kv : kvList) {
+                String key = new String(kv.getKey());
+                int index = key.indexOf(KV_DELIMITER, 2);
+                String delKey = key.substring(index + 1);
+                delete(delKey);
+                meta.remove(kv.getKey());
+            }
+        } catch (Exception e) {
+            log.error("clear ttl data with error :", e);
+        }
+    }
+
+    public MetadataRocksDBStore getMeta() {
+        return meta;
+    }
+}
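The lock methods above build keys of the form L@{key}@{clientId} and store them through the TTL-aware put, so a lock held by a crashed client simply expires instead of leaking. A minimal sketch (the pdConfig instance, lock name, and client id are illustrative):

    KvService kv = new KvService(pdConfig);
    long clientId = 42L;
    if (kv.lock("task/compaction", 30_000L, clientId)) {  // 30s TTL, re-entrant per client
        try {
            kv.keepAlive("task/compaction", clientId);    // renew the TTL mid-task
        } finally {
            kv.unlock("task/compaction", clientId);
        }
    }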
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/LogService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/LogService.java
new file mode 100644
index 0000000000..35959849bc
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/LogService.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd;
+
+import java.util.List;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Metapb;
+import org.apache.hugegraph.pd.meta.LogMeta;
+import org.apache.hugegraph.pd.meta.MetadataFactory;
+import org.springframework.stereotype.Service;
+
+import com.google.protobuf.Any;
+import com.google.protobuf.GeneratedMessageV3;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+@Service
+public class LogService {
+
+    public static final String GRPC = "GRPC";
+    public static final String REST = "REST";
+    public static final String TASK = "TASK";
+    public static final String NODE_CHANGE = "NODE_CHANGE";
+    public static final String PARTITION_CHANGE = "PARTITION_CHANGE";
+    private final LogMeta logMeta;
+
+    public LogService(PDConfig pdConfig) {
+        logMeta = MetadataFactory.newLogMeta(pdConfig);
+    }
+
+    public List<Metapb.LogRecord> getLog(String action, Long start, Long end) throws PDException {
+        return logMeta.getLog(action, start, end);
+    }
+
+    public void insertLog(String action, String message, GeneratedMessageV3 target) {
+        try {
+            Metapb.LogRecord logRecord = Metapb.LogRecord.newBuilder()
+                                                         .setAction(action)
+                                                         .setMessage(message)
+                                                         .setTimestamp(System.currentTimeMillis())
+                                                         .setObject(Any.pack(target))
+                                                         .build();
+            logMeta.insertLog(logRecord);
+        } catch (PDException e) {
+            log.debug("Insert log with error: ", e);
+        }
+    }
+}
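Records are stored with the payload packed into a protobuf Any, so readers recover the concrete type at unpack time. A read-back sketch, assuming a LogService instance named logService (the action, time range, and payload type are illustrative; unpack can throw InvalidProtocolBufferException):

    List<Metapb.LogRecord> records = logService.getLog(LogService.TASK, 0L, Long.MAX_VALUE);
    for (Metapb.LogRecord record : records) {
        if (record.getObject().is(MetaTask.Task.class)) {   // Any carries the packed type URL
            MetaTask.Task task = record.getObject().unpack(MetaTask.Task.class);
        }
    }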
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionInstructionListener.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionInstructionListener.java
new file mode 100644
index 0000000000..2b1e4a6375
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionInstructionListener.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.grpc.Metapb;
+import org.apache.hugegraph.pd.grpc.pulse.ChangeShard;
+import org.apache.hugegraph.pd.grpc.pulse.CleanPartition;
+import org.apache.hugegraph.pd.grpc.pulse.DbCompaction;
+import org.apache.hugegraph.pd.grpc.pulse.MovePartition;
+import org.apache.hugegraph.pd.grpc.pulse.PartitionKeyRange;
+import org.apache.hugegraph.pd.grpc.pulse.SplitPartition;
+import org.apache.hugegraph.pd.grpc.pulse.TransferLeader;
+
+/**
+ * Listener for partition instructions sent from PD to the stores.
+ */
+public interface PartitionInstructionListener {
+
+    void changeShard(Metapb.Partition partition, ChangeShard changeShard) throws PDException;
+
+    void transferLeader(Metapb.Partition partition,
+                        TransferLeader transferLeader) throws PDException;
+
+    void splitPartition(Metapb.Partition partition,
+                        SplitPartition splitPartition) throws PDException;
+
+    void dbCompaction(Metapb.Partition partition, DbCompaction dbCompaction) throws PDException;
+
+    void movePartition(Metapb.Partition partition,
+                       MovePartition movePartition) throws PDException;
+
+    void cleanPartition(Metapb.Partition partition,
+                        CleanPartition cleanPartition) throws PDException;
+
+    void changePartitionKeyRange(Metapb.Partition partition,
+                                 PartitionKeyRange partitionKeyRange) throws PDException;
+
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionService.java
new file mode 100644
index 0000000000..c8ec3e3e7d
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionService.java
@@ -0,0 +1,1562 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hugegraph.pd; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hugegraph.pd.common.KVPair; +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.common.PartitionUtils; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.MetaTask; +import org.apache.hugegraph.pd.grpc.Metapb; +import org.apache.hugegraph.pd.grpc.Pdpb; +import org.apache.hugegraph.pd.grpc.pulse.ChangeShard; +import org.apache.hugegraph.pd.grpc.pulse.CleanPartition; +import org.apache.hugegraph.pd.grpc.pulse.CleanType; +import org.apache.hugegraph.pd.grpc.pulse.ConfChangeType; +import org.apache.hugegraph.pd.grpc.pulse.DbCompaction; +import org.apache.hugegraph.pd.grpc.pulse.MovePartition; +import org.apache.hugegraph.pd.grpc.pulse.PartitionKeyRange; +import org.apache.hugegraph.pd.grpc.pulse.SplitPartition; +import org.apache.hugegraph.pd.grpc.pulse.TransferLeader; +import org.apache.hugegraph.pd.meta.MetadataFactory; +import org.apache.hugegraph.pd.meta.PartitionMeta; +import org.apache.hugegraph.pd.meta.TaskInfoMeta; +import org.apache.hugegraph.pd.raft.RaftStateListener; + +import lombok.extern.slf4j.Slf4j; + +/** + * 分区管理 + */ +@Slf4j +public class PartitionService implements RaftStateListener { + + private final long Partition_Version_Skip = 0x0F; + private final StoreNodeService storeService; + private final PartitionMeta partitionMeta; + private final PDConfig pdConfig; + // 分区命令监听 + private final List instructionListeners; + + // 分区状态监听 + private final List statusListeners; + + public PartitionService(PDConfig config, StoreNodeService storeService) { + this.pdConfig = config; + this.storeService = storeService; + partitionMeta = MetadataFactory.newPartitionMeta(config); + instructionListeners = + Collections.synchronizedList(new ArrayList()); + statusListeners = Collections.synchronizedList(new ArrayList()); + } + + public void init() throws PDException { + partitionMeta.init(); + storeService.addStatusListener(new StoreStatusListener() { + @Override + public void onStoreStatusChanged(Metapb.Store store, Metapb.StoreState old, + Metapb.StoreState status) { + if (status == Metapb.StoreState.Tombstone) { + // Store被停机,通知所有该store所有分区,迁移数据 + storeOffline(store); + } + } + + @Override + public void onGraphChange(Metapb.Graph graph, + Metapb.GraphState stateOld, + Metapb.GraphState stateNew) { + + } + + @Override + public void onStoreRaftChanged(Metapb.Store store) { + + } + }); + } + + /** + * 返回Key所属的partition + * + * @param graphName + * @param key + * @return + */ + public Metapb.PartitionShard getPartitionShard(String graphName, byte[] key) throws + PDException { + long code = PartitionUtils.calcHashcode(key); + return getPartitionByCode(graphName, code); + } + + /** + * 根据hashcode返回所属的partition + * + * @param graphName + * @param code + * @return + */ + public Metapb.PartitionShard getPartitionByCode(String graphName, long code) throws + PDException { + if (code < 0 || code >= PartitionUtils.MAX_VALUE) { + throw new PDException(Pdpb.ErrorType.NOT_FOUND_VALUE, "code error"); + } + // 根据Code查找分区id,如果没有找到,创建新的分区 + Metapb.Partition partition = partitionMeta.getPartitionByCode(graphName, code); + + if (partition == null) { + synchronized (this) { + if (partition 
== null) { + partition = newPartition(graphName, code); + } + } + } + + Metapb.PartitionShard partShard = Metapb.PartitionShard.newBuilder() + .setPartition(partition) + .setLeader(storeService.getLeader( + partition, 0)) + .build(); + log.debug( + "{} Partition get code = {}, partition id = {}, start = {}, end = {}, leader = {}", + graphName, (code), partition.getId(), partition.getStartKey(), + partition.getEndKey(), partShard.getLeader()); + + return partShard; + } + + /** + * 根据ID返回分区信息 + * + * @param graphName + * @param partId + * @return + * @throws PDException + */ + public Metapb.PartitionShard getPartitionShardById(String graphName, int partId) throws + PDException { + Metapb.Partition partition = partitionMeta.getPartitionById(graphName, partId); + if (partition == null) { + return null; + } + + Metapb.PartitionShard partShard = Metapb.PartitionShard.newBuilder() + .setPartition(partition) + // 此处需要返回正确的leader,暂时默认取第一个 + .setLeader(storeService.getLeader( + partition, 0)) + .build(); + + return partShard; + } + + public Metapb.Partition getPartitionById(String graphName, int partId) throws PDException { + return partitionMeta.getPartitionById(graphName, partId); + } + + public List getPartitionById(int partId) throws PDException { + return partitionMeta.getPartitionById(partId); + } + + /** + * 获取图的所有分区 + */ + public List getPartitions() { + return partitionMeta.getPartitions(); + } + + public List getPartitions(String graphName) { + if (StringUtils.isAllEmpty(graphName)) { + return partitionMeta.getPartitions(); + } + return partitionMeta.getPartitions(graphName); + } + + /** + * 查找在store上的所有分区 + * + * @param store + * @return + */ + public List getPartitionByStore(Metapb.Store store) throws PDException { + List partitions = new ArrayList<>(); + getGraphs().forEach(graph -> { + getPartitions(graph.getGraphName()).forEach(partition -> { + try { + storeService.getShardGroup(partition.getId()).getShardsList().forEach(shard -> { + if (shard.getStoreId() == store.getId()) { + partitions.add(partition); + } + }); + } catch (PDException e) { + throw new RuntimeException(e); + } + }); + }); + return partitions; + } + + /** + * 产生一个新的分区 + * + * @param graphName + * @return + */ + private Metapb.Partition newPartition(String graphName, long code) throws PDException { + Metapb.Graph graph = partitionMeta.getAndCreateGraph(graphName); + int partitionSize = PartitionUtils.MAX_VALUE / graph.getPartitionCount(); + if (PartitionUtils.MAX_VALUE % graph.getPartitionCount() != 0) { + // 有余数,分区除不尽 + partitionSize++; + } + + int partitionId = (int) (code / partitionSize); + long startKey = (long) partitionSize * partitionId; + long endKey = (long) partitionSize * (partitionId + 1); + + // 检查本地 + Metapb.Partition partition = partitionMeta.getPartitionById(graphName, partitionId); + if (partition == null) { + storeService.allocShards(null, partitionId); + + // 分配store + partition = Metapb.Partition.newBuilder() + .setId(partitionId) + .setVersion(0) + .setState(Metapb.PartitionState.PState_Normal) + .setStartKey(startKey) + .setEndKey(endKey) + .setGraphName(graphName) + .build(); + + log.info("Create newPartition {}", partition); + } + + partitionMeta.updatePartition(partition); + + return partition; + } + + /** + * 计算Key所属的分区,此处采用Hash映射的方法。 + * + * @param graphName + * @param key + * @return + */ + protected int getPartitionId(String graphName, byte[] key) throws PDException { + int code = PartitionUtils.calcHashcode(key); + Metapb.Partition partition = partitionMeta.getPartitionByCode(graphName, 
code); + return partition != null ? partition.getId() : -1; + } + + /** + * 获取key范围所跨越的所有分区 + * 暂时使用hashcode计算,正常做法,基于key进行查询 + * + * @param graphName + * @param startKey + * @param endKey + */ + public List scanPartitions(String graphName, byte[] startKey, + byte[] endKey) + throws PDException { + int startPartId = getPartitionId(graphName, startKey); + int endPartId = getPartitionId(graphName, endKey); + + List partShards = new ArrayList<>(); + for (int id = startPartId; id <= endPartId; id++) { + Metapb.Partition partition = partitionMeta.getPartitionById(graphName, id); + partShards.add( + Metapb.PartitionShard.newBuilder() + .setPartition(partition) + // 此处需要返回正确的leader,暂时默认取第一个 + .setLeader(storeService.getLeader(partition, 0)) + .build() + ); + } + return partShards; + } + + public synchronized long updatePartition(List partitions) throws PDException { + for (Metapb.Partition pt : partitions) { + Metapb.Partition oldPt = getPartitionById(pt.getGraphName(), pt.getId()); + partitionMeta.updatePartition(pt); + onPartitionChanged(oldPt, pt); + } + return partitions.size(); + } + + /** + * 更新分区以及图的状态 + * + * @param graph + * @param partId + * @param state + * @throws PDException + */ + public synchronized void updatePartitionState(String graph, int partId, + Metapb.PartitionState state) throws PDException { + Metapb.Partition partition = getPartitionById(graph, partId); + + if (partition.getState() != state) { + Metapb.Partition newPartition = partitionMeta.updatePartition(partition.toBuilder() + .setState(state) + .build()); + + onPartitionChanged(partition, newPartition); + } + } + + public synchronized void updateGraphState(String graphName, Metapb.PartitionState state) throws + PDException { + Metapb.Graph graph = getGraph(graphName); + if (graph != null) { + partitionMeta.updateGraph(graph.toBuilder() + .setState(state).build()); + } + } + + public synchronized long removePartition(String graphName, int partId) throws PDException { + log.info("Partition {}-{} removePartition", graphName, partId); + Metapb.Partition partition = partitionMeta.getPartitionById(graphName, partId); + var ret = partitionMeta.removePartition(graphName, partId); + partitionMeta.reload(); + onPartitionRemoved(partition); + + // source中有些是 offline的,删除后,需要更新图的状态 + try { + Metapb.PartitionState state = Metapb.PartitionState.PState_Normal; + for (Metapb.Partition pt : partitionMeta.getPartitions(partition.getGraphName())) { + if (pt.getState().getNumber() > state.getNumber()) { + state = pt.getState(); + } + } + updateGraphState(partition.getGraphName(), state); + + state = Metapb.PartitionState.PState_Normal; + for (Metapb.ShardGroup group : storeService.getShardGroups()) { + if (group.getState().getNumber() > state.getNumber()) { + state = group.getState(); + } + } + storeService.updateClusterStatus(state); + + } catch (PDException e) { + log.error("onPartitionChanged", e); + } + + return ret; + } + + public Metapb.PartitionStats getPartitionStats(String graphName, int partitionId) throws + PDException { + return partitionMeta.getPartitionStats(graphName, partitionId); + } + + /** + * 获取图的分区状态 + */ + public List getPartitionStatus(String graphName) + throws PDException { + return partitionMeta.getPartitionStats(graphName); + } + + /** + * 返回图的信息 + */ + public List getGraphs() throws PDException { + return partitionMeta.getGraphs(); + } + + public Metapb.Graph getGraph(String graphName) throws PDException { + return partitionMeta.getGraph(graphName); + } + + /** + * 删除图以及图的所有分区 + */ + public Metapb.Graph 
delGraph(String graphName) throws PDException { + log.info("delGraph {}", graphName); + Metapb.Graph graph = getGraph(graphName); + getPartitions(graphName).forEach(partition -> { + onPartitionRemoved(partition); + }); + partitionMeta.removeAllPartitions(graphName); + partitionMeta.removeGraph(graphName); + return graph; + } + + /** + * 修改图信息,需要通知到store + */ + public synchronized Metapb.Graph updateGraph(Metapb.Graph graph) throws PDException { + Metapb.Graph lastGraph = partitionMeta.getAndCreateGraph(graph.getGraphName()); + log.info("updateGraph graph: {}, last: {}", graph, lastGraph); + + int partCount = + (graph.getGraphName().endsWith("/s") || graph.getGraphName().endsWith("/m")) ? + 1 : pdConfig.getPartition().getTotalCount(); + + // set the partition count to specified if legal. + if (graph.getPartitionCount() <= partCount && graph.getPartitionCount() > 0) { + partCount = graph.getPartitionCount(); + } + + if (partCount == 0) { + throw new PDException(10010, "update graph error, partition count = 0"); + } + + graph = lastGraph.toBuilder() + .mergeFrom(graph) + .setPartitionCount(partCount) + .build(); + partitionMeta.updateGraph(graph); + + // 分区数发生改变 + if (lastGraph.getPartitionCount() != graph.getPartitionCount()) { + log.info("updateGraph graph: {}, partition count changed from {} to {}", + graph.getGraphName(), lastGraph.getPartitionCount(), + graph.getPartitionCount()); + // TODO 修改图的分区数,需要进行数据迁移。 + } + return graph; + } + + // partitionId -> (storeId -> shard committedIndex) + public Map> getCommittedIndexStats() throws PDException { + Map> map = new HashMap<>(); + for (Metapb.Store store : storeService.getActiveStores()) { + for (Metapb.RaftStats raftStats : store.getStats().getRaftStatsList()) { + int partitionID = raftStats.getPartitionId(); + if (!map.containsKey(partitionID)) { + map.put(partitionID, new HashMap<>()); + } + Map storeMap = map.get(partitionID); + if (!storeMap.containsKey(store.getId())) { + storeMap.put(store.getId(), raftStats.getCommittedIndex()); + } + } + } + return map; + } + + /** + * 存储被下线,迁移分区数据 + * + * @param store + */ + public void storeOffline(Metapb.Store store) { + try { + log.info("storeOffline store id: {}, address: {}, state: {}", + store.getId(), store.getAddress(), store.getState()); + List partitions = getPartitionByStore(store); + var partIds = new HashSet(); + for (Metapb.Partition p : partitions) { + if (partIds.contains(p.getId())) { + continue; + } + shardOffline(p, store.getId()); + partIds.add(p.getId()); + } + } catch (PDException e) { + log.error("storeOffline exception: ", e); + } + } + + /** + * 存储被下线,迁移分区数据 + */ + public synchronized void shardOffline(Metapb.Partition partition, long storeId) { + try { + log.info("shardOffline Partition {} - {} shardOffline store : {}", + partition.getGraphName(), partition.getId(), storeId); + // partition = getPartitionById(partition.getGraphName(), partition.getId()); + // Metapb.Partition.Builder builder = Metapb.Partition.newBuilder(partition); + // builder.clearShards(); + // partition.getShardsList().forEach(shard -> { + // if (shard.getStoreId() != storeId) + // builder.addShards(shard); + // }); + // partition = builder.build(); + Metapb.Graph graph = getGraph(partition.getGraphName()); + reallocPartitionShards(graph, partition); + + } catch (PDException e) { + log.error("storeOffline exception: ", e); + } + } + + private boolean isShardListEquals(List list1, List list2) { + if (list1 == list2) { + return true; + } else if (list1 != null && list2 != null) { + + var s1 = 
list1.stream().map(Metapb.Shard::getStoreId).sorted(Long::compare) + .collect(Collectors.toList()); + var s2 = list2.stream().map(Metapb.Shard::getStoreId).sorted(Long::compare) + .collect(Collectors.toList()); + + if (s1.size() == s2.size()) { + for (int i = 0; i < s1.size(); i++) { + if (s1.get(i) != s2.get(i)) { + return false; + } + } + return true; + } + } + + return false; + } + + /** + * 重新分配shard + * + * @param graph + * @param partition + * @throws PDException + */ + public void reallocPartitionShards(Metapb.Graph graph, Metapb.Partition partition) throws + PDException { + if (partition == null) { + return; + } + List originalShards = storeService.getShardList(partition.getId()); + + var shardGroup = storeService.getShardGroup(partition.getId()); + + List shards = storeService.reallocShards(shardGroup); + + if (isShardListEquals(originalShards, shards)) { + log.info("reallocPartitionShards:{} vs {}", shardGroup, shards); + // partition = Metapb.Partition.newBuilder(partition) + // .clearShards().addAllShards(shards) + // .build(); + // partitionMeta.updatePartition(partition); + fireChangeShard(partition, shards, ConfChangeType.CONF_CHANGE_TYPE_ADJUST); + } + } + + public synchronized void reallocPartitionShards(String graphName, int partitionId) throws + PDException { + reallocPartitionShards(partitionMeta.getGraph(graphName), + partitionMeta.getPartitionById(graphName, partitionId)); + } + + /** + * 迁移分区副本 + */ + public synchronized void movePartitionsShard(Integer partitionId, long fromStore, + long toStore) { + try { + log.info("movePartitionsShard partitionId {} from store {} to store {}", partitionId, + fromStore, toStore); + for (Metapb.Graph graph : getGraphs()) { + Metapb.Partition partition = + this.getPartitionById(graph.getGraphName(), partitionId); + if (partition == null) { + continue; + } + + var shardGroup = storeService.getShardGroup(partitionId); + List shards = new ArrayList<>(); + shardGroup.getShardsList().forEach(shard -> { + if (shard.getStoreId() != fromStore) { + shards.add(shard); + } + }); + + shards.add(Metapb.Shard.newBuilder().setStoreId(toStore) + .setRole(Metapb.ShardRole.Follower).build()); + + // storeService.updateShardGroup(partitionId, shards, -1, -1); + // storeService.onShardGroupStatusChanged(shardGroup, newShardGroup); + fireChangeShard(partition, shards, ConfChangeType.CONF_CHANGE_TYPE_ADJUST); + // shard group和 graph无关,迁移一个就够了 + break; + } + } catch (PDException e) { + log.error("Partition {} movePartitionsShard exception {}", partitionId, e); + } + } + + /** + * 把集群中所有的分区,拆成split + * + * @param splits 拆分分区 + */ + public synchronized void splitPartition(List> splits) throws + PDException { + var tasks = new HashMap>>(); + + for (var pair : splits) { + for (var partition : getPartitionById(pair.getKey())) { + if (!tasks.containsKey(partition.getGraphName())) { + tasks.put(partition.getGraphName(), new ArrayList<>()); + } + tasks.get(partition.getGraphName()).add(pair); + } + } + + for (var entry : tasks.entrySet()) { + splitPartition(getGraph(entry.getKey()), entry.getValue()); + } + } + + /** + * 分区分裂, 把一个图拆分到N 个 + * + * @param graph graph + * @param toCount target count + * @throws PDException + */ + + public synchronized void splitPartition(Metapb.Graph graph, int toCount) throws PDException { + + var partitionCount = getPartitions(graph.getGraphName()).size(); + var maxShardsPerStore = pdConfig.getPartition().getMaxShardsPerStore(); + var shardCount = pdConfig.getPartition().getShardCount(); + + if (shardCount * toCount > 
storeService.getActiveStores().size() * maxShardsPerStore) { + throw new PDException(Pdpb.ErrorType.Too_Many_Partitions_Per_Store_VALUE, + "can't satisfy target shard group count, reached the upper " + + "limit of the cluster"); + } + + if (toCount % partitionCount != 0 || toCount <= partitionCount) { + throw new PDException(Pdpb.ErrorType.Invalid_Split_Partition_Count_VALUE, + "invalid split partition count, make sure to count is N time of" + + " current partition count"); + } + + // 由于是整数倍数,扩充因子为 toCount / current count + var splitCount = toCount / partitionCount; + var list = new ArrayList>(); + for (int i = 0; i < partitionCount; i++) { + list.add(new KVPair<>(i, splitCount)); + } + + splitPartition(graph, list); + } + + private synchronized void splitPartition(Metapb.Graph graph, + List> splits) + throws PDException { + var taskInfoMeta = storeService.getTaskInfoMeta(); + if (taskInfoMeta.scanSplitTask(graph.getGraphName()).size() > 0) { + return; + } + + splits.sort(Comparator.comparing(KVPair::getKey)); + log.info("split partition, graph: {}, splits:{}", graph, splits); + + // 从最后一个partition下标开始 + var i = getPartitions(graph.getGraphName()).size(); + + for (var pair : splits) { + Metapb.Partition partition = + partitionMeta.getPartitionById(graph.getGraphName(), pair.getKey()); + if (partition != null) { + var splitCount = pair.getValue(); + long splitLen = (partition.getEndKey() - partition.getStartKey()) / splitCount; + + List newPartitions = new ArrayList<>(); + // 第一个分区也就是原分区 + newPartitions.add(partition.toBuilder() + .setStartKey(partition.getStartKey()) + .setEndKey(partition.getStartKey() + splitLen) + .setId(partition.getId()) + .setState(Metapb.PartitionState.PState_Offline) + .build()); + + int idx = 0; + + for (; idx < splitCount - 2; idx++) { + newPartitions.add(partition.toBuilder() + .setStartKey(newPartitions.get(idx).getEndKey()) + .setEndKey(newPartitions.get(idx).getEndKey() + + splitLen) + .setId(i) + .setState(Metapb.PartitionState.PState_Offline) + .build()); + i += 1; + } + + newPartitions.add(partition.toBuilder() + .setStartKey(newPartitions.get(idx).getEndKey()) + .setEndKey(partition.getEndKey()) + .setId(i) + .setState(Metapb.PartitionState.PState_Offline) + .build()); + i += 1; + + // try to save new partitions, and repair shard group + for (int j = 0; j < newPartitions.size(); j++) { + var newPartition = newPartitions.get(j); + + if (j != 0) { + partitionMeta.updatePartition(newPartition); + } + // 创建shard group,如果为空,则按照partition的shard group为蓝本,去创建,保证在一个机器上 + // 如果存在,则由于各个图的分区数量不一样,需要store端复制到其他机器上 + var shardGroup = storeService.getShardGroup(newPartition.getId()); + if (shardGroup == null) { + shardGroup = storeService.getShardGroup(partition.getId()).toBuilder() + .setId(newPartition.getId()) + .build(); + storeService.getStoreInfoMeta().updateShardGroup(shardGroup); + updateShardGroupCache(shardGroup); + } + + // 做shard list的检查 + if (shardGroup.getShardsCount() != pdConfig.getPartition().getShardCount()) { + storeService.reallocShards(shardGroup); + } + } + + SplitPartition splitPartition = SplitPartition.newBuilder() + .addAllNewPartition(newPartitions) + .build(); + + fireSplitPartition(partition, splitPartition); + // 修改Partition状态为下线,任务完成后恢复为上线 + updatePartitionState(partition.getGraphName(), partition.getId(), + Metapb.PartitionState.PState_Offline); + + // 记录事务 + var task = MetaTask.Task.newBuilder().setPartition(partition) + .setSplitPartition(splitPartition) + .build(); + taskInfoMeta.addSplitTask(pair.getKey(), task.getPartition(), + 
task.getSplitPartition()); + } + } + } + + /** + * 转移leader到其他shard上. + * 转移一个partition即可 + */ + public void transferLeader(Integer partId, Metapb.Shard shard) { + try { + var partitions = getPartitionById(partId); + if (partitions.size() > 0) { + fireTransferLeader(partitions.get(0), + TransferLeader.newBuilder().setShard(shard).build()); + } +// for (Metapb.Graph graph : getGraphs()) { +// Metapb.Partition partition = this.getPartitionById(graph.getGraphName(), partId); +// if (partition != null) { +// fireTransferLeader(partition, TransferLeader.newBuilder().setShard(shard) +// .build()); +// } +// } + } catch (PDException e) { + log.error("Partition {} transferLeader exception {}", partId, e); + } + } + + /** + * 分区合并,将整个集群的分区数,合并到toCount个 + * + * @param toCount 目标分区数 + * @throws PDException when query errors + */ + public void combinePartition(int toCount) throws PDException { + + int shardsTotalCount = getShardGroupCount(); + for (var graph : getGraphs()) { + // 对所有大于toCount分区的图,都进行缩容 + if (graph.getPartitionCount() > toCount) { + combineGraphPartition(graph, toCount, shardsTotalCount); + } + } + } + + /** + * 针对单个图,进行分区合并 + * + * @param graphName the name of the graph + * @param toCount the target partition count + * @throws PDException when query errors + */ + + public void combineGraphPartition(String graphName, int toCount) throws PDException { + combineGraphPartition(getGraph(graphName), toCount, getShardGroupCount()); + } + + /** + * 单图合并的内部实现 + * + * @param graph the name of the graph + * @param toCount the target partition count + * @param shardCount the shard count of the clusters + * @throws PDException when query errors + */ + private synchronized void combineGraphPartition(Metapb.Graph graph, int toCount, int shardCount) + throws PDException { + if (graph == null) { + throw new PDException(1, + "Graph not exists, try to use full graph name, like " + + "/DEFAULT/GRAPH_NAME/g"); + } + + log.info("Combine graph {} partition, from {}, to {}, with shard count:{}", + graph.getGraphName(), graph.getPartitionCount(), toCount, shardCount); + + if (!checkTargetCount(graph.getPartitionCount(), toCount, shardCount)) { + log.error("Combine partition, illegal toCount:{}, graph:{}", toCount, + graph.getGraphName()); + throw new PDException(2, + "illegal partition toCount, should between 1 ~ shard group " + + "count and " + + " can be dived by shard group count"); + } + + var taskInfoMeta = storeService.getTaskInfoMeta(); + if (taskInfoMeta.scanMoveTask(graph.getGraphName()).size() > 0) { + throw new PDException(3, "Graph Combine process exists"); + } + + // 按照 key start 排序,合并后的key range 是连续的 + var partitions = getPartitions(graph.getGraphName()).stream() + .sorted(Comparator.comparing( + Metapb.Partition::getStartKey)) + .collect(Collectors.toList()); + + // 分区编号不一定是连续的 + var sortPartitions = getPartitions(graph.getGraphName()) + .stream() + .sorted(Comparator.comparing(Metapb.Partition::getId)) + .collect(Collectors.toList()); + + var groupSize = partitions.size() / toCount; // merge group size + // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 共12个分区, 合并成4个 + // 方案:0,1,2 => 0, 3,4,5 -> 1, 6,7,8 ->2, 9,10,11 -> 3 + // 保证分区的连续性. + for (int i = 0; i < toCount; i++) { + var startKey = partitions.get(i * groupSize).getStartKey(); + var endKey = partitions.get(i * groupSize + groupSize - 1).getEndKey(); + // compose the key range + // the start key and end key should be changed if combine success. 
+ + var targetPartition = Metapb.Partition.newBuilder(sortPartitions.get(i)) + .setStartKey(startKey) + .setEndKey(endKey) + .build(); + + for (int j = 0; j < groupSize; j++) { + var partition = partitions.get(i * groupSize + j); + // 分区id相同,就跳过 + if (i == partition.getId()) { + continue; + } + + log.info("combine partition of graph :{}, from part id {} to {}", + partition.getGraphName(), + partition.getId(), targetPartition.getId()); + MovePartition movePartition = MovePartition.newBuilder() + .setTargetPartition(targetPartition) + .setKeyStart(partition.getStartKey()) + .setKeyEnd(partition.getEndKey()) + .build(); + taskInfoMeta.addMovePartitionTask(partition, movePartition); + // source 下线 + updatePartitionState(partition.getGraphName(), partition.getId(), + Metapb.PartitionState.PState_Offline); + fireMovePartition(partition, movePartition); + } + // target 下线 + updatePartitionState(targetPartition.getGraphName(), targetPartition.getId(), + Metapb.PartitionState.PState_Offline); + } + + storeService.updateClusterStatus(Metapb.ClusterState.Cluster_Offline); + } + + /** + * 通过 storeService 获取 raft group 总数 + * + * @return the count of raft groups + */ + private int getShardGroupCount() { + try { + return Optional.ofNullable(storeService.getShardGroups()).orElseGet(ArrayList::new) + .size(); + } catch (PDException e) { + log.error("get shard group failed, error: {}", e); + } + return 0; + } + + /** + * 判断图分区是否能够从from合并到to个 + * + * @param fromCount 现在的分区数 + * @param toCount 目标分区数 + * @return true when available , or otherwise + */ + private boolean checkTargetCount(int fromCount, int toCount, int shardCount) { + // 要介于 1 ~ N 中间,而且可以整除 + return toCount >= 1 && toCount < fromCount && fromCount % toCount == 0 && + toCount < shardCount; + } + + /** + * 处理分区心跳, 记录Leader信息 + * 检查term和version,比较是否是最新的消息 + * + * @param stats + */ + public void partitionHeartbeat(Metapb.PartitionStats stats) throws PDException { + + Metapb.ShardGroup shardGroup = storeService.getShardGroup(stats.getId()); + // shard group version changes + // (shard group 由pd控制, 在分裂等操作后,可能出现短暂不一致的情况,以pd为准) + // store控制shard leader + if (shardGroup != null && + (shardGroup.getVersion() < stats.getLeaderTerm() || + shardGroup.getConfVer() < stats.getConfVer())) { + storeService.updateShardGroup(stats.getId(), + stats.getShardList(), stats.getLeaderTerm(), + stats.getConfVer()); + } + + List partitions = getPartitionById(stats.getId()); + for (Metapb.Partition partition : partitions) { + // partitionMeta.getAndCreateGraph(partition.getGraphName()); + checkShardState(partition, stats); + } + // 统计信息 + partitionMeta.updatePartitionStats(stats.toBuilder() + .setTimestamp(System.currentTimeMillis()).build()); + } + + /** + * 检查shard状态,离线shard影响到分区状态 + * + * @param stats + */ + private void checkShardState(Metapb.Partition partition, Metapb.PartitionStats stats) { + + try { + int offCount = 0; + for (Metapb.ShardStats shard : stats.getShardStatsList()) { + if (shard.getState() == Metapb.ShardState.SState_Offline) { + offCount++; + } + } + if (partition.getState() != Metapb.PartitionState.PState_Offline) { + if (offCount == 0) { + updatePartitionState(partition.getGraphName(), partition.getId(), + Metapb.PartitionState.PState_Normal); + } else if (offCount * 2 < stats.getShardStatsCount()) { + updatePartitionState(partition.getGraphName(), partition.getId(), + Metapb.PartitionState.PState_Warn); + } else { + updatePartitionState(partition.getGraphName(), partition.getId(), + Metapb.PartitionState.PState_Warn); + } + } + } catch (Exception 
e) { + log.error("Partition {}-{} checkShardState exception {}", + partition.getGraphName(), partition.getId(), e); + } + } + + public void addInstructionListener(PartitionInstructionListener event) { + instructionListeners.add(event); + } + + public void addStatusListener(PartitionStatusListener listener) { + statusListeners.add(listener); + } + + /** + * 发起改变shard命令 + * + * @param changeType + */ + protected void fireChangeShard(Metapb.Partition partition, List shards, + ConfChangeType changeType) { + log.info("fireChangeShard partition: {}-{}, changeType:{} {}", partition.getGraphName(), + partition.getId(), changeType, shards); + instructionListeners.forEach(cmd -> { + try { + cmd.changeShard(partition, ChangeShard.newBuilder() + .addAllShard(shards).setChangeType(changeType) + .build()); + } catch (Exception e) { + log.error("fireChangeShard", e); + } + }); + } + + public void changeShard(int groupId, List shards) throws PDException { + var partitions = getPartitionById(groupId); + if (partitions.size() == 0) { + return; + } + fireChangeShard(partitions.get(0), shards, ConfChangeType.CONF_CHANGE_TYPE_ADJUST); + } + + /** + * 发送分区分裂消息 + * + * @param partition + */ + protected void fireSplitPartition(Metapb.Partition partition, SplitPartition splitPartition) { + log.info("fireSplitPartition partition: {}-{}, split :{}", + partition.getGraphName(), partition.getId(), splitPartition); + instructionListeners.forEach(cmd -> { + try { + cmd.splitPartition(partition, splitPartition); + } catch (Exception e) { + log.error("fireSplitPartition", e); + } + }); + } + + /** + * 发送Leader切换消息 + */ + protected void fireTransferLeader(Metapb.Partition partition, TransferLeader transferLeader) { + log.info("fireTransferLeader partition: {}-{}, leader :{}", + partition.getGraphName(), partition.getId(), transferLeader); + instructionListeners.forEach(cmd -> { + try { + cmd.transferLeader(partition, transferLeader); + } catch (Exception e) { + log.error("fireSplitPartition", e); + } + }); + } + + /** + * 发送分区移动数据的消息 + * + * @param partition 原分区 + * @param movePartition 目标分区,包含 key range + */ + protected void fireMovePartition(Metapb.Partition partition, MovePartition movePartition) { + log.info("fireMovePartition partition: {} -> {}", + partition, movePartition); + + instructionListeners.forEach(cmd -> { + try { + cmd.movePartition(partition, movePartition); + } catch (Exception e) { + log.error("fireMovePartition", e); + } + }); + } + + protected void fireCleanPartition(Metapb.Partition partition, CleanPartition cleanPartition) { + log.info("fireCleanPartition partition: {} -> just keep : {}->{}", + partition.getId(), cleanPartition.getKeyStart(), cleanPartition.getKeyEnd()); + + instructionListeners.forEach(cmd -> { + try { + cmd.cleanPartition(partition, cleanPartition); + } catch (Exception e) { + log.error("cleanPartition", e); + } + }); + } + + protected void fireChangePartitionKeyRange(Metapb.Partition partition, + PartitionKeyRange partitionKeyRange) { + log.info("fireChangePartitionKeyRange partition: {}-{} -> key range {}", + partition.getGraphName(), partition.getId(), partitionKeyRange); + + instructionListeners.forEach(cmd -> { + try { + cmd.changePartitionKeyRange(partition, partitionKeyRange); + } catch (Exception e) { + log.error("cleanPartition", e); + } + }); + } + + /** + * 处理图迁移任务 + * + * @param task + */ + public synchronized void handleMoveTask(MetaTask.Task task) throws PDException { + var taskInfoMeta = storeService.getTaskInfoMeta(); + var partition = task.getPartition(); + var 
movePartition = task.getMovePartition(); + + MetaTask.Task pdMetaTask = taskInfoMeta.getMovePartitionTask(partition.getGraphName(), + movePartition.getTargetPartition() + .getId(), + partition.getId()); + + log.info("report move task, graph:{}, pid : {}->{}, state: {}", + task.getPartition().getGraphName(), + task.getPartition().getId(), task.getMovePartition().getTargetPartition().getId(), + task.getState()); + + // 已经被处理(前面有failed) + if (pdMetaTask != null) { + var newTask = pdMetaTask.toBuilder().setState(task.getState()).build(); + taskInfoMeta.updateMovePartitionTask(newTask); + + List subTasks = taskInfoMeta.scanMoveTask(partition.getGraphName()); + + var finished = subTasks.stream().allMatch(t -> + t.getState() == + MetaTask.TaskState.Task_Success || + t.getState() == + MetaTask.TaskState.Task_Failure); + + if (finished) { + var allSuccess = subTasks.stream().allMatch( + t -> t.getState() == MetaTask.TaskState.Task_Success); + if (allSuccess) { + log.info("graph:{} combine task all success!", partition.getGraphName()); + handleMoveTaskAllSuccess(subTasks, partition.getGraphName(), taskInfoMeta); + } else { + log.info("graph:{} combine task failed!", partition.getGraphName()); + handleMoveTaskIfFailed(partition.getGraphName(), taskInfoMeta); + } + } + } + } + + /** + * 当所有的迁移子任务成功: + * 1. 发送清理source分区指令 + * 2. 设置target上线, 更新key range, 更新 graph partition count + * 3. 删除move task,任务结束 + * + * @param subTasks all move sub tasks + * @param graphName graph name + * @param taskInfoMeta task info meta + * @throws PDException returns if write db failed + */ + private void handleMoveTaskAllSuccess(List subTasks, String graphName, + TaskInfoMeta taskInfoMeta) throws PDException { + + var targetPartitionIds = new HashSet(); + var targetPartitions = new ArrayList(); + var deleteFlags = + subTasks.stream().map(task -> task.getMovePartition().getTargetPartition().getId()) + .collect(Collectors.toSet()); + + for (MetaTask.Task subTask : subTasks) { + var source = subTask.getPartition(); + var targetPartition = subTask.getMovePartition().getTargetPartition(); + // 是否处理过 + if (!targetPartitionIds.contains(targetPartition.getId())) { + // 更新range + var old = getPartitionById(targetPartition.getGraphName(), targetPartition.getId()); + var newPartition = Metapb.Partition.newBuilder(old) + .setStartKey(targetPartition.getStartKey()) + .setEndKey(targetPartition.getEndKey()) + .setState(Metapb.PartitionState.PState_Normal) + .build(); + // 在 key range之前更新,避免store没有分区的问题, 需要到pd查询 + updatePartition(List.of(newPartition)); + targetPartitions.add(newPartition); + + // 发送key range 变更消息 + PartitionKeyRange partitionKeyRange = PartitionKeyRange.newBuilder() + .setPartitionId(old.getId()) + .setKeyStart( + targetPartition.getStartKey()) + .setKeyEnd( + targetPartition.getEndKey()) + .build(); + // 通知store + fireChangePartitionKeyRange( + old.toBuilder().setState(Metapb.PartitionState.PState_Normal).build(), + partitionKeyRange); + + // 将 target 设置为上线. 
source 理论上可能被删掉,所以不处理 + updatePartitionState(newPartition.getGraphName(), newPartition.getId(), + Metapb.PartitionState.PState_Normal); + + targetPartitionIds.add(targetPartition.getId()); + } + + CleanPartition cleanPartition = CleanPartition.newBuilder() + .setKeyStart(source.getStartKey()) + .setKeyEnd(source.getEndKey()) + .setCleanType( + CleanType.CLEAN_TYPE_EXCLUDE_RANGE) + // target 的 partition只需要清理数据,不需要删除分区 + .setDeletePartition(!deleteFlags.contains( + source.getId())) + .build(); + + log.info("pd clean data: {}-{}, key range:{}-{}, type:{}, delete partition:{}", + source.getGraphName(), + source.getId(), + cleanPartition.getKeyStart(), + cleanPartition.getKeyEnd(), + CleanType.CLEAN_TYPE_EXCLUDE_RANGE, + cleanPartition.getDeletePartition()); + + // 清理掉被移动分区的数据 + fireCleanPartition(source, cleanPartition); + } + + // 更新key range, 本地更新,client更新 + // updatePartition(targetPartitions); + + // 更新target 分区状态,source 可能被删掉,所以不处理 + targetPartitions.forEach(p -> { + try { + updatePartitionState(p.getGraphName(), p.getId(), + Metapb.PartitionState.PState_Normal); + } catch (PDException e) { + throw new RuntimeException(e); + } + }); + + partitionMeta.reload(); + + // 更新graph partition count + var graph = getGraph(graphName).toBuilder() + .setPartitionCount(targetPartitionIds.size()) + .build(); + updateGraph(graph); + + // 事务完成 + taskInfoMeta.removeMoveTaskPrefix(graphName); + } + + /** + * 如果缩容任务有失败的,回滚合并操作 + * 1. 清理原来的target 分区,将迁移过来的数据再删掉 + * 2. 将source/target 分区设置为上线 + * 3. 删除task,任务结束 + * + * @param graphName graph name + * @param taskInfoMeta task info meta + * @throws PDException return if write to db failed + */ + private void handleMoveTaskIfFailed(String graphName, TaskInfoMeta taskInfoMeta) throws + PDException { + // 发送清理target分区的任务, 回滚target分区 + var targetPartitionIds = new HashSet(); + for (var metaTask : taskInfoMeta.scanMoveTask(graphName)) { + + var source = metaTask.getPartition(); + // 设置 source 为上线 + updatePartitionState(source.getGraphName(), source.getId(), + Metapb.PartitionState.PState_Normal); + var movedPartition = metaTask.getMovePartition().getTargetPartition(); + + if (targetPartitionIds.contains(movedPartition.getId())) { + continue; + } + + var targetPartition = getPartitionById(graphName, movedPartition.getId()); + + CleanPartition cleanPartition = CleanPartition.newBuilder() + .setKeyStart( + targetPartition.getStartKey()) + .setKeyEnd(targetPartition.getEndKey()) + .setCleanType( + CleanType.CLEAN_TYPE_KEEP_RANGE) + .setDeletePartition(false) + .build(); + fireCleanPartition(targetPartition, cleanPartition); + targetPartitionIds.add(targetPartition.getId()); + + // 设置target 上线 + updatePartitionState(targetPartition.getGraphName(), targetPartition.getId(), + Metapb.PartitionState.PState_Normal); + } + // 清理掉任务列表 + taskInfoMeta.removeMoveTaskPrefix(graphName); + } + + /** + * 处理clean task + * + * @param task clean task + */ + public void handleCleanPartitionTask(MetaTask.Task task) { + log.info("clean task {} -{}, key range:{}~{}, report: {}", + task.getPartition().getGraphName(), + task.getPartition().getId(), + task.getCleanPartition().getKeyStart(), + task.getCleanPartition().getKeyEnd(), + task.getState() + ); + + // 如果失败重试? 
+ } + + public synchronized void handleSplitTask(MetaTask.Task task) throws PDException { + + var taskInfoMeta = storeService.getTaskInfoMeta(); + var partition = task.getPartition(); + + MetaTask.Task pdMetaTask = + taskInfoMeta.getSplitTask(partition.getGraphName(), partition.getId()); + + log.info("report split task, graph:{}, pid : {}, state: {}", + task.getPartition().getGraphName(), + task.getPartition().getId(), task.getState()); + + if (pdMetaTask != null) { + var newTask = pdMetaTask.toBuilder().setState(task.getState()).build(); + taskInfoMeta.updateSplitTask(newTask); + + List subTasks = taskInfoMeta.scanSplitTask(partition.getGraphName()); + + var finished = subTasks.stream().allMatch(t -> + t.getState() == + MetaTask.TaskState.Task_Success || + t.getState() == + MetaTask.TaskState.Task_Failure); + + if (finished) { + var allSuccess = subTasks.stream().allMatch( + t -> t.getState() == MetaTask.TaskState.Task_Success); + if (allSuccess) { + log.info("graph:{} split task all success!", partition.getGraphName()); + handleSplitTaskAllSuccess(subTasks, partition.getGraphName(), taskInfoMeta); + } else { + handleSplitTaskIfFailed(subTasks, partition.getGraphName(), taskInfoMeta); + } + } + } + } + + private void handleSplitTaskAllSuccess(List subTasks, String graphName, + TaskInfoMeta taskInfoMeta) + throws PDException { + + int addedPartitions = 0; + var partitions = new ArrayList(); + for (MetaTask.Task subTask : subTasks) { + var source = subTask.getPartition(); + var newPartition = subTask.getSplitPartition().getNewPartitionList().get(0); + + // 发送key range 变更消息 + PartitionKeyRange partitionKeyRange = PartitionKeyRange.newBuilder() + .setPartitionId(source.getId()) + .setKeyStart( + newPartition.getStartKey()) + .setKeyEnd( + newPartition.getEndKey()) + .build(); + // 通知store + fireChangePartitionKeyRange(source, partitionKeyRange); + // 将 target 设置为上线. 
source 理论上可能被删掉,所以不处理 + + CleanPartition cleanPartition = CleanPartition.newBuilder() + .setKeyStart(newPartition.getStartKey()) + .setKeyEnd(newPartition.getEndKey()) + .setCleanType( + CleanType.CLEAN_TYPE_KEEP_RANGE) + // target 的 partition只需要清理数据,不需要删除分区 + .setDeletePartition(false) + .build(); + + log.info("pd clean data: {}-{}, key range:{}-{}, type:{}, delete partition:{}", + source.getGraphName(), + source.getId(), + cleanPartition.getKeyStart(), + cleanPartition.getKeyEnd(), + CleanType.CLEAN_TYPE_EXCLUDE_RANGE, + cleanPartition.getDeletePartition()); + + fireCleanPartition(source, cleanPartition); + + // 更新partition state + for (var sp : subTask.getSplitPartition().getNewPartitionList()) { + partitions.add( + sp.toBuilder().setState(Metapb.PartitionState.PState_Normal).build()); + } + + addedPartitions += subTask.getSplitPartition().getNewPartitionCount() - 1; + } + + updatePartition(partitions); + partitionMeta.reload(); + + var graph = getGraph(graphName); + + // set partition count + if (pdConfig.getConfigService().getPartitionCount() != + storeService.getShardGroups().size()) { + pdConfig.getConfigService().setPartitionCount(storeService.getShardGroups().size()); + log.info("set the partition count of config server to {}", + storeService.getShardGroups().size()); + } + + // 更新graph partition count + var newGraph = graph.toBuilder() + .setPartitionCount(graph.getPartitionCount() + addedPartitions) + .build(); + updateGraph(newGraph); + + // 事务完成 + taskInfoMeta.removeSplitTaskPrefix(graphName); + } + + private void handleSplitTaskIfFailed(List subTasks, String graphName, + TaskInfoMeta taskInfoMeta) + throws PDException { + for (var metaTask : subTasks) { + var splitPartitions = metaTask.getSplitPartition().getNewPartitionList(); + for (int i = 1; i < splitPartitions.size(); i++) { + var split = splitPartitions.get(i); + CleanPartition cleanPartition = CleanPartition.newBuilder() + .setKeyStart(split.getStartKey()) + .setKeyEnd(split.getEndKey()) + .setCleanType( + CleanType.CLEAN_TYPE_EXCLUDE_RANGE) + .setDeletePartition(true) + .build(); + + fireCleanPartition(split, cleanPartition); + } + + // set partition state normal + var partition = metaTask.getPartition(); + updatePartitionState(partition.getGraphName(), partition.getId(), + Metapb.PartitionState.PState_Normal); + } + // 清理掉任务列表 + taskInfoMeta.removeSplitTaskPrefix(graphName); + } + + /** + * 接收到Leader改变的消息 + * 更新图状态,触发分区变更 + */ + protected void onPartitionChanged(Metapb.Partition old, Metapb.Partition partition) { + log.info("onPartitionChanged partition: {}", partition); + if (old != null && old.getState() != partition.getState()) { + // 状态改变,重置图的状态 + Metapb.PartitionState state = Metapb.PartitionState.PState_Normal; + for (Metapb.Partition pt : partitionMeta.getPartitions(partition.getGraphName())) { + if (pt.getState().getNumber() > state.getNumber()) { + state = pt.getState(); + } + } + try { + updateGraphState(partition.getGraphName(), state); + } catch (PDException e) { + log.error("onPartitionChanged", e); + } + + } + + statusListeners.forEach(e -> { + e.onPartitionChanged(old, partition); + }); + } + + protected void onPartitionRemoved(Metapb.Partition partition) { + log.info("onPartitionRemoved partition: {}", partition); + statusListeners.forEach(e -> { + e.onPartitionRemoved(partition); + }); + } + + /** + * PD的leader发生改变,需要重新加载数据 + */ + @Override + public void onRaftLeaderChanged() { + log.info("Partition service reload cache from rocksdb, due to leader change"); + try { + partitionMeta.reload(); + } catch 
(PDException e) { + log.error("Partition meta reload exception {}", e); + } + } + + /** + * 分区状态发生改变,需要传播到图、集群 + * + * @param graph + * @param partId + * @param state + */ + public void onPartitionStateChanged(String graph, int partId, + Metapb.PartitionState state) throws PDException { + updatePartitionState(graph, partId, state); + } + + /** + * Shard状态发生改变,需要传播到分区、图、集群 + * + * @param graph + * @param partId + * @param state + */ + public void onShardStateChanged(String graph, int partId, Metapb.PartitionState state) { + + } + + /** + * 发送rocksdb compaction 消息 + * + * @param partId + * @param tableName + */ + public void fireDbCompaction(int partId, String tableName) { + + try { + for (Metapb.Graph graph : getGraphs()) { + Metapb.Partition partition = + partitionMeta.getPartitionById(graph.getGraphName(), partId); + + DbCompaction dbCompaction = DbCompaction.newBuilder() + .setTableName(tableName) + .build(); + instructionListeners.forEach(cmd -> { + try { + cmd.dbCompaction(partition, dbCompaction); + } catch (Exception e) { + log.error("firedbCompaction", e); + } + }); + } + } catch (PDException e) { + e.printStackTrace(); + } + + } + + public void updateShardGroupCache(Metapb.ShardGroup group) { + partitionMeta.getPartitionCache().updateShardGroup(group); + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionStatusListener.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionStatusListener.java new file mode 100644 index 0000000000..fea0ce35d9 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/PartitionStatusListener.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd; + +import org.apache.hugegraph.pd.grpc.Metapb; + +/** + * 分区状态监听 + */ +public interface PartitionStatusListener { + + void onPartitionChanged(Metapb.Partition partition, Metapb.Partition newPartition); + + void onPartitionRemoved(Metapb.Partition partition); +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/RegistryService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/RegistryService.java new file mode 100644 index 0000000000..86922d56d3 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/RegistryService.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd; + +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.discovery.NodeInfo; +import org.apache.hugegraph.pd.grpc.discovery.NodeInfos; +import org.apache.hugegraph.pd.grpc.discovery.Query; +import org.apache.hugegraph.pd.meta.DiscoveryMetaStore; +import org.apache.hugegraph.pd.meta.MetadataFactory; + +public class RegistryService { + + private final PDConfig pdConfig; + private final DiscoveryMetaStore meta; + + public RegistryService(PDConfig config) { + this.pdConfig = config; + meta = MetadataFactory.newDiscoveryMeta(config); + } + + public void register(NodeInfo nodeInfo, int outTimes) throws PDException { + meta.register(nodeInfo, outTimes); + } + + public NodeInfos getNodes(Query query) { + return meta.getNodes(query); + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/ShardGroupStatusListener.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/ShardGroupStatusListener.java new file mode 100644 index 0000000000..d5c068de94 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/ShardGroupStatusListener.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd; + +import org.apache.hugegraph.pd.grpc.Metapb; + +public interface ShardGroupStatusListener { + + void onShardListChanged(Metapb.ShardGroup shardGroup, Metapb.ShardGroup newShardGroup); + + void onShardListOp(Metapb.ShardGroup shardGroup); +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreMonitorDataService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreMonitorDataService.java new file mode 100644 index 0000000000..54ff6b6e8d --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreMonitorDataService.java @@ -0,0 +1,265 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.Metapb; +import org.apache.hugegraph.pd.meta.MetadataKeyHelper; +import org.springframework.stereotype.Service; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class StoreMonitorDataService { + + private static final String MONITOR_DATA_PREFIX = "SMD"; + private final PDConfig pdConfig; + private final KvService kvService; + /** + * the last timestamp of the store monitor data, + * used for determine the gap of store's heartbeat. + */ + private final Map lastStoreStateTimestamp; + + public StoreMonitorDataService(PDConfig pdConfig) { + this.pdConfig = pdConfig; + this.kvService = new KvService(pdConfig); + this.lastStoreStateTimestamp = new HashMap<>(); + } + + /** + * save the store stats + * + * @param storeStats + */ + public void saveMonitorData(Metapb.StoreStats storeStats) throws PDException { + long storeId = storeStats.getStoreId(); + /** + * load the latest store timestamp when start up or alter leader + */ + if (!lastStoreStateTimestamp.containsKey(storeId)) { + long lastTimestamp = getLatestStoreMonitorDataTimeStamp(storeId); + log.debug("store id : {}, last timestamp :{}", storeId, lastTimestamp); + lastStoreStateTimestamp.put(storeId, lastTimestamp); + } + + long current = System.currentTimeMillis() / 1000; + long interval = this.pdConfig.getStore().getMonitorInterval(); + + // exceed the interval + if (current - lastStoreStateTimestamp.getOrDefault(storeId, 0L) >= interval) { + saveMonitorDataToDb(storeStats, current); + log.debug("store id: {}, system info:{}", storeId, + debugMonitorInfo(storeStats.getSystemMetricsList())); + lastStoreStateTimestamp.put(storeId, current); + } + } + + /** + * save the snapshot of store status + * + * @param storeStats store status + * @param ts, timestamp + * @return store status + * @throws PDException + */ + private void saveMonitorDataToDb(Metapb.StoreStats storeStats, long ts) throws PDException { + String key = getMonitorDataKey(storeStats.getStoreId(), ts); + log.debug("store id: {}, save monitor data info, ts:{}, my key:{}", storeStats.getStoreId(), + ts, key); + kvService.put(key, extractMetricsFromStoreStatus(storeStats)); + } + + public String debugMonitorInfo(List systemInfo) { + StringBuilder sb = new StringBuilder(); + sb.append("["); + for (Metapb.RecordPair pair : systemInfo) { + sb.append(pair.getKey()); + sb.append(":"); + sb.append(pair.getValue()); + sb.append(","); + } + sb.append("]"); + return sb.toString(); + } + + /** + * get the historical monitor data by store id, by range(start, end) + * + * @param storeId store id + * @param start range start + * @param end range end + * @return list of store stats + */ + 
public Map getStoreMonitorData(long storeId, long start, long end) throws + PDException { + log.debug("get monitor data, store id:{}, start{}, end:{}", + storeId, + getMonitorDataKey(storeId, start), + getMonitorDataKey(storeId, end)); + return kvService.scanRange(getMonitorDataKey(storeId, start), + getMonitorDataKey(storeId, end)); + } + + /** + * for api service + * + * @param storeId + * @return + * @throws PDException + */ + public List> getStoreMonitorData(long storeId) throws PDException { + List> result = new LinkedList<>(); + long current = System.currentTimeMillis() / 1000; + long start = current - this.pdConfig.getStore().getRetentionPeriod(); + + try { + for (Map.Entry entry : getStoreMonitorData(storeId, start, + current).entrySet()) { + String[] arr = + entry.getKey().split(String.valueOf(MetadataKeyHelper.getDelimiter())); + Map map = new HashMap(); + long timestamp = Long.parseLong(arr[arr.length - 1]); + map.put("ts", timestamp); + for (String pair : entry.getValue().split(",")) { + String[] p = pair.split(":"); + if (p.length == 2) { + map.put(p[0], Long.parseLong(p[1])); + } + } + result.add(map); + } + result.sort((o1, o2) -> o1.get("ts").compareTo(o2.get("ts"))); + } catch (PDException e) { + log.error(e.getMessage()); + } + return result; + } + + /** + * for api service, export txt + * + * @param storeId + * @return + * @throws PDException + */ + public String getStoreMonitorDataText(long storeId) throws PDException { + + List> result = getStoreMonitorData(storeId); + StringBuilder sb = new StringBuilder(); + if (result.size() > 0) { + DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + Map lastRow = result.get(result.size() - 1); + List columns = new ArrayList<>(); + // construct columns, ts + sorted keys + columns.add("ts"); + columns.addAll(lastRow.keySet().stream() + .filter(x -> !"ts".equals(x)) + .sorted() + .collect(Collectors.toList())); + sb.append(String.join(",", columns).replace("\"", "")).append("\r\n"); + for (Map row : result) { + for (String key : columns) { + // ts + , + ... 
+ if ("ts".equals(key)) { + // format ts + sb.append(dtf.format( + LocalDateTime.ofInstant(Instant.ofEpochSecond(row.get(key)), + ZoneId.systemDefault()))); + continue; + } else { + sb.append(",").append(row.getOrDefault(key, 0L)); + } + } + sb.append("\r\n"); + } + } + return sb.toString(); + } + + /** + * remove the monitor data of the store that before till(not include) + * + * @param storeId store id + * @param till expire time + * @return affect rows + */ + public int removeExpiredMonitorData(long storeId, long till) throws PDException { + String keyStart = getMonitorDataKey(storeId, 1); + String keyEnd = getMonitorDataKey(storeId, till); + int records = 0; + for (String key : kvService.scanRange(keyStart, keyEnd).keySet()) { + kvService.delete(key); + log.debug("remove monitor data, key: {}", key); + records += 1; + } + return records; + } + + /** + * get the latest timestamp of the store monitor data + * + * @param storeId + * @return timestamp(by seconds) + */ + public long getLatestStoreMonitorDataTimeStamp(long storeId) { + long maxId = 0L; + long current = System.currentTimeMillis() / 1000; + long start = current - this.pdConfig.getStore().getMonitorInterval(); + String keyStart = getMonitorDataKey(storeId, start); + String keyEnd = getMonitorDataKey(storeId, current); + try { + for (String key : kvService.scanRange(keyStart, keyEnd).keySet()) { + String[] arr = key.split(String.valueOf(MetadataKeyHelper.getDelimiter())); + maxId = Math.max(maxId, Long.parseLong(arr[arr.length - 1])); + } + } catch (PDException e) { + } + return maxId; + } + + private String getMonitorDataKey(long storeId, long ts) { + String builder = MONITOR_DATA_PREFIX + + MetadataKeyHelper.getDelimiter() + + storeId + + MetadataKeyHelper.getDelimiter() + + ts; + return builder; + } + + private String extractMetricsFromStoreStatus(Metapb.StoreStats storeStats) { + List list = new ArrayList<>(); + for (Metapb.RecordPair pair : storeStats.getSystemMetricsList()) { + list.add("\"" + pair.getKey() + "\":" + pair.getValue()); + } + return String.join(",", list); + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreNodeService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreNodeService.java new file mode 100644 index 0000000000..b755326340 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreNodeService.java @@ -0,0 +1,1073 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hugegraph.pd; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hugegraph.pd.common.KVPair; +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.Metapb; +import org.apache.hugegraph.pd.grpc.Metapb.GraphMode; +import org.apache.hugegraph.pd.grpc.Metapb.GraphModeReason; +import org.apache.hugegraph.pd.grpc.Metapb.GraphState; +import org.apache.hugegraph.pd.grpc.Pdpb; +import org.apache.hugegraph.pd.grpc.Pdpb.CacheResponse; +import org.apache.hugegraph.pd.grpc.pulse.ConfChangeType; +import org.apache.hugegraph.pd.meta.MetadataFactory; +import org.apache.hugegraph.pd.meta.MetadataKeyHelper; +import org.apache.hugegraph.pd.meta.StoreInfoMeta; +import org.apache.hugegraph.pd.meta.TaskInfoMeta; + +import com.google.gson.Gson; + +import lombok.extern.slf4j.Slf4j; + +/** + * HgStore注册、保活管理类 + */ +@Slf4j +public class StoreNodeService { + + private static final Long STORE_HEART_BEAT_INTERVAL = 30000L; + private static final String graphSpaceConfPrefix = "HUGEGRAPH/hg/GRAPHSPACE/CONF/"; + // Store状态监听 + private final List statusListeners; + private final List shardGroupStatusListeners; + private final StoreInfoMeta storeInfoMeta; + private final TaskInfoMeta taskInfoMeta; + private final Random random = new Random(System.currentTimeMillis()); + private final KvService kvService; + private final ConfigService configService; + private final PDConfig pdConfig; + private PartitionService partitionService; + private final Runnable quotaChecker = () -> { + try { + getQuota(); + } catch (Exception e) { + log.error( + "obtaining and sending graph space quota information with error: ", + e); + } + }; + private Metapb.ClusterStats clusterStats; + + public StoreNodeService(PDConfig config) { + this.pdConfig = config; + storeInfoMeta = MetadataFactory.newStoreInfoMeta(pdConfig); + taskInfoMeta = MetadataFactory.newTaskInfoMeta(pdConfig); + shardGroupStatusListeners = Collections.synchronizedList(new ArrayList<>()); + statusListeners = Collections.synchronizedList(new ArrayList()); + clusterStats = Metapb.ClusterStats.newBuilder() + .setState(Metapb.ClusterState.Cluster_Not_Ready) + .setTimestamp(System.currentTimeMillis()) + .build(); + kvService = new KvService(pdConfig); + configService = new ConfigService(pdConfig); + } + + public void init(PartitionService partitionService) { + this.partitionService = partitionService; + partitionService.addStatusListener(new PartitionStatusListener() { + @Override + public void onPartitionChanged(Metapb.Partition old, Metapb.Partition partition) { + if (old != null && old.getState() != partition.getState()) { + // 状态改变,重置集群状态 + try { + List partitions = + partitionService.getPartitionById(partition.getId()); + Metapb.PartitionState state = Metapb.PartitionState.PState_Normal; + for (Metapb.Partition pt : partitions) { + if (pt.getState().getNumber() > state.getNumber()) { + state = pt.getState(); + } + } + updateShardGroupState(partition.getId(), state); + + for (Metapb.ShardGroup group : getShardGroups()) { + if (group.getState().getNumber() > state.getNumber()) { + state = group.getState(); + } + } + updateClusterStatus(state); + } catch (PDException e) { + 
log.error("onPartitionChanged exception: ", e); + } + } + } + + @Override + public void onPartitionRemoved(Metapb.Partition partition) { + + } + }); + } + + /** + * 集群是否准备就绪 + * + * @return + */ + public boolean isOK() { + return this.clusterStats.getState().getNumber() < + Metapb.ClusterState.Cluster_Offline.getNumber(); + } + + /** + * Store注册,记录Store的ip地址,首次注册需要生成store_ID + * + * @param store + */ + public Metapb.Store register(Metapb.Store store) throws PDException { + if (store.getId() == 0) { + // 初始注册,生成新id,保证Id不重复。 + store = newStoreNode(store); + } + + if (!storeInfoMeta.storeExists(store.getId())) { + log.error("Store id {} does not belong to this PD, address = {}", store.getId(), + store.getAddress()); + // storeId不存在,抛出异常 + throw new PDException(Pdpb.ErrorType.STORE_ID_NOT_EXIST_VALUE, + String.format("Store id %d doest not exist.", store.getId())); + } + + // 如果store状态为Tombstone拒绝注册。 + Metapb.Store lastStore = storeInfoMeta.getStore(store.getId()); + if (lastStore.getState() == Metapb.StoreState.Tombstone) { + log.error("Store id {} has been removed, Please reinitialize, address = {}", + store.getId(), store.getAddress()); + // storeId不存在,抛出异常 + throw new PDException(Pdpb.ErrorType.STORE_HAS_BEEN_REMOVED_VALUE, + String.format("Store id %d has been removed. %s", store.getId(), + store.getAddress())); + } + + // offline或者up,或者在初始激活列表中,自动上线 + Metapb.StoreState storeState = lastStore.getState(); + if (storeState == Metapb.StoreState.Offline || storeState == Metapb.StoreState.Up + || inInitialStoreList(store)) { + storeState = Metapb.StoreState.Up; + } else { + storeState = Metapb.StoreState.Pending; + } + + store = Metapb.Store.newBuilder(lastStore) + .setAddress(store.getAddress()) + .setRaftAddress(store.getRaftAddress()) + .setDataVersion(store.getDataVersion()) + .setDeployPath(store.getDeployPath()) + .setVersion(store.getVersion()) + .setDataPath(store.getDataPath()) + .setState(storeState).setCores(store.getCores()) + .clearLabels().addAllLabels(store.getLabelsList()) + .setLastHeartbeat(System.currentTimeMillis()).build(); + + long current = System.currentTimeMillis(); + boolean raftChanged = false; + // 上线状态的Raft Address 发生了变更 + if (!Objects.equals(lastStore.getRaftAddress(), store.getRaftAddress()) && + storeState == Metapb.StoreState.Up) { + // 时间间隔太短,而且raft有变更,则认为是无效的store + if (current - lastStore.getLastHeartbeat() < STORE_HEART_BEAT_INTERVAL * 0.8) { + throw new PDException(Pdpb.ErrorType.STORE_PROHIBIT_DUPLICATE_VALUE, + String.format("Store id %d may be duplicate. 
addr: %s", + store.getId(), store.getAddress())); + } else if (current - lastStore.getLastHeartbeat() > STORE_HEART_BEAT_INTERVAL * 1.2) { + // 认为发生了变更 + raftChanged = true; + } else { + // 等待下次注册 + return Metapb.Store.newBuilder(store).setId(0L).build(); + } + } + + // 存储store信息 + storeInfoMeta.updateStore(store); + if (storeState == Metapb.StoreState.Up) { + // 更新store 活跃状态 + storeInfoMeta.keepStoreAlive(store); + onStoreStatusChanged(store, Metapb.StoreState.Offline, Metapb.StoreState.Up); + checkStoreStatus(); + } + + // 等store信息保存后,再发送变更 + if (raftChanged) { + onStoreRaftAddressChanged(store); + } + + log.info("Store register, id = {} {}", store.getId(), store); + return store; + } + + private boolean inInitialStoreList(Metapb.Store store) { + return this.pdConfig.getInitialStoreMap().containsKey(store.getAddress()); + } + + /** + * 产生一个新的store对象 + * + * @param store + * @return + * @throws PDException + */ + private synchronized Metapb.Store newStoreNode(Metapb.Store store) throws PDException { + long id = random.nextLong() & Long.MAX_VALUE; + while (id == 0 || storeInfoMeta.storeExists(id)) { + id = random.nextLong() & Long.MAX_VALUE; + } + store = Metapb.Store.newBuilder(store) + .setId(id) + .setState(Metapb.StoreState.Pending) + .setStartTimestamp(System.currentTimeMillis()).build(); + storeInfoMeta.updateStore(store); + return store; + } + + /** + * 根据store_id返回Store信息 + * + * @param id + * @return + * @throws PDException + */ + public Metapb.Store getStore(long id) throws PDException { + Metapb.Store store = storeInfoMeta.getStore(id); + if (store == null) { + throw new PDException(Pdpb.ErrorType.STORE_ID_NOT_EXIST_VALUE, + String.format("Store id %x doest not exist.", id)); + } + return store; + } + + /** + * 更新Store信息,检测Store状态的变化,通知到Hugestore + */ + public synchronized Metapb.Store updateStore(Metapb.Store store) throws PDException { + log.info("updateStore storeId: {}, address: {}, state: {}", store.getId(), + store.getAddress(), store.getState()); + Metapb.Store lastStore = storeInfoMeta.getStore(store.getId()); + if (lastStore == null) { + return null; + } + Metapb.Store.Builder builder = + Metapb.Store.newBuilder(lastStore).clearLabels().clearStats(); + store = builder.mergeFrom(store).build(); + if (store.getState() == Metapb.StoreState.Tombstone) { + List activeStores = getStores(); + if (lastStore.getState() == Metapb.StoreState.Up + && activeStores.size() - 1 < pdConfig.getMinStoreCount()) { + throw new PDException(Pdpb.ErrorType.LESS_ACTIVE_STORE_VALUE, + "The number of active stores is less then " + + pdConfig.getMinStoreCount()); + } + } + + storeInfoMeta.updateStore(store); + if (store.getState() != Metapb.StoreState.Unknown && + store.getState() != lastStore.getState()) { + // 如果希望将store下线 + if (store.getState() == Metapb.StoreState.Exiting) { + if (lastStore.getState() == Metapb.StoreState.Exiting) { + //如果已经是下线中的状态,则不作进一步处理 + return lastStore; + } + + List activeStores = this.getActiveStores(); + Map storeMap = new HashMap<>(); + activeStores.forEach(s -> { + storeMap.put(s.getId(), s); + }); + //如果store已经离线,直接从活跃中删除,如果store在线,暂时不从活跃中删除,等把状态置成Tombstone的时候再删除 + if (!storeMap.containsKey(store.getId())) { + log.info("updateStore removeActiveStores store {}", store.getId()); + storeInfoMeta.removeActiveStore(store); + } + storeTurnoff(store); + } else if (store.getState() == Metapb.StoreState.Offline) { //监控到store已经离线,从活跃中删除 + storeInfoMeta.removeActiveStore(store); + } else if (store.getState() == Metapb.StoreState.Tombstone) { + // 
状态发生改变,Store关机,修改shardGroup,进行副本迁移 + log.info("updateStore removeActiveStores store {}", store.getId()); + storeInfoMeta.removeActiveStore(store); + // 存储下线 + storeTurnoff(store); + } else if (store.getState() == Metapb.StoreState.Up) { + storeInfoMeta.keepStoreAlive(store); + checkStoreStatus(); + } + onStoreStatusChanged(lastStore, lastStore.getState(), store.getState()); + } + return store; + } + + /** + * store被关机,重新分配shardGroup的shard + * + * @param store + * @throws PDException + */ + public synchronized void storeTurnoff(Metapb.Store store) throws PDException { + // 遍历ShardGroup,重新分配shard + for (Metapb.ShardGroup group : getShardGroupsByStore(store.getId())) { + Metapb.ShardGroup.Builder builder = Metapb.ShardGroup.newBuilder(group); + builder.clearShards(); + group.getShardsList().forEach(shard -> { + if (shard.getStoreId() != store.getId()) { + builder.addShards(shard); + } + }); + reallocShards(builder.build()); + } + } + + /** + * 根据图名返回stores信息,如果graphName为空,返回所有store信息 + * + * @throws PDException + */ + public List getStores() throws PDException { + return storeInfoMeta.getStores(null); + } + + public List getStores(String graphName) throws PDException { + return storeInfoMeta.getStores(graphName); + } + + public List getStoreStatus(boolean isActive) throws PDException { + return storeInfoMeta.getStoreStatus(isActive); + } + + public List getShardGroups() throws PDException { + return storeInfoMeta.getShardGroups(); + } + + public Metapb.ShardGroup getShardGroup(int groupId) throws PDException { + return storeInfoMeta.getShardGroup(groupId); + } + + public List getShardList(int groupId) throws PDException { + var shardGroup = getShardGroup(groupId); + if (shardGroup != null) { + return shardGroup.getShardsList(); + } + return new ArrayList<>(); + } + + public List getShardGroupsByStore(long storeId) throws PDException { + List shardGroups = new ArrayList<>(); + storeInfoMeta.getShardGroups().forEach(shardGroup -> { + shardGroup.getShardsList().forEach(shard -> { + if (shard.getStoreId() == storeId) { + shardGroups.add(shardGroup); + } + }); + }); + return shardGroups; + } + + /** + * 返回活跃的store + * + * @param graphName + * @return + * @throws PDException + */ + public List getActiveStores(String graphName) throws PDException { + return storeInfoMeta.getActiveStores(graphName); + } + + public List getActiveStores() throws PDException { + return storeInfoMeta.getActiveStores(); + } + + public List getTombStores() throws PDException { + List stores = new ArrayList<>(); + for (Metapb.Store store : this.getStores()) { + if (store.getState() == Metapb.StoreState.Tombstone) { + stores.add(store); + } + } + return stores; + } + + public long removeStore(Long storeId) throws PDException { + return storeInfoMeta.removeStore(storeId); + } + + /** + * 给partition分配store,根据图的配置,决定分配几个peer + * 分配完所有的shards,保存ShardGroup对象(store不变动,只执行一次) + */ + public synchronized List allocShards(Metapb.Graph graph, int partId) throws + PDException { + // 多图共用raft分组,因此分配shard只依赖partitionId. 
+ // 图根据数据大小可以设置分区的数量,但总数不能超过raft分组数量 + if (storeInfoMeta.getShardGroup(partId) == null) { + // 获取活跃的store key + // 根据 partionID计算store + List stores = storeInfoMeta.getActiveStores(); + + if (stores.size() == 0) { + throw new PDException(Pdpb.ErrorType.NO_ACTIVE_STORE_VALUE, + "There is no any online store"); + } + + if (stores.size() < pdConfig.getMinStoreCount()) { + throw new PDException(Pdpb.ErrorType.LESS_ACTIVE_STORE_VALUE, + "The number of active stores is less then " + + pdConfig.getMinStoreCount()); + } + + int shardCount = pdConfig.getPartition().getShardCount(); + shardCount = Math.min(shardCount, stores.size()); + //两个shard无法选出leader + // 不能为0 + + if (shardCount == 2 || shardCount < 1) { + shardCount = 1; + } + + // 一次创建完所有的ShardGroup,保证初始的groupID有序,方便人工阅读 + for (int groupId = 0; groupId < pdConfig.getConfigService().getPartitionCount(); + groupId++) { + int storeIdx = groupId % stores.size(); //store分配规则,简化为取模 + List shards = new ArrayList<>(); + for (int i = 0; i < shardCount; i++) { + Metapb.Shard shard = + Metapb.Shard.newBuilder().setStoreId(stores.get(storeIdx).getId()) + .setRole(i == 0 ? Metapb.ShardRole.Leader : + Metapb.ShardRole.Follower) // + .build(); + shards.add(shard); + storeIdx = (storeIdx + 1) >= stores.size() ? 0 : ++storeIdx; // 顺序选择 + } + + Metapb.ShardGroup group = Metapb.ShardGroup.newBuilder() + .setId(groupId) + .setState( + Metapb.PartitionState.PState_Normal) + .addAllShards(shards).build(); + + // new group + storeInfoMeta.updateShardGroup(group); + partitionService.updateShardGroupCache(group); + onShardGroupStatusChanged(group, group); + log.info("alloc shard group: id {}", groupId); + } + } + return storeInfoMeta.getShardGroup(partId).getShardsList(); + } + + /** + * 根据graph的shard_count,重新分配shard + * 发送变更change shard指令 + */ + public synchronized List reallocShards(Metapb.ShardGroup shardGroup) throws + PDException { + List stores = storeInfoMeta.getActiveStores(); + + if (stores.size() == 0) { + throw new PDException(Pdpb.ErrorType.NO_ACTIVE_STORE_VALUE, + "There is no any online store"); + } + + if (stores.size() < pdConfig.getMinStoreCount()) { + throw new PDException(Pdpb.ErrorType.LESS_ACTIVE_STORE_VALUE, + "The number of active stores is less then " + + pdConfig.getMinStoreCount()); + } + + int shardCount = pdConfig.getPartition().getShardCount(); + shardCount = Math.min(shardCount, stores.size()); + if (shardCount == 2 || shardCount < 1) { + // 两个shard无法选出leader + // 不能为0 + shardCount = 1; + } + + List shards = new ArrayList<>(); + shards.addAll(shardGroup.getShardsList()); + + if (shardCount > shards.size()) { + // 需要增加shard + log.info("reallocShards ShardGroup {}, add shards from {} to {}", + shardGroup.getId(), shards.size(), shardCount); + int storeIdx = shardGroup.getId() % stores.size(); //store分配规则,简化为取模 + for (int addCount = shardCount - shards.size(); addCount > 0; ) { + // 检查是否已经存在 + if (!isStoreInShards(shards, stores.get(storeIdx).getId())) { + Metapb.Shard shard = Metapb.Shard.newBuilder() + .setStoreId(stores.get(storeIdx).getId()) + .build(); + shards.add(shard); + addCount--; + } + storeIdx = (storeIdx + 1) >= stores.size() ? 
0 : ++storeIdx; // 顺序选择 + } + } else if (shardCount < shards.size()) { + // 需要减shard + log.info("reallocShards ShardGroup {}, remove shards from {} to {}", + shardGroup.getId(), shards.size(), shardCount); + + int subCount = shards.size() - shardCount; + Iterator iterator = shards.iterator(); + while (iterator.hasNext() && subCount > 0) { + if (iterator.next().getRole() != Metapb.ShardRole.Leader) { + iterator.remove(); + subCount--; + } + } + } else { + return shards; + } + + Metapb.ShardGroup group = Metapb.ShardGroup.newBuilder(shardGroup) + .clearShards() + .addAllShards(shards).build(); + storeInfoMeta.updateShardGroup(group); + partitionService.updateShardGroupCache(group); + // change shard group + onShardGroupStatusChanged(shardGroup, group); + + var partitions = partitionService.getPartitionById(shardGroup.getId()); + if (partitions.size() > 0) { + // send one message, change shard is regardless with partition/graph + partitionService.fireChangeShard(partitions.get(0), shards, + ConfChangeType.CONF_CHANGE_TYPE_ADJUST); + } + + log.info("reallocShards ShardGroup {}, shards: {}", group.getId(), group.getShardsList()); + return shards; + } + + /** + * 根据partition的数量,分配group shard + * + * @param groups list of (partition id, count) + * @return total groups + */ + public synchronized int splitShardGroups(List> groups) throws + PDException { + int sum = groups.stream().map(pair -> pair.getValue()).reduce(0, Integer::sum); + // shard group 太大 + if (sum > getActiveStores().size() * pdConfig.getPartition().getMaxShardsPerStore()) { + throw new PDException(Pdpb.ErrorType.Too_Many_Partitions_Per_Store_VALUE, + "can't satisfy target shard group count"); + } + + partitionService.splitPartition(groups); + + return sum; + } + + /** + * 分配shard group,为分裂做准备 + * + * @return true + * @throws PDException + */ + private boolean isStoreInShards(List shards, long storeId) { + AtomicBoolean exist = new AtomicBoolean(false); + shards.forEach(s -> { + if (s.getStoreId() == storeId) { + exist.set(true); + } + }); + return exist.get(); + } + + /** + * update shard group and cache. + * send shard group change message. 
+ * + * @param groupId : shard group + * @param shards : shard lists + * @param version: term version, ignored if less than 0 + * @param confVersion : conf version, ignored if less than 0 + * @return + */ + public synchronized Metapb.ShardGroup updateShardGroup(int groupId, List shards, + long version, long confVersion) throws + PDException { + Metapb.ShardGroup group = this.storeInfoMeta.getShardGroup(groupId); + + if (group == null) { + return null; + } + + var builder = Metapb.ShardGroup.newBuilder(group); + if (version >= 0) { + builder.setVersion(version); + } + + if (confVersion >= 0) { + builder.setConfVer(confVersion); + } + + var newGroup = builder.clearShards().addAllShards(shards).build(); + + storeInfoMeta.updateShardGroup(newGroup); + partitionService.updateShardGroupCache(newGroup); + onShardGroupStatusChanged(group, newGroup); + log.info("Raft {} updateShardGroup {}", groupId, newGroup); + return group; + } + + /** + * 通知 store 进行shard group的重建操作 + * + * @param groupId raft group id + * @param shards shard list: 如果为空,则删除对应的partition engine + */ + public void shardGroupOp(int groupId, List shards) throws PDException { + + var shardGroup = getShardGroup(groupId); + + if (shardGroup == null) { + return; + } + + var newGroup = shardGroup.toBuilder().clearShards().addAllShards(shards).build(); + if (shards.size() == 0) { + var partitions = partitionService.getPartitionById(groupId); + for (var partition : partitions) { + partitionService.removePartition(partition.getGraphName(), groupId); + } + deleteShardGroup(groupId); + } + + onShardGroupOp(newGroup); + } + + /** + * 删除 shard group + * + * @param groupId shard group id + */ + public synchronized void deleteShardGroup(int groupId) throws PDException { + Metapb.ShardGroup group = this.storeInfoMeta.getShardGroup(groupId); + if (group != null) { + storeInfoMeta.deleteShardGroup(groupId); + } + + onShardGroupStatusChanged(group, null); + + // 修正store的分区数. 
(分区合并导致) + var shardGroups = getShardGroups(); + if (shardGroups != null) { + var count1 = pdConfig.getConfigService().getPDConfig().getPartitionCount(); + var maxGroupId = + getShardGroups().stream().map(Metapb.ShardGroup::getId).max(Integer::compareTo); + if (maxGroupId.get() < count1) { + pdConfig.getConfigService().setPartitionCount(maxGroupId.get() + 1); + } + } + } + + public synchronized void updateShardGroupState(int groupId, Metapb.PartitionState state) throws + PDException { + Metapb.ShardGroup shardGroup = storeInfoMeta.getShardGroup(groupId) + .toBuilder() + .setState(state).build(); + storeInfoMeta.updateShardGroup(shardGroup); + partitionService.updateShardGroupCache(shardGroup); + } + + /** + * 接收Store的心跳 + * + * @param storeStats + * @throws PDException + */ + public Metapb.ClusterStats heartBeat(Metapb.StoreStats storeStats) throws PDException { + this.storeInfoMeta.updateStoreStats(storeStats); + Metapb.Store lastStore = this.getStore(storeStats.getStoreId()); + if (lastStore == null) { + //store不存在 + throw new PDException(Pdpb.ErrorType.STORE_ID_NOT_EXIST_VALUE, + String.format("Store id %d does not exist.", + storeStats.getStoreId())); + } + if (lastStore.getState() == Metapb.StoreState.Tombstone) { + throw new PDException(Pdpb.ErrorType.STORE_HAS_BEEN_REMOVED_VALUE, + String.format( + "Store id %d is useless since it's state is Tombstone", + storeStats.getStoreId())); + } + Metapb.Store nowStore; + // 如果正在做store下线操作 + if (lastStore.getState() == Metapb.StoreState.Exiting) { + List activeStores = this.getActiveStores(); + Map storeMap = new HashMap<>(); + activeStores.forEach(store -> { + storeMap.put(store.getId(), store); + }); + // 下线的store的分区为0,说明已经迁移完毕,可以下线,如果非0,则迁移还在进行,需要等待 + if (storeStats.getPartitionCount() > 0 && + storeMap.containsKey(storeStats.getStoreId())) { + nowStore = Metapb.Store.newBuilder(lastStore) + .setStats(storeStats) + .setLastHeartbeat(System.currentTimeMillis()) + .setState(Metapb.StoreState.Exiting).build(); + this.storeInfoMeta.updateStore(nowStore); + return this.clusterStats; + } else { + nowStore = Metapb.Store.newBuilder(lastStore) + .setStats(storeStats) + .setLastHeartbeat(System.currentTimeMillis()) + .setState(Metapb.StoreState.Tombstone).build(); + this.storeInfoMeta.updateStore(nowStore); + storeInfoMeta.removeActiveStore(nowStore); + return this.clusterStats; + } + } + + if (lastStore.getState() == Metapb.StoreState.Pending) { + nowStore = Metapb.Store.newBuilder(lastStore) + .setStats(storeStats) + .setLastHeartbeat(System.currentTimeMillis()) + .setState(Metapb.StoreState.Pending).build(); + this.storeInfoMeta.updateStore(nowStore); + return this.clusterStats; + } else { + if (lastStore.getState() == Metapb.StoreState.Offline) { + this.updateStore( + Metapb.Store.newBuilder(lastStore).setState(Metapb.StoreState.Up).build()); + } + nowStore = Metapb.Store.newBuilder(lastStore) + .setState(Metapb.StoreState.Up) + .setStats(storeStats) + .setLastHeartbeat(System.currentTimeMillis()).build(); + this.storeInfoMeta.updateStore(nowStore); + this.storeInfoMeta.keepStoreAlive(nowStore); + this.checkStoreStatus(); + return this.clusterStats; + } + } + + public synchronized Metapb.ClusterStats updateClusterStatus(Metapb.ClusterState state) { + this.clusterStats = clusterStats.toBuilder().setState(state).build(); + return this.clusterStats; + } + + public Metapb.ClusterStats updateClusterStatus(Metapb.PartitionState state) { + Metapb.ClusterState cstate = Metapb.ClusterState.Cluster_OK; + switch (state) { + case PState_Normal: + cstate = 
Metapb.ClusterState.Cluster_OK; + break; + case PState_Warn: + cstate = Metapb.ClusterState.Cluster_Warn; + break; + case PState_Fault: + cstate = Metapb.ClusterState.Cluster_Fault; + break; + case PState_Offline: + cstate = Metapb.ClusterState.Cluster_Offline; + break; + } + return updateClusterStatus(cstate); + } + + public Metapb.ClusterStats getClusterStats() { + return this.clusterStats; + } + + /** + * 检查集群健康状态 + * 活跃机器数是否大于最小阈值 + * 分区shard在线数已否过半 * + */ + public synchronized void checkStoreStatus() { + Metapb.ClusterStats.Builder builder = Metapb.ClusterStats.newBuilder() + .setState( + Metapb.ClusterState.Cluster_OK); + try { + List activeStores = this.getActiveStores(); + if (activeStores.size() < pdConfig.getMinStoreCount()) { + builder.setState(Metapb.ClusterState.Cluster_Not_Ready); + builder.setMessage("The number of active stores is " + activeStores.size() + + ", less than pd.initial-store-count:" + + pdConfig.getMinStoreCount()); + } + Map storeMap = new HashMap<>(); + activeStores.forEach(store -> { + storeMap.put(store.getId(), store); + }); + + if (builder.getState() == Metapb.ClusterState.Cluster_OK) { + // 检查每个分区的在线shard数量是否大于半数 + for (Metapb.ShardGroup group : this.getShardGroups()) { + int count = 0; + for (Metapb.Shard shard : group.getShardsList()) { + count += storeMap.containsKey(shard.getStoreId()) ? 1 : 0; + } + if (count * 2 < group.getShardsList().size()) { + builder.setState(Metapb.ClusterState.Cluster_Not_Ready); + builder.setMessage( + "Less than half of active shard, partitionId is " + group.getId()); + break; + } + } + } + + } catch (PDException e) { + log.error("StoreNodeService updateClusterStatus exception {}", e); + } + this.clusterStats = builder.setTimestamp(System.currentTimeMillis()).build(); + if (this.clusterStats.getState() != Metapb.ClusterState.Cluster_OK) { + log.error("The cluster is not ready, {}", this.clusterStats); + } + } + + public void addStatusListener(StoreStatusListener listener) { + statusListeners.add(listener); + } + + protected void onStoreRaftAddressChanged(Metapb.Store store) { + log.info("onStoreRaftAddressChanged storeId = {}, new raft addr:", store.getId(), + store.getRaftAddress()); + statusListeners.forEach(e -> { + e.onStoreRaftChanged(store); + }); + } + + public void addShardGroupStatusListener(ShardGroupStatusListener listener) { + shardGroupStatusListeners.add(listener); + } + + protected void onStoreStatusChanged(Metapb.Store store, Metapb.StoreState old, + Metapb.StoreState stats) { + log.info("onStoreStatusChanged storeId = {} from {} to {}", store.getId(), old, stats); + statusListeners.forEach(e -> { + e.onStoreStatusChanged(store, old, stats); + }); + } + + protected void onShardGroupStatusChanged(Metapb.ShardGroup group, Metapb.ShardGroup newGroup) { + log.info("onShardGroupStatusChanged, groupId: {}, from {} to {}", group.getId(), group, + newGroup); + shardGroupStatusListeners.forEach(e -> e.onShardListChanged(group, newGroup)); + } + + protected void onShardGroupOp(Metapb.ShardGroup shardGroup) { + log.info("onShardGroupOp, group id: {}, shard group:{}", shardGroup.getId(), shardGroup); + shardGroupStatusListeners.forEach(e -> e.onShardListOp(shardGroup)); + } + + /** + * 检查当前store是否可下线 + * 活跃机器数小于等于最小阈值,不可下线 + * 分区shard在线数不超过半数, 不可下线 + */ + public boolean checkStoreCanOffline(Metapb.Store currentStore) { + try { + long currentStoreId = currentStore.getId(); + List activeStores = this.getActiveStores(); + Map storeMap = new HashMap<>(); + activeStores.forEach(store -> { + if (store.getId() != 
currentStoreId) { + storeMap.put(store.getId(), store); + } + }); + + if (storeMap.size() < pdConfig.getMinStoreCount()) { + return false; + } + + // 检查每个分区的在线shard数量是否大于半数 + for (Metapb.ShardGroup group : this.getShardGroups()) { + int count = 0; + for (Metapb.Shard shard : group.getShardsList()) { + long storeId = shard.getStoreId(); + count += storeMap.containsKey(storeId) ? 1 : 0; + } + if (count * 2 < group.getShardsList().size()) { + return false; + } + } + } catch (PDException e) { + log.error("StoreNodeService checkStoreCanOffline exception {}", e); + return false; + } + + return true; + } + + /** + * 对store上的对rocksdb进行compaction + * + * @param groupId + * @param tableName + * @return + */ + public synchronized void shardGroupsDbCompaction(int groupId, String tableName) throws + PDException { + + // 通知所有的store,对rocksdb进行compaction + partitionService.fireDbCompaction(groupId, tableName); + // TODO 异常怎么处理? + } + + public Map getQuota() throws PDException { + List graphs = partitionService.getGraphs(); + String delimiter = String.valueOf(MetadataKeyHelper.DELIMITER); + HashMap storages = new HashMap<>(); + for (Metapb.Graph g : graphs) { + String graphName = g.getGraphName(); + String[] splits = graphName.split(delimiter); + if (!graphName.endsWith("/g") || splits.length < 2) { + continue; + } + String graphSpace = splits[0]; + storages.putIfAbsent(graphSpace, 0L); + List stores = getStores(graphName); + long dataSize = 0; + for (Metapb.Store store : stores) { + List gss = store.getStats() + .getGraphStatsList(); + for (Metapb.GraphStats gs : gss) { + boolean nameEqual = graphName.equals(gs.getGraphName()); + boolean roleEqual = Metapb.ShardRole.Leader.equals( + gs.getRole()); + if (nameEqual && roleEqual) { + dataSize += gs.getApproximateSize(); + } + } + } + Long size = storages.get(graphSpace); + size += dataSize; + storages.put(graphSpace, size); + + } + Metapb.GraphSpace.Builder spaceBuilder = Metapb.GraphSpace.newBuilder(); + HashMap limits = new HashMap<>(); + for (Map.Entry item : storages.entrySet()) { + String spaceName = item.getKey(); + String value = kvService.get(graphSpaceConfPrefix + spaceName); + if (!StringUtils.isEmpty(value)) { + HashMap config = new Gson().fromJson(value, HashMap.class); + Long size = item.getValue(); + int limit = ((Double) config.get("storage_limit")).intValue(); + long limitByLong = limit * 1024L * 1024L; + try { + spaceBuilder.setName(spaceName).setStorageLimit(limitByLong).setUsedSize(size); + Metapb.GraphSpace graphSpace = spaceBuilder.build(); + configService.setGraphSpace(graphSpace); + } catch (Exception e) { + log.error("update graph space with error:", e); + } + // KB and GB * 1024L * 1024L + if (size > limitByLong) { + limits.put(spaceName, true); + continue; + } + } + limits.put(spaceName, false); + + } + GraphState.Builder stateBuilder = GraphState.newBuilder() + .setMode(GraphMode.ReadOnly) + .setReason( + GraphModeReason.Quota); + for (Metapb.Graph g : graphs) { + String graphName = g.getGraphName(); + String[] splits = graphName.split(delimiter); + if (!graphName.endsWith("/g") || splits.length < 2) { + continue; + } + String graphSpace = splits[0]; + Metapb.GraphState gsOld = g.getGraphState(); + GraphMode gmOld = gsOld != null ? gsOld.getMode() : GraphMode.ReadWrite; + GraphMode gmNew = limits.get( + graphSpace) ? 
GraphMode.ReadOnly : GraphMode.ReadWrite; + if (gmOld == null || gmOld.getNumber() != gmNew.getNumber()) { + stateBuilder.setMode(gmNew); + if (gmNew.getNumber() == GraphMode.ReadOnly.getNumber()) { + stateBuilder.setReason(GraphModeReason.Quota); + } + GraphState gsNew = stateBuilder.build(); + Metapb.Graph newGraph = g.toBuilder().setGraphState(gsNew) + .build(); + partitionService.updateGraph(newGraph); + statusListeners.forEach(listener -> { + listener.onGraphChange(newGraph, gsOld, gsNew); + }); + } + } + + return limits; + } + + public Runnable getQuotaChecker() { + return quotaChecker; + } + + public TaskInfoMeta getTaskInfoMeta() { + return taskInfoMeta; + } + + public StoreInfoMeta getStoreInfoMeta() { + return storeInfoMeta; + } + + /** + * 获得分区的Leader + * + * @param partition + * @param initIdx + * @return + */ + public Metapb.Shard getLeader(Metapb.Partition partition, int initIdx) { + Metapb.Shard leader = null; + try { + var shardGroup = this.getShardGroup(partition.getId()); + for (Metapb.Shard shard : shardGroup.getShardsList()) { + if (shard.getRole() == Metapb.ShardRole.Leader) { + leader = shard; + } + } + } catch (Exception e) { + log.error("get leader error: group id:{}, error: {}", + partition.getId(), e.getMessage()); + } + return leader; + } + + public CacheResponse getCache() throws PDException { + + List stores = getStores(); + List groups = getShardGroups(); + List graphs = partitionService.getGraphs(); + CacheResponse cache = CacheResponse.newBuilder().addAllGraphs(graphs) + .addAllShards(groups) + .addAllStores(stores) + .build(); + return cache; + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreStatusListener.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreStatusListener.java new file mode 100644 index 0000000000..a5b96cf307 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/StoreStatusListener.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hugegraph.pd; + +import org.apache.hugegraph.pd.grpc.Metapb; + +public interface StoreStatusListener { + + void onStoreStatusChanged(Metapb.Store store, Metapb.StoreState old, + Metapb.StoreState status); + + void onGraphChange(Metapb.Graph graph, Metapb.GraphState stateOld, + Metapb.GraphState stateNew); + + void onStoreRaftChanged(Metapb.Store store); +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/TaskScheduleService.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/TaskScheduleService.java new file mode 100644 index 0000000000..889e5a0234 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/TaskScheduleService.java @@ -0,0 +1,845 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.PriorityQueue; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; + +import org.apache.hugegraph.pd.common.KVPair; +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.MetaTask; +import org.apache.hugegraph.pd.grpc.Metapb; +import org.apache.hugegraph.pd.grpc.Pdpb; +import org.apache.hugegraph.pd.meta.TaskInfoMeta; +import org.apache.hugegraph.pd.raft.RaftEngine; + +import lombok.extern.slf4j.Slf4j; + +/** + * 任务调度服务,定时检查Store、资源、分区的状态,及时迁移数据,错误节点 + * 1、监测Store是否离线 + * 2、监测Partition的副本是否正确 + * 3、监测Partition的工作模式是否正确 + * 4、监测Partition是否需要分裂,监测分裂是否完成 + */ +@Slf4j +public class TaskScheduleService { + + private static final String BALANCE_SHARD_KEY = "BALANCE_SHARD_KEY"; + private final long TurnOffAndBalanceInterval = 30 * 60 * 1000; //机器下线30后才能进行动态平衡 + private final long BalanceLeaderInterval = 30 * 1000; // leader平衡时间间隔 + private final PDConfig pdConfig; + private final long clusterStartTime; // + private final StoreNodeService storeService; + private final PartitionService partitionService; + private final ScheduledExecutorService executor; + private final TaskInfoMeta taskInfoMeta; + private final StoreMonitorDataService storeMonitorDataService; + private final KvService kvService; + private final LogService logService; + // 先按照value排序,再按照key排序 + private final Comparator> kvPairComparatorAsc = (o1, o2) -> { + if (o1.getValue() == o2.getValue()) { + return o1.getKey().compareTo(o2.getKey()); + } + return 
o1.getValue().compareTo(o2.getValue()); + }; + // 先按照value排序(倒序),再按照key排序(升序) + private final Comparator> kvPairComparatorDesc = (o1, o2) -> { + if (o1.getValue() == o2.getValue()) { + return o2.getKey().compareTo(o1.getKey()); + } + return o2.getValue().compareTo(o1.getValue()); + }; + private long lastStoreTurnoffTime = 0; + private long lastBalanceLeaderTime = 0; + + public TaskScheduleService(PDConfig config, StoreNodeService storeService, + PartitionService partitionService) { + this.pdConfig = config; + this.storeService = storeService; + this.partitionService = partitionService; + this.taskInfoMeta = new TaskInfoMeta(config); + this.logService = new LogService(pdConfig); + this.storeMonitorDataService = new StoreMonitorDataService(pdConfig); + this.clusterStartTime = System.currentTimeMillis(); + this.kvService = new KvService(pdConfig); + this.executor = new ScheduledThreadPoolExecutor(16); + } + + public void init() { + executor.scheduleWithFixedDelay(() -> { + try { + patrolStores(); + } catch (Throwable e) { + log.error("patrolStores exception: ", e); + } + + }, 60, 60, TimeUnit.SECONDS); + executor.scheduleWithFixedDelay(() -> { + try { + patrolPartitions(); + balancePartitionLeader(false); + balancePartitionShard(); + } catch (Throwable e) { + log.error("patrolPartitions exception: ", e); + } + }, pdConfig.getPatrolInterval(), pdConfig.getPatrolInterval(), TimeUnit.SECONDS); + executor.scheduleWithFixedDelay(() -> { + if (isLeader()) { + kvService.clearTTLData(); + } + }, 1000, 1000, TimeUnit.MILLISECONDS); + executor.scheduleWithFixedDelay( + () -> { + if (isLeader()) { + storeService.getQuotaChecker(); + } + }, 2, 30, + TimeUnit.SECONDS); + // clean expired monitor data each 10 minutes, delay 3min. + if (isLeader() && this.pdConfig.getStore().isMonitorDataEnabled()) { + executor.scheduleAtFixedRate(() -> { + Long expTill = System.currentTimeMillis() / 1000 - + this.pdConfig.getStore().getRetentionPeriod(); + log.debug("monitor data keys before " + expTill + " will be deleted"); + int records = 0; + try { + for (Metapb.Store store : storeService.getStores()) { + int cnt = + this.storeMonitorDataService.removeExpiredMonitorData(store.getId(), + expTill); + log.debug("store id :{}, records:{}", store.getId(), cnt); + records += cnt; + } + } catch (PDException e) { + throw new RuntimeException(e); + } + log.debug(String.format("%d records has been deleted", records)); + }, 180, 600, TimeUnit.SECONDS); + } + + storeService.addStatusListener(new StoreStatusListener() { + @Override + public void onStoreStatusChanged(Metapb.Store store, Metapb.StoreState old, + Metapb.StoreState status) { + if (status == Metapb.StoreState.Tombstone) { + lastStoreTurnoffTime = System.currentTimeMillis(); + } + + if (status == Metapb.StoreState.Up) { + executor.schedule(() -> { + try { //store 上线后延时1分钟进行leader平衡 + balancePartitionLeader(false); + } catch (PDException e) { + log.error("exception {}", e); + } + }, BalanceLeaderInterval, TimeUnit.MILLISECONDS); + + } + } + + @Override + public void onGraphChange(Metapb.Graph graph, + Metapb.GraphState stateOld, + Metapb.GraphState stateNew) { + + } + + @Override + public void onStoreRaftChanged(Metapb.Store store) { + + } + }); + } + + public void shutDown() { + executor.shutdownNow(); + } + + private boolean isLeader() { + return RaftEngine.getInstance().isLeader(); + } + + /** + * 巡查所有的store,检查是否在线,存储空间是否充足 + */ + public List patrolStores() throws PDException { + if (!isLeader()) { + return null; + } + + List changedStores = new ArrayList<>(); + // 
检查store在线状态 + List stores = storeService.getStores(""); + Map activeStores = storeService.getActiveStores("") + .stream().collect( + Collectors.toMap(Metapb.Store::getId, t -> t)); + for (Metapb.Store store : stores) { + Metapb.Store changeStore = null; + if ((store.getState() == Metapb.StoreState.Up + || store.getState() == Metapb.StoreState.Unknown) + && !activeStores.containsKey(store.getId())) { + // 不在线,修改状态为离线 + changeStore = Metapb.Store.newBuilder(store) + .setState(Metapb.StoreState.Offline) + .build(); + + } else if ((store.getState() == Metapb.StoreState.Exiting && + !activeStores.containsKey(store.getId())) || + (store.getState() == Metapb.StoreState.Offline && + (System.currentTimeMillis() - store.getLastHeartbeat() > + pdConfig.getStore().getMaxDownTime() * 1000) && + (System.currentTimeMillis() - clusterStartTime > + pdConfig.getStore().getMaxDownTime() * 1000))) { + //手工修改为下线或者离线达到时长 + // 修改状态为关机, 增加 checkStoreCanOffline 检测 + if (storeService.checkStoreCanOffline(store)) { + changeStore = Metapb.Store.newBuilder(store) + .setState(Metapb.StoreState.Tombstone).build(); + this.logService.insertLog(LogService.NODE_CHANGE, + LogService.TASK, changeStore); + log.info("patrolStores store {} Offline", changeStore.getId()); + } + } + if (changeStore != null) { + storeService.updateStore(changeStore); + changedStores.add(changeStore); + } + } + return changedStores; + } + + /** + * 巡查所有的分区,检查副本数是否正确 + */ + public List patrolPartitions() throws PDException { + if (!isLeader()) { + return null; + } + + // 副本数不一致,重新分配副本 + for (Metapb.ShardGroup group : storeService.getShardGroups()) { + if (group.getShardsCount() != pdConfig.getPartition().getShardCount()) { + storeService.reallocShards(group); + // 避免后面的 balance partition shard 马上执行. + kvService.put(BALANCE_SHARD_KEY, "DOING", 180 * 1000); + } + } + //检查shard是否在线。 + Map tombStores = storeService.getTombStores().stream().collect( + Collectors.toMap(Metapb.Store::getId, t -> t)); + + var partIds = new HashSet(); + + for (var pair : tombStores.entrySet()) { + for (var partition : partitionService.getPartitionByStore(pair.getValue())) { + if (partIds.contains(partition.getId())) { + continue; + } + partIds.add(partition.getId()); + + storeService.storeTurnoff(pair.getValue()); + partitionService.shardOffline(partition, pair.getValue().getId()); + } + + } + + return null; + } + + /** + * 在Store之间平衡分区的数量 + * 机器转为UP半小时后才能进行动态平衡 + */ + public synchronized Map> balancePartitionShard() throws + PDException { + log.info("balancePartitions starting, isleader:{}", isLeader()); + + if (!isLeader()) { + return null; + } + + if (System.currentTimeMillis() - lastStoreTurnoffTime < TurnOffAndBalanceInterval) { + return null;//机器下线半小时后才能进行动态平衡 + } + + int activeStores = storeService.getActiveStores().size(); + if (activeStores == 0) { + log.warn("balancePartitionShard non active stores, skip to balancePartitionShard"); + return null; + } + + // 避免频繁调用. (当改变副本数,需要调整shard list,此时又需要平衡分区)会发送重复的指令。造成结果不可预料。 + // 严重会删除掉分区. 
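+        // BALANCE_SHARD_KEY acts as a coarse lease: it is written with a 180s TTL
+        // whenever a shard realloc/rebalance starts, so concurrent balance rounds
+        // back off instead of issuing duplicate (and possibly destructive) moves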
+ if (Objects.equals(kvService.get(BALANCE_SHARD_KEY), "DOING")) { + return null; + } + + int totalShards = pdConfig.getConfigService().getPartitionCount() * + pdConfig.getPartition().getShardCount(); + int averageCount = totalShards / activeStores; + int remainder = totalShards % activeStores; + + // 统计每个store上分区, StoreId ->PartitionID, ShardRole + Map> partitionMap = new HashMap<>(); + storeService.getActiveStores().forEach(store -> { + partitionMap.put(store.getId(), new HashMap<>()); + }); + + // 如果是leaner 说明迁移正在进行,不要重复提交任务 + AtomicReference isLeaner = new AtomicReference<>(false); + partitionService.getPartitions().forEach(partition -> { + + try { + storeService.getShardList(partition.getId()).forEach(shard -> { + Long storeId = shard.getStoreId(); + // 判断每个shard为leaner或者状态非正常状态 + if (shard.getRole() == Metapb.ShardRole.Learner + || partition.getState() != Metapb.PartitionState.PState_Normal) { + isLeaner.set(true); + } + if (partitionMap.containsKey(storeId)) { + partitionMap.get(storeId).put(partition.getId(), shard.getRole()); + } + }); + } catch (PDException e) { + log.error("get partition {} shard list error:{}.", partition.getId(), + e.getMessage()); + } + }); + + if (isLeaner.get()) { + log.warn("balancePartitionShard is doing, skip this balancePartitionShard task"); + return null; + } + + // 按照shard数量由高到低排序store + List> sortedList = new ArrayList<>(); + partitionMap.forEach((storeId, shards) -> { + sortedList.add(new KVPair(storeId, shards.size())); + }); + // 由大到小排序的list + sortedList.sort(((o1, o2) -> o2.getValue().compareTo(o1.getValue()))); + // 最大堆 + PriorityQueue> maxHeap = new PriorityQueue<>(sortedList.size(), + (o1, o2) -> o2.getValue() + .compareTo( + o1.getValue())); + + // 各个副本的 committedIndex + Map> committedIndexMap = partitionService.getCommittedIndexStats(); + // 分区ID --> 源StoreID,目标StoreID + Map> movedPartitions = new HashMap<>(); + // 移除多余的shard, 按照shards由多到少的顺序遍历store,余数remainder优先给shards多的store分配,减少迁移的概率 + for (int index = 0; index < sortedList.size(); index++) { + long storeId = sortedList.get(index).getKey(); + if (!partitionMap.containsKey(storeId)) { + log.error("cannot found storeId {} in partitionMap", storeId); + return null; + } + Map shards = partitionMap.get(storeId); + int targetCount = index < remainder ? averageCount + 1 : averageCount; + // 移除多余的shard, 添加源StoreID. 
非Leader,并且该分区唯一 + if (shards.size() > targetCount) { + int movedCount = shards.size() - targetCount; + log.info( + "balancePartitionShard storeId {}, shardsSize {}, targetCount {}, " + + "moveCount {}", + storeId, shards.size(), targetCount, movedCount); + for (Iterator iterator = shards.keySet().iterator(); + movedCount > 0 && iterator.hasNext(); ) { + Integer id = iterator.next(); + + if (!movedPartitions.containsKey(id)) { + log.info("store {}, shard of partition {} can be moved", storeId, id); + movedPartitions.put(id, new KVPair<>(storeId, 0L)); + movedCount--; + } + } + } else if (shards.size() < targetCount) { + int addCount = targetCount - shards.size(); + log.info( + "balancePartitionShard storeId {}, shardsSize {}, targetCount {}, " + + "addCount {}", + storeId, shards.size(), targetCount, addCount); + maxHeap.add(new KVPair<>(storeId, addCount)); + } + } + + if (movedPartitions.size() == 0) { + log.warn( + "movedPartitions is empty, totalShards:{} averageCount:{} remainder:{} " + + "sortedList:{}", + totalShards, averageCount, remainder, sortedList); + } + Iterator>> moveIterator = + movedPartitions.entrySet().iterator(); + + while (moveIterator.hasNext()) { + if (maxHeap.size() == 0) { + break; + } + Map.Entry> moveEntry = moveIterator.next(); + int partitionId = moveEntry.getKey(); + long sourceStoreId = moveEntry.getValue().getKey(); + + List> tmpList = new ArrayList<>(maxHeap.size()); + while (maxHeap.size() > 0) { + KVPair pair = maxHeap.poll(); + long destStoreId = pair.getKey(); + boolean destContains = false; + if (partitionMap.containsKey(destStoreId)) { + destContains = partitionMap.get(destStoreId).containsKey(partitionId); + } + // 如果目的store已经包含了该partition,则取一下store + if (!destContains) { + moveEntry.getValue().setValue(pair.getKey()); + log.info( + "balancePartitionShard will move partition {} from store {} to store " + + "{}", + moveEntry.getKey(), + moveEntry.getValue().getKey(), + moveEntry.getValue().getValue()); + if (pair.getValue() > 1) { + pair.setValue(pair.getValue() - 1); + tmpList.add(pair); + } + break; + } + tmpList.add(pair); + } + maxHeap.addAll(tmpList); + } + + kvService.put(BALANCE_SHARD_KEY, "DOING", 180 * 1000); + + // 开始迁移 + movedPartitions.forEach((partId, storePair) -> { + // 源和目标storeID都不为0 + if (storePair.getKey() > 0 && storePair.getValue() > 0) { + partitionService.movePartitionsShard(partId, storePair.getKey(), + storePair.getValue()); + } else { + log.warn("balancePartitionShard key or value is zero, partId:{} storePair:{}", + partId, storePair); + } + }); + return movedPartitions; + } + + /** + * 在Store之间平衡分区的Leader的数量 + */ + public synchronized Map balancePartitionLeader(boolean immediately) throws + PDException { + Map results = new HashMap<>(); + + if (!isLeader()) { + return results; + } + + if (!immediately && + System.currentTimeMillis() - lastBalanceLeaderTime < BalanceLeaderInterval) { + return results; + } + lastBalanceLeaderTime = System.currentTimeMillis(); + + List shardGroups = storeService.getShardGroups(); + + // 分裂或者缩容任务的时候,退出 + var taskMeta = storeService.getTaskInfoMeta(); + if (taskMeta.hasSplitTaskDoing() || taskMeta.hasMoveTaskDoing()) { + throw new PDException(1001, "split or combine task is processing, please try later!"); + } + + // 数据迁移的时候,退出 + if (Objects.equals(kvService.get(BALANCE_SHARD_KEY), "DOING")) { + throw new PDException(1001, "balance shard is processing, please try later!"); + } + + if (shardGroups.size() == 0) { + return results; + } + + Map storeShardCount = new HashMap<>(); + + 
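+        // count how many shards each store currently hosts; these per-store totals
+        // drive the leader quota computed below (shards / shardCount per store,
+        // with the remainder assigned to the last store in the sorted list)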
shardGroups.forEach(group -> { + group.getShardsList().forEach(shard -> { + storeShardCount.put(shard.getStoreId(), + storeShardCount.getOrDefault(shard.getStoreId(), 0) + 1); + }); + }); + + log.info("balancePartitionLeader, shard group size: {}, by store: {}", shardGroups.size(), + storeShardCount); + + // 按照 target count, store id稳定排序 + PriorityQueue> targetCount = + new PriorityQueue<>(kvPairComparatorDesc); + + var sortedGroups = storeShardCount.entrySet().stream() + .map(entry -> new KVPair<>(entry.getKey(), + entry.getValue())) + .sorted(kvPairComparatorAsc) + .collect(Collectors.toList()); + int sum = 0; + + for (int i = 0; i < sortedGroups.size() - 1; i++) { + // at least one + int v = Math.max( + sortedGroups.get(i).getValue() / pdConfig.getPartition().getShardCount(), 1); + targetCount.add(new KVPair<>(sortedGroups.get(i).getKey(), v)); + sum += v; + } + // 最后一个, 除不尽的情况,保证总数正确 + targetCount.add(new KVPair<>(sortedGroups.get(sortedGroups.size() - 1).getKey(), + shardGroups.size() - sum)); + log.info("target count: {}", targetCount); + + for (var group : shardGroups) { + var map = group.getShardsList().stream() + .collect(Collectors.toMap(Metapb.Shard::getStoreId, shard -> shard)); + var tmpList = new ArrayList>(); + // store比较多的情况,可能不包含对应的store id. 则先将不符合的store保存到临时列表,直到找到一个合适的store + while (!targetCount.isEmpty()) { + var pair = targetCount.poll(); + var storeId = pair.getKey(); + if (map.containsKey(storeId)) { + if (map.get(storeId).getRole() != Metapb.ShardRole.Leader) { + log.info("shard group{}, store id:{}, set to leader", group.getId(), + storeId); + partitionService.transferLeader(group.getId(), map.get(storeId)); + results.put(group.getId(), storeId); + } else { + log.info("shard group {}, store id :{}, is leader, no need change", + group.getId(), storeId); + } + + if (pair.getValue() > 1) { + // count -1 + pair.setValue(pair.getValue() - 1); + tmpList.add(pair); + } + // 找到了,则处理完成 + break; + } else { + tmpList.add(pair); + } + } + targetCount.addAll(tmpList); + } + + return results; + } + + private long getMaxIndexGap(Map> committedIndexMap, int partitionId) { + long maxGap = Long.MAX_VALUE; + if (committedIndexMap == null || !committedIndexMap.containsKey(partitionId)) { + return maxGap; + } + Map shardMap = committedIndexMap.get(partitionId); + if (shardMap == null || shardMap.size() == 0) { + return maxGap; + } + List sortedList = new ArrayList<>(); + shardMap.forEach((storeId, committedIndex) -> { + sortedList.add(committedIndex); + }); + // 由大到小排序的list + sortedList.sort(Comparator.reverseOrder()); + maxGap = sortedList.get(0) - sortedList.get(sortedList.size() - 1); + return maxGap; + } + + /** + * 执行分区分裂,分为自动分裂和手工分裂 + * + * @return + * @throws PDException + */ + public List splitPartition( + Pdpb.OperationMode mode, List params) throws PDException { + + if (mode == Pdpb.OperationMode.Auto) { + return autoSplitPartition(); + } + + var list = params.stream() + .map(param -> new KVPair<>(param.getPartitionId(), param.getCount())) + .collect(Collectors.toList()); + + storeService.splitShardGroups(list); + return null; + } + + /** + * 自动进行分区分裂,每个store达到最大分区数量 + * 执行条件 + * 分裂后每台机器分区数量少于partition.max-partitions-per-store + * + * @throws PDException + */ + public List autoSplitPartition() throws PDException { + if (!isLeader()) { + return null; + } + + if (Metapb.ClusterState.Cluster_OK != storeService.getClusterStats().getState()) { + if (Metapb.ClusterState.Cluster_Offline == storeService.getClusterStats().getState()) { + throw new 
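+                        // Cluster_Offline is set by autoSplitPartition itself while
+                        // data is splitting (see updateClusterStatus further below)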
PDException(Pdpb.ErrorType.Split_Partition_Doing_VALUE, + "The data is splitting"); + } else { + throw new PDException(Pdpb.ErrorType.Cluster_State_Forbid_Splitting_VALUE, + "The current state of the cluster prohibits splitting data"); + } + } + + //For TEST + // pdConfig.getPartition().setMaxShardsPerStore(pdConfig.getPartition() + // .getMaxShardsPerStore()*2); + + // 计算集群能能支持的最大split count + int splitCount = pdConfig.getPartition().getMaxShardsPerStore() * + storeService.getActiveStores().size() / + (storeService.getShardGroups().size() * + pdConfig.getPartition().getShardCount()); + + if (splitCount < 2) { + throw new PDException(Pdpb.ErrorType.Too_Many_Partitions_Per_Store_VALUE, + "Too many partitions per store, partition.store-max-shard-count" + + " = " + + pdConfig.getPartition().getMaxShardsPerStore()); + } + + // 每store未达最大分区数,进行分裂 + log.info("Start to split partitions..., split count = {}", splitCount); + + // 设置集群状态为下线 + storeService.updateClusterStatus(Metapb.ClusterState.Cluster_Offline); + // 修改默认分区数量 + // pdConfig.getConfigService().setPartitionCount(storeService.getShardGroups().size() * + // splitCount); + + var list = storeService.getShardGroups().stream() + .map(shardGroup -> new KVPair<>(shardGroup.getId(), splitCount)) + .collect(Collectors.toList()); + storeService.splitShardGroups(list); + + return null; + } + + /** + * Store汇报任务状态 + * 分区状态发生改变,重新计算分区所在的ShardGroup、图和整个集群的状态 + * + * @param task + */ + public void reportTask(MetaTask.Task task) { + try { + switch (task.getType()) { + case Split_Partition: + partitionService.handleSplitTask(task); + break; + case Move_Partition: + partitionService.handleMoveTask(task); + break; + case Clean_Partition: + partitionService.handleCleanPartitionTask(task); + break; + default: + break; + } + } catch (Exception e) { + log.error("Report task exception {}, {}", e, task); + } + } + + /** + * 对rocksdb进行compaction + * + * @throws PDException + */ + public Boolean dbCompaction(String tableName) throws PDException { + if (!isLeader()) { + return false; + } + + for (Metapb.ShardGroup shardGroup : storeService.getShardGroups()) { + storeService.shardGroupsDbCompaction(shardGroup.getId(), tableName); + } + + // + return true; + } + + /** + * 判断是否能把一个store的分区全部迁出,给出判断结果和迁移方案 + */ + public Map canAllPartitionsMovedOut(Metapb.Store sourceStore) throws + PDException { + if (!isLeader()) { + return null; + } + // 分析一个store上面的分区是否可以完全迁出 + Map resultMap = new HashMap<>(); + // 定义对象用于保存源store上面的分区 StoreId ->PartitionID, ShardRole + Map> sourcePartitionMap = new HashMap<>(); + sourcePartitionMap.put(sourceStore.getId(), new HashMap<>()); + // 定义对象用于保存其他活跃store上面的分区 StoreId ->PartitionID, ShardRole + Map> otherPartitionMap = new HashMap<>(); + Map availableDiskSpace = new HashMap<>(); // 每个store剩余的磁盘空间 + Map partitionDataSize = new HashMap<>(); // 记录待迁移的分区的数据量 + + storeService.getActiveStores().forEach(store -> { + if (store.getId() != sourceStore.getId()) { + otherPartitionMap.put(store.getId(), new HashMap<>()); + // 记录其他store的剩余的磁盘空间, 单位为Byte + availableDiskSpace.put(store.getId(), store.getStats().getAvailable()); + } else { + resultMap.put("current_store_is_online", true); + } + }); + // 统计待迁移的分区的数据大小 (从storeStats中统计,单位为KB) + for (Metapb.GraphStats graphStats : sourceStore.getStats().getGraphStatsList()) { + partitionDataSize.put(graphStats.getPartitionId(), + partitionDataSize.getOrDefault(graphStats.getPartitionId(), 0L) + + graphStats.getApproximateSize()); + } + // 给sourcePartitionMap 和 otherPartitionMap赋值 + 
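+        // walk every partition's shard list: shards on the source store become
+        // migration candidates, shards elsewhere feed the per-store load used by
+        // the min-heap below; note availableDiskSpace is in bytes while the
+        // approximate partition sizes are in KB, hence the unitRate factor of 1024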
partitionService.getPartitions().forEach(partition -> { + try { + storeService.getShardList(partition.getId()).forEach(shard -> { + long storeId = shard.getStoreId(); + if (storeId == sourceStore.getId()) { + sourcePartitionMap.get(storeId).put(partition.getId(), shard.getRole()); + } else { + if (otherPartitionMap.containsKey(storeId)) { + otherPartitionMap.get(storeId).put(partition.getId(), shard.getRole()); + } + } + + }); + } catch (PDException e) { + throw new RuntimeException(e); + } + }); + // 统计待移除的分区:即源store上面的所有分区 + Map> movedPartitions = new HashMap<>(); + for (Map.Entry entry : sourcePartitionMap.get( + sourceStore.getId()).entrySet()) { + movedPartitions.put(entry.getKey(), new KVPair<>(sourceStore.getId(), 0L)); + } + // 统计其他store的分区数量, 用小顶堆保存,以便始终把分区数量较少的store优先考虑 + PriorityQueue> minHeap = new PriorityQueue<>(otherPartitionMap.size(), + (o1, o2) -> o1.getValue() + .compareTo( + o2.getValue())); + otherPartitionMap.forEach((storeId, shards) -> { + minHeap.add(new KVPair(storeId, shards.size())); + }); + // 遍历待迁移的分区,优先迁移到分区比较少的store + Iterator>> moveIterator = + movedPartitions.entrySet().iterator(); + while (moveIterator.hasNext()) { + Map.Entry> moveEntry = moveIterator.next(); + int partitionId = moveEntry.getKey(); + List> tmpList = new ArrayList<>(); // 记录已经弹出优先队列的元素 + while (minHeap.size() > 0) { + KVPair pair = minHeap.poll(); //弹出首个元素 + long storeId = pair.getKey(); + int partitionCount = pair.getValue(); + Map shards = otherPartitionMap.get(storeId); + final int unitRate = 1024; // 平衡不同存储单位的进率 + if ((!shards.containsKey(partitionId)) && ( + availableDiskSpace.getOrDefault(storeId, 0L) / unitRate >= + partitionDataSize.getOrDefault(partitionId, 0L))) { + // 如果目标store上面不包含该分区,且目标store剩余空间能容纳该分区,则进行迁移 + moveEntry.getValue().setValue(storeId); //设置移动的目标store + log.info("plan to move partition {} to store {}, " + + "available disk space {}, current partitionSize:{}", + partitionId, + storeId, + availableDiskSpace.getOrDefault(storeId, 0L) / unitRate, + partitionDataSize.getOrDefault(partitionId, 0L) + ); + // 更新该store预期的剩余空间 + availableDiskSpace.put(storeId, availableDiskSpace.getOrDefault(storeId, 0L) + - partitionDataSize.getOrDefault(partitionId, + 0L) * + unitRate); + // 更新统计变量中该store的分区数量 + partitionCount += 1; + pair.setValue(partitionCount); + tmpList.add(pair); + break; + } else { + tmpList.add(pair); + } + } + minHeap.addAll(tmpList); + } + //检查是否未存在未分配目标store的分区 + List remainPartitions = new ArrayList<>(); + movedPartitions.forEach((partId, storePair) -> { + if (storePair.getValue() == 0L) { + remainPartitions.add(partId); + } + }); + if (remainPartitions.size() > 0) { + resultMap.put("flag", false); + resultMap.put("movedPartitions", null); + } else { + resultMap.put("flag", true); + resultMap.put("movedPartitions", movedPartitions); + } + return resultMap; + + } + + public Map> movePartitions( + Map> movedPartitions) { + if (!isLeader()) { + return null; + } + // 开始迁移 + log.info("begin move partitions:"); + movedPartitions.forEach((partId, storePair) -> { + // 源和目标storeID都不为0 + if (storePair.getKey() > 0 && storePair.getValue() > 0) { + partitionService.movePartitionsShard(partId, storePair.getKey(), + storePair.getValue()); + } + }); + return movedPartitions; + } + +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/config/PDConfig.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/config/PDConfig.java new file mode 100644 index 0000000000..6ff66459ef --- /dev/null +++ 
b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/config/PDConfig.java @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd.config; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hugegraph.pd.ConfigService; +import org.apache.hugegraph.pd.IdService; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Configuration; +import org.springframework.stereotype.Component; + +import lombok.Data; + +/** + * PD配置文件 + */ +@Data +@Component +public class PDConfig { + + @Value("${pd.cluster_id:1}") + private long clusterId; // 集群ID + + @Value("${pd.patrol-interval:300}") + private long patrolInterval = 300; //巡查任务时间间隔 + @Value("${pd.data-path}") + private String dataPath; + @Value("${pd.initial-store-count:3}") + private int minStoreCount; + + // 初始store列表,该列表内的store自动激活 + @Value("${pd.initial-store-list: ''}") + private String initialStoreList; + @Value("${grpc.host}") + private String host; + + @Value("${license.verify-path}") + private String verifyPath; + @Value("${license.license-path}") + private String licensePath; + @Autowired + private ThreadPoolGrpc threadPoolGrpc; + @Autowired + private Raft raft; + @Autowired + private Store store; + @Autowired + private Partition partition; + @Autowired + private Discovery discovery; + private Map initialStoreMap = null; + private ConfigService configService; + private IdService idService; + + public Map getInitialStoreMap() { + if (initialStoreMap == null) { + initialStoreMap = new HashMap<>(); + Arrays.asList(initialStoreList.split(",")).forEach(s -> { + initialStoreMap.put(s, s); + }); + } + return initialStoreMap; + } + + /** + * 初始分区数量 + * Store数量 * 每Store最大副本数 /每分区副本数 + * + * @return + */ + public int getInitialPartitionCount() { + return getInitialStoreMap().size() * partition.getMaxShardsPerStore() + / partition.getShardCount(); + } + + public ConfigService getConfigService() { + return configService; + } + + public void setConfigService(ConfigService configService) { + this.configService = configService; + } + + public IdService getIdService() { + return idService; + } + + public void setIdService(IdService idService) { + this.idService = idService; + } + + @Data + @Configuration + public class ThreadPoolGrpc { + + @Value("${thread.pool.grpc.core:600}") + private int core; + @Value("${thread.pool.grpc.max:1000}") + private int max; + @Value("${thread.pool.grpc.queue:" + Integer.MAX_VALUE + "}") + private int queue; + } + + @Data + @Configuration + public class Raft { + + @Value("${raft.enable:true }") + 
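+        // when true, MetadataFactory wraps the local HgKVStore in a RaftKVStore so
+        // PD metadata writes replicate through jraft; false runs a standalone store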
private boolean enable; + @Value("${raft.address}") + private String address; + @Value("${pd.data-path}") + private String dataPath; + @Value("${raft.peers-list}") + private String peersList; + @Value("${raft.snapshotInterval: 300}") + private int snapshotInterval; + @Value("${raft.rpc-timeout:10000}") + private int rpcTimeout; + @Value("${grpc.host}") + private String host; + @Value("${server.port}") + private int port; + + @Value("${pd.cluster_id:1}") + private long clusterId; // 集群ID + @Value("${grpc.port}") + private int grpcPort; + + public String getGrpcAddress() { + return host + ":" + grpcPort; + } + } + + @Data + @Configuration + public class Store { + + // store 心跳超时时间 + @Value("${store.keepAlive-timeout:300}") + private long keepAliveTimeout = 300; + @Value("${store.max-down-time:1800}") + private long maxDownTime = 1800; + + @Value("${store.monitor_data_enabled:true}") + private boolean monitorDataEnabled = true; + + @Value("${store.monitor_data_interval: 1 minute}") + private String monitorDataInterval = "1 minute"; + + @Value("${store.monitor_data_retention: 1 day}") + private String monitorDataRetention = "1 day"; + + /** + * interval -> seconds. + * minimum value is 1 seconds. + * + * @return the seconds of the interval + */ + public Long getMonitorInterval() { + return parseTimeExpression(this.monitorDataInterval); + } + + /** + * the monitor data that saved in rocksdb, will be deleted + * out of period + * + * @return the period of the monitor data should keep + */ + public Long getRetentionPeriod() { + return parseTimeExpression(this.monitorDataRetention); + } + + /** + * parse time expression , support pattern: + * [1-9][ ](second, minute, hour, day, month, year) + * unit could not be null, the number part is 1 by default. + * + * @param exp + * @return seconds value of the expression. 
1 will return by illegal expression + */ + private Long parseTimeExpression(String exp) { + if (exp != null) { + Pattern pattern = Pattern.compile( + "(?(\\d+)*)(\\s)*(?(second|minute|hour|day|month|year)$)"); + Matcher matcher = pattern.matcher(exp.trim()); + if (matcher.find()) { + String n = matcher.group("n"); + String unit = matcher.group("unit"); + + if (null == n || n.length() == 0) { + n = "1"; + } + + Long interval; + switch (unit) { + case "minute": + interval = 60L; + break; + case "hour": + interval = 3600L; + break; + case "day": + interval = 86400L; + break; + case "month": + interval = 86400L * 30; + break; + case "year": + interval = 86400L * 365; + break; + case "second": + default: + interval = 1L; + } + // avoid n == '0' + return Math.max(1L, interval * Integer.parseInt(n)); + } + } + return 1L; + } + + } + + @Data + @Configuration + public class Partition { + + private int totalCount = 0; + + // 每个Store最大副本数 + @Value("${partition.store-max-shard-count:24}") + private int maxShardsPerStore = 24; + + // 默认分副本数量 + @Value("${partition.default-shard-count:3}") + private int shardCount = 3; + + public int getTotalCount() { + if (totalCount == 0) { + totalCount = getInitialPartitionCount(); + } + return totalCount; + } + + public void setTotalCount(int totalCount) { + this.totalCount = totalCount; + } + } + + @Data + @Configuration + public class Discovery { + + // 客户端注册后,无心跳最长次数,超过后,之前的注册信息会被删除 + @Value("${discovery.heartbeat-try-count:3}") + private int heartbeatOutTimes = 3; + } + +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/ConfigMetaStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/ConfigMetaStore.java new file mode 100644 index 0000000000..df332f46b6 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/ConfigMetaStore.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hugegraph.pd.meta; + +import java.util.List; +import java.util.Optional; + +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.Metapb; + +public class ConfigMetaStore extends MetadataRocksDBStore { + + private final long clusterId; + + public ConfigMetaStore(PDConfig pdConfig) { + super(pdConfig); + this.clusterId = pdConfig.getClusterId(); + } + + /** + * 更新图空间存储状态信息 + * + * @param + */ + public Metapb.GraphSpace setGraphSpace(Metapb.GraphSpace graphSpace) throws PDException { + byte[] graphSpaceKey = MetadataKeyHelper.getGraphSpaceKey(graphSpace.getName()); + graphSpace = graphSpace.toBuilder().setTimestamp(System.currentTimeMillis()).build(); + put(graphSpaceKey, graphSpace.toByteArray()); + return graphSpace; + } + + public List getGraphSpace(String graphSpace) throws PDException { + byte[] graphSpaceKey = MetadataKeyHelper.getGraphSpaceKey(graphSpace); + return scanPrefix(Metapb.GraphSpace.parser(), graphSpaceKey); + } + + public Metapb.PDConfig setPdConfig(Metapb.PDConfig pdConfig) throws PDException { + byte[] graphSpaceKey = + MetadataKeyHelper.getPdConfigKey(String.valueOf(pdConfig.getVersion())); + Metapb.PDConfig config = Metapb.PDConfig.newBuilder( + pdConfig).setTimestamp(System.currentTimeMillis()).build(); + put(graphSpaceKey, config.toByteArray()); + return config; + } + + public Metapb.PDConfig getPdConfig(long version) throws PDException { + byte[] graphSpaceKey = MetadataKeyHelper.getPdConfigKey(version <= 0 ? null : + String.valueOf(version)); + Optional max = scanPrefix( + Metapb.PDConfig.parser(), graphSpaceKey).stream().max( + (o1, o2) -> (o1.getVersion() > o2.getVersion()) ? 1 : -1); + return max.isPresent() ? max.get() : null; + } + +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/DiscoveryMetaStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/DiscoveryMetaStore.java new file mode 100644 index 0000000000..78bfe3473c --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/DiscoveryMetaStore.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hugegraph.pd.meta; + +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.discovery.NodeInfo; +import org.apache.hugegraph.pd.grpc.discovery.NodeInfos; +import org.apache.hugegraph.pd.grpc.discovery.Query; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class DiscoveryMetaStore extends MetadataRocksDBStore { + + /** + * appName --> address --> registryInfo + */ + private static final String PREFIX = "REGIS-"; + private static final String SPLITTER = "-"; + + public DiscoveryMetaStore(PDConfig pdConfig) { + super(pdConfig); + } + + public void register(NodeInfo nodeInfo, int outTimes) throws PDException { + putWithTTL(toKey(nodeInfo.getAppName(), nodeInfo.getVersion(), nodeInfo.getAddress()), + nodeInfo.toByteArray(), (nodeInfo.getInterval() / 1000) * outTimes); + } + + byte[] toKey(String appName, String version, String address) { + StringBuilder builder = getPrefixBuilder(appName, version); + builder.append(SPLITTER); + builder.append(address); + return builder.toString().getBytes(); + } + + private StringBuilder getPrefixBuilder(String appName, String version) { + StringBuilder builder = new StringBuilder(); + builder.append(PREFIX); + if (!StringUtils.isEmpty(appName)) { + builder.append(appName); + builder.append(SPLITTER); + } + if (!StringUtils.isEmpty(version)) { + builder.append(version); + } + return builder; + } + + public NodeInfos getNodes(Query query) { + List nodeInfos = null; + try { + StringBuilder builder = getPrefixBuilder(query.getAppName(), + query.getVersion()); + nodeInfos = getInstanceListWithTTL( + NodeInfo.parser(), + builder.toString().getBytes()); + builder.setLength(0); + } catch (PDException e) { + log.error("An error occurred getting data from the store,{}", e); + } + if (query.getLabelsMap() != null && !query.getLabelsMap().isEmpty()) { + List result = new LinkedList(); + for (NodeInfo node : nodeInfos) { + if (labelMatch(node, query)) { + result.add(node); + } + } + return NodeInfos.newBuilder().addAllInfo(result).build(); + } + return NodeInfos.newBuilder().addAllInfo(nodeInfos).build(); + + } + + private boolean labelMatch(NodeInfo node, Query query) { + Map labelsMap = node.getLabelsMap(); + for (Map.Entry entry : query.getLabelsMap().entrySet()) { + if (!entry.getValue().equals(labelsMap.get(entry.getKey()))) { + return false; + } + } + return true; + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/IdMetaStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/IdMetaStore.java new file mode 100644 index 0000000000..177e4255b5 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/IdMetaStore.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.store.KV;
+
+import com.caucho.hessian.io.Hessian2Input;
+import com.caucho.hessian.io.Hessian2Output;
+
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Implementation of auto-increment ids
+ */
+@Slf4j
+public class IdMetaStore extends MetadataRocksDBStore {
+
+    private static final String ID_PREFIX = "@ID@";
+    private static final String CID_PREFIX = "@CID@";
+    private static final String CID_SLOT_PREFIX = "@CID_SLOT@";
+    private static final String CID_DEL_SLOT_PREFIX = "@CID_DEL_SLOT@";
+    private static final String SEPARATOR = "@";
+    private static final ConcurrentHashMap<String, Object> SEQUENCES = new ConcurrentHashMap<>();
+    public static long CID_DEL_TIMEOUT = 24 * 3600 * 1000;
+    private final long clusterId;
+
+    public IdMetaStore(PDConfig pdConfig) {
+        super(pdConfig);
+        this.clusterId = pdConfig.getClusterId();
+    }
+
+    public static long bytesToLong(byte[] b) {
+        ByteBuffer buf = ByteBuffer.wrap(b);
+        return buf.getLong();
+    }
+
+    public static byte[] longToBytes(long l) {
+        // Long.BYTES (8) rather than Long.SIZE (64): a long occupies 8 bytes
+        ByteBuffer buf = ByteBuffer.wrap(new byte[Long.BYTES]);
+        buf.putLong(l);
+        buf.flip();
+        return buf.array();
+    }
+
+    /**
+     * Get an auto-increment id
+     *
+     * @param key   name of the sequence
+     * @param delta step to advance the sequence by
+     * @return the current value, before the increment
+     * @throws PDException
+     */
+    public long getId(String key, int delta) throws PDException {
+        Object probableLock = getLock(key);
+        byte[] keyBs = (ID_PREFIX + key).getBytes(Charset.defaultCharset());
+        synchronized (probableLock) {
+            byte[] bs = getOne(keyBs);
+            long current = bs != null ? bytesToLong(bs) : 0L;
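+            // the stored value is the next id to hand out: the caller receives
+            // `current`, while `current + delta` is persisted for the next call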
+            long next = current + delta;
+            put(keyBs, longToBytes(next));
+            return current;
+        }
+    }
+
+    private Object getLock(String key) {
+        Object probableLock = new Object();
+        Object currentLock = SEQUENCES.putIfAbsent(key, probableLock);
+        if (currentLock != null) {
+            probableLock = currentLock;
+        }
+        return probableLock;
+    }
+
+    public void resetId(String key) throws PDException {
+        Object probableLock = getLock(key);
+        byte[] keyBs = (ID_PREFIX + key).getBytes(Charset.defaultCharset());
+        synchronized (probableLock) {
+            removeByPrefix(keyBs);
+        }
+    }
+
+    /**
+     * Within 24 hours of deleting the cid identified by `name`, repeated requests
+     * for a cid under the same name return the same value. This is designed to
+     * prevent cache inconsistencies from corrupting data.
+     *
+     * @param key
+     * @param name identifier of the cid
+     * @param max
+     * @return
+     * @throws PDException
+     */
+    public long getCId(String key, String name, long max) throws PDException {
+        // Check for expired cids. Graphs are deleted infrequently,
+        // so the performance impact here is small.
+        byte[] delKeyPrefix = (CID_DEL_SLOT_PREFIX +
+                               key + SEPARATOR).getBytes(Charset.defaultCharset());
+        synchronized (this) {
+            scanPrefix(delKeyPrefix).forEach(kv -> {
+                long[] value = (long[]) deserialize(kv.getValue());
+                if (value.length >= 2) {
+                    if (System.currentTimeMillis() - value[1] > CID_DEL_TIMEOUT) {
+                        try {
+                            delCId(key, value[0]);
+                            remove(kv.getKey());
+                        } catch (Exception e) {
+                            log.error("Exception ", e);
+                        }
+                    }
+                }
+            });
+
+            // Recover the key from the delayed-deletion queue
+            byte[] cidDelayKey = getCIDDelayKey(key, name);
+            byte[] value = getOne(cidDelayKey);
+            if (value != null) {
+                // Remove it from the delayed-deletion queue
+                remove(cidDelayKey);
+                return ((long[]) deserialize(value))[0];
+            } else {
+                return getCId(key, max);
+            }
+        }
+    }
+
+    /**
+     * Add to the deletion queue for deferred deletion
+     */
+    public long delCIdDelay(String key, String name, long cid) throws PDException {
+        byte[] delKey = getCIDDelayKey(key, name);
+        put(delKey, serialize(new long[]{cid, System.currentTimeMillis()}));
+        return cid;
+    }
+
+    /**
+     * Get a cyclic, non-repeating auto-increment id; wraps to 0 after reaching
+     * the upper bound
+     *
+     * @param key
+     * @param max upper bound of the id; once reached, counting restarts from 0
+     * @return
+     * @throws PDException
+     */
+    public long getCId(String key, long max) throws PDException {
+        Object probableLock = getLock(key);
+        byte[] keyBs = (CID_PREFIX + key).getBytes(Charset.defaultCharset());
+        synchronized (probableLock) {
+            byte[] bs = getOne(keyBs);
+            long current = bs != null ? bytesToLong(bs) : 0L;
+            long last = current == 0 ? max - 1 : current - 1;
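+            // `last` is the slot just before `current` in the circular id space;
+            // the scans below walk used slots in order to find the first free value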
+            { // look for an unused cid
+                List<KV> kvs = scanRange(genCIDSlotKey(key, current), genCIDSlotKey(key, max));
+                for (KV kv : kvs) {
+                    if (current == bytesToLong(kv.getValue())) {
+                        current++;
+                    } else {
+                        break;
+                    }
+                }
+            }
+            if (current == max) {
+                current = 0;
+                List<KV> kvs = scanRange(genCIDSlotKey(key, current), genCIDSlotKey(key, last));
+                for (KV kv : kvs) {
+                    if (current == bytesToLong(kv.getValue())) {
+                        current++;
+                    } else {
+                        break;
+                    }
+                }
+            }
+            if (current == last) {
+                return -1;
+            }
+            put(genCIDSlotKey(key, current), longToBytes(current));
+            put(keyBs, longToBytes(current + 1));
+            return current;
+        }
+    }
+
+    private byte[] genCIDSlotKey(String key, long value) {
+        byte[] keySlot = (CID_SLOT_PREFIX + key + SEPARATOR).getBytes(Charset.defaultCharset());
+        // Long.BYTES, consistent with the 8-byte arrays produced by longToBytes
+        ByteBuffer buf = ByteBuffer.allocate(keySlot.length + Long.BYTES);
+        buf.put(keySlot);
+        buf.put(longToBytes(value));
+        return buf.array();
+    }
+
+    private byte[] getCIDDelayKey(String key, String name) {
+        byte[] bsKey = (CID_DEL_SLOT_PREFIX +
+                        key + SEPARATOR +
+                        name).getBytes(Charset.defaultCharset());
+        return bsKey;
+    }
+
+    /**
+     * Delete a cyclic id, releasing its value
+     *
+     * @param key
+     * @param cid
+     * @return
+     * @throws PDException
+     */
+    public long delCId(String key, long cid) throws PDException {
+        return remove(genCIDSlotKey(key, cid));
+    }
+
+    private byte[] serialize(Object obj) {
+        try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
+            Hessian2Output output = new Hessian2Output(bos);
+            output.writeObject(obj);
+            output.flush();
+            return bos.toByteArray();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private Object deserialize(byte[] bytes) {
+        try (ByteArrayInputStream bis = new ByteArrayInputStream(bytes)) {
+            Hessian2Input input = new Hessian2Input(bis);
+            Object obj = input.readObject();
+            input.close();
+            return obj;
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/LogMeta.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/LogMeta.java
new file mode 100644
index 0000000000..ee791d5e04
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/LogMeta.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hugegraph.pd.meta; + +import java.util.List; + +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.Metapb; + +public class LogMeta extends MetadataRocksDBStore { + + private final PDConfig pdConfig; + + public LogMeta(PDConfig pdConfig) { + super(pdConfig); + this.pdConfig = pdConfig; + } + + public void insertLog(Metapb.LogRecord record) throws PDException { + byte[] storeLogKey = MetadataKeyHelper.getLogKey(record); + put(storeLogKey, record.toByteArray()); + + } + + public List getLog(String action, Long start, Long end) throws PDException { + byte[] keyStart = MetadataKeyHelper.getLogKeyPrefix(action, start); + byte[] keyEnd = MetadataKeyHelper.getLogKeyPrefix(action, end); + List stores = this.scanRange(Metapb.LogRecord.parser(), + keyStart, keyEnd); + return stores; + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataFactory.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataFactory.java new file mode 100644 index 0000000000..cc247041cf --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataFactory.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd.meta; + +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.raft.RaftEngine; +import org.apache.hugegraph.pd.store.HgKVStore; +import org.apache.hugegraph.pd.store.HgKVStoreImpl; +import org.apache.hugegraph.pd.store.RaftKVStore; + +/** + * 存储工厂类,创建相关对象的存储类 + */ +public class MetadataFactory { + + private static HgKVStore store = null; + + public static HgKVStore getStore(PDConfig pdConfig) { + if (store == null) { + synchronized (MetadataFactory.class) { + if (store == null) { + HgKVStore proto = new HgKVStoreImpl(); + //proto.init(pdConfig); + store = pdConfig.getRaft().isEnable() ? 
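+                    // raft enabled: wrap the base store so every mutation goes
+                    // through the jraft state machine; otherwise use it directly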
+ new RaftKVStore(RaftEngine.getInstance(), proto) : + proto; + store.init(pdConfig); + } + } + } + return store; + } + + public static void closeStore() { + if (store != null) { + store.close(); + } + } + + public static StoreInfoMeta newStoreInfoMeta(PDConfig pdConfig) { + return new StoreInfoMeta(pdConfig); + } + + public static PartitionMeta newPartitionMeta(PDConfig pdConfig) { + return new PartitionMeta(pdConfig); + } + + public static IdMetaStore newHugeServerMeta(PDConfig pdConfig) { + return new IdMetaStore(pdConfig); + } + + public static DiscoveryMetaStore newDiscoveryMeta(PDConfig pdConfig) { + return new DiscoveryMetaStore(pdConfig); + } + + public static ConfigMetaStore newConfigMeta(PDConfig pdConfig) { + return new ConfigMetaStore(pdConfig); + } + + public static TaskInfoMeta newTaskInfoMeta(PDConfig pdConfig) { + return new TaskInfoMeta(pdConfig); + } + + public static QueueStore newQueueStore(PDConfig pdConfig) { + return new QueueStore(pdConfig); + } + + public static LogMeta newLogMeta(PDConfig pdConfig) { + return new LogMeta(pdConfig); + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataKeyHelper.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataKeyHelper.java new file mode 100644 index 0000000000..193b3b7229 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataKeyHelper.java @@ -0,0 +1,379 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hugegraph.pd.meta; + +import java.nio.charset.Charset; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hugegraph.pd.grpc.Metapb; + +public class MetadataKeyHelper { + + public static final char DELIMITER = '/'; + + private static final String STORE = "STORE"; + private static final String ACTIVESTORE = "ACTIVESTORE"; + private static final String STORESTATUS = "STORESTATUS"; + private static final String PARTITION = "PARTITION"; + private static final String PARTITION_V36 = "PARTITION_V36"; + private static final String SHARDGROUP = "SHARDGROUP"; + + private static final String PARTITION_STATUS = "PARTITION_STATUS"; + private static final String GRAPH = "GRAPH"; + private static final String GRAPHMETA = "GRAPHMETA"; + private static final String GRAPH_SPACE = "GRAPH_SPACE"; + private static final String PD_CONFIG = "PD_CONFIG"; + private static final String TASK_SPLIT = "TASK_SPLIT"; + private static final String TASK_MOVE = "TASK_MOVE"; + private static final String LOG_RECORD = "LOG_RECORD"; + + private static final String QUEUE = "QUEUE"; + + public static byte[] getStoreInfoKey(final long storeId) { + //STORE/{storeId} + String key = StringBuilderHelper.get() + .append(STORE).append(DELIMITER) + .append(storeId) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getActiveStoreKey(final long storeId) { + //ACTIVESTORE/{storeId} + String key = StringBuilderHelper.get() + .append(ACTIVESTORE).append(DELIMITER) + .append(storeId) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getActiveStorePrefix() { + //ACTIVESTORE + String key = StringBuilderHelper.get() + .append(ACTIVESTORE).append(DELIMITER) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getStorePrefix() { + //STORE + String key = StringBuilderHelper.get() + .append(STORE).append(DELIMITER) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getStoreStatusKey(final long storeId) { + //STORESTATUS/{storeId} + String key = StringBuilderHelper.get() + .append(STORESTATUS).append(DELIMITER) + .append(storeId) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getShardGroupKey(final long groupId) { + //SHARDGROUP/{storeId} + String key = StringBuilderHelper.get() + .append(SHARDGROUP).append(DELIMITER) + .append(groupId) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getShardGroupPrefix() { + //SHARDGROUP + String key = StringBuilderHelper.get() + .append(SHARDGROUP).append(DELIMITER) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getPartitionKey(final String graphName, final int partId) { + //GRAPH/{graphName}/Partition/{partId} + String key = StringBuilderHelper.get() + .append(GRAPH).append(DELIMITER) + .append(graphName).append(DELIMITER) + .append(PARTITION).append(DELIMITER) + .append(partId) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getPartitionV36Key(final String graphName, final int partId) { + // GRAPH/{graphName}/PartitionV36/{partId} + String key = StringBuilderHelper.get() + .append(GRAPH).append(DELIMITER) + .append(graphName).append(DELIMITER) + .append(PARTITION_V36).append(DELIMITER) + .append(partId) + .toString(); + return key.getBytes(Charset.defaultCharset()); + } + + public static byte[] getPartitionPrefix(final String 
graphName) {
+        //GRAPH/{graphName}/PARTITION/
+        String key = StringBuilderHelper.get()
+                                        .append(GRAPH).append(DELIMITER)
+                                        .append(graphName).append(DELIMITER)
+                                        .append(PARTITION).append(DELIMITER)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getShardKey(final long storeId, final int partId) {
+        //SHARDGROUP/{storeId}/{partId}
+        String key = StringBuilderHelper.get()
+                                        .append(SHARDGROUP).append(DELIMITER)
+                                        .append(storeId).append(DELIMITER)
+                                        .append(partId)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getShardPrefix(final long storeId) {
+        //SHARDGROUP/{storeId}/
+        String key = StringBuilderHelper.get()
+                                        .append(SHARDGROUP).append(DELIMITER)
+                                        .append(storeId).append(DELIMITER)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getGraphKey(final String graphName) {
+        //GRAPHMETA/{graphName}/
+        String key = StringBuilderHelper.get()
+                                        .append(GRAPHMETA).append(DELIMITER)
+                                        .append(graphName).append(DELIMITER)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getGraphPrefix() {
+        //GRAPHMETA/
+        String key = StringBuilderHelper.get()
+                                        .append(GRAPHMETA).append(DELIMITER)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getPartitionStatusKey(String graphName, int id) {
+        //PARTITION_STATUS/{graphName}/{id}/
+        String key = StringBuilderHelper.get()
+                                        .append(PARTITION_STATUS)
+                                        .append(DELIMITER)
+                                        .append(graphName).append(DELIMITER)
+                                        .append(id).append(DELIMITER)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getPartitionStatusPrefixKey(String graphName) {
+        //PARTITION_STATUS/{graphName}/
+        StringBuilder builder = StringBuilderHelper.get().append(PARTITION_STATUS)
+                                                   .append(DELIMITER);
+        if (!StringUtils.isEmpty(graphName)) {
+            builder.append(graphName).append(DELIMITER);
+        }
+        return builder.toString().getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getGraphSpaceKey(String graphSpace) {
+        //GRAPH_SPACE/{graphSpace}/
+        StringBuilder builder = StringBuilderHelper.get().append(
+                GRAPH_SPACE).append(DELIMITER);
+        if (!StringUtils.isEmpty(graphSpace)) {
+            builder.append(graphSpace).append(DELIMITER);
+        }
+        return builder.toString().getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getPdConfigKey(String configKey) {
+        //PD_CONFIG/{configKey}/
+        StringBuilder builder = StringBuilderHelper.get().append(
+                PD_CONFIG).append(DELIMITER);
+        if (!StringUtils.isEmpty(configKey)) {
+            builder.append(configKey).append(DELIMITER);
+        }
+        return builder.toString().getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getQueueItemPrefix() {
+        //QUEUE/
+        String key = StringBuilderHelper.get()
+                                        .append(QUEUE).append(DELIMITER)
+                                        .toString();
+        return key.getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getQueueItemKey(String itemId) {
+        //QUEUE/{itemId}/
+        StringBuilder builder = StringBuilderHelper.get()
+                                                   .append(QUEUE).append(DELIMITER);
+        if (!StringUtils.isEmpty(itemId)) {
+            builder.append(itemId).append(DELIMITER);
+        }
+        return builder.toString().getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[] getSplitTaskKey(String graphName, int groupId) {
+        // TASK_SPLIT/{graphName}/{groupId}
+        StringBuilder builder = StringBuilderHelper.get()
+                                                   .append(TASK_SPLIT).append(DELIMITER)
+                                                   .append(graphName).append(DELIMITER)
+                                                   .append(groupId);
+        return builder.toString().getBytes(Charset.defaultCharset());
+    }
+
+    public static byte[]
getSplitTaskPrefix(String graphName) { + // TASK_SPLIT/{GraphName}/ + StringBuilder builder = StringBuilderHelper.get() + .append(TASK_SPLIT).append(DELIMITER) + .append(graphName); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getAllSplitTaskPrefix() { + // TASK_SPLIT/{GraphName}/ + StringBuilder builder = StringBuilderHelper.get() + .append(TASK_SPLIT).append(DELIMITER); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getMoveTaskKey(String graphName, int targetGroupId, int groupId) { + // TASK_MOVE/{GraphName}/to PartitionID/{source partitionID} + StringBuilder builder = StringBuilderHelper.get() + .append(TASK_MOVE).append(DELIMITER) + .append(graphName).append(DELIMITER) + .append(targetGroupId).append(DELIMITER) + .append(groupId); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getMoveTaskPrefix(String graphName) { + // TASK_MOVE/{graphName}/toPartitionId/ + StringBuilder builder = StringBuilderHelper.get() + .append(TASK_MOVE).append(DELIMITER) + .append(graphName); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getAllMoveTaskPrefix() { + // TASK_MOVE/{graphName}/toPartitionId/ + StringBuilder builder = StringBuilderHelper.get() + .append(TASK_MOVE).append(DELIMITER); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getLogKey(Metapb.LogRecord record) { + //LOG_RECORD/{action}/{time}/ + StringBuilder builder = StringBuilderHelper.get() + .append(LOG_RECORD) + .append(DELIMITER) + .append(record.getAction()) + .append(DELIMITER) + .append(record.getTimestamp()); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getLogKeyPrefix(String action, long time) { + //LOG_RECORD/{action}/{time}/ + StringBuilder builder = StringBuilderHelper.get() + .append(LOG_RECORD) + .append(DELIMITER) + .append(action) + .append(DELIMITER) + .append(time); + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getKVPrefix(String prefix, String key) { + //K@/{key} + StringBuilder builder = StringBuilderHelper.get() + .append(prefix).append(DELIMITER); + if (!StringUtils.isEmpty(key)) { + builder.append(key).append(DELIMITER); + } + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static byte[] getKVTTLPrefix(String ttlPrefix, String prefix, String key) { + StringBuilder builder = StringBuilderHelper.get().append(ttlPrefix) + .append(prefix).append(DELIMITER); + if (!StringUtils.isEmpty(key)) { + builder.append(key).append(DELIMITER); + } + return builder.toString().getBytes(Charset.defaultCharset()); + } + + public static String getKVWatchKeyPrefix(String key, String watchDelimiter, long clientId) { + StringBuilder builder = StringBuilderHelper.get(); + builder.append(watchDelimiter).append(DELIMITER); + builder.append(key == null ? "" : key).append(DELIMITER); + builder.append(clientId); + return builder.toString(); + } + + public static String getKVWatchKeyPrefix(String key, String watchDelimiter) { + StringBuilder builder = StringBuilderHelper.get(); + builder.append(watchDelimiter).append(DELIMITER); + builder.append(key == null ? 
"" : key).append(DELIMITER); + return builder.toString(); + } + + public static char getDelimiter() { + return DELIMITER; + } + + public static StringBuilder getStringBuilderHelper() { + return StringBuilderHelper.get(); + } + + static class StringBuilderHelper { + + private static final int DISCARD_LIMIT = 1024 << 3; // 8k + + private static final ThreadLocal holderThreadLocal = ThreadLocal + .withInitial(StringBuilderHolder::new); + + public static StringBuilder get() { + final StringBuilderHolder holder = holderThreadLocal.get(); + return holder.getStringBuilder(); + } + + public static void truncate() { + final StringBuilderHolder holder = holderThreadLocal.get(); + holder.truncate(); + } + + private static class StringBuilderHolder { + + private final StringBuilder buf = new StringBuilder(); + + private StringBuilder getStringBuilder() { + truncate(); + return buf; + } + + private void truncate() { + buf.setLength(0); + } + } + } + +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataRocksDBStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataRocksDBStore.java new file mode 100644 index 0000000000..bf77e41c05 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataRocksDBStore.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Pdpb;
+import org.apache.hugegraph.pd.store.HgKVStore;
+import org.apache.hugegraph.pd.store.KV;
+
+import com.google.protobuf.Parser;
+
+public class MetadataRocksDBStore extends MetadataStoreBase {
+
+    HgKVStore store;
+
+    PDConfig pdConfig;
+
+    public MetadataRocksDBStore(PDConfig pdConfig) {
+        store = MetadataFactory.getStore(pdConfig);
+        this.pdConfig = pdConfig;
+    }
+
+    public HgKVStore getStore() {
+        if (store == null) {
+            store = MetadataFactory.getStore(pdConfig);
+        }
+        return store;
+    }
+
+    @Override
+    public byte[] getOne(byte[] key) throws PDException {
+        try {
+            return store.get(key);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public <E> E getOne(Parser<E> parser, byte[] key) throws PDException {
+        try {
+            byte[] bytes = store.get(key);
+            if (ArrayUtils.isEmpty(bytes)) {
+                return null;
+            }
+            return parser.parseFrom(bytes);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public void put(byte[] key, byte[] value) throws PDException {
+        try {
+            getStore().put(key, value);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_WRITE_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public void putWithTTL(byte[] key, byte[] value, long ttl) throws PDException {
+        this.store.putWithTTL(key, value, ttl);
+    }
+
+    @Override
+    public void putWithTTL(byte[] key, byte[] value, long ttl, TimeUnit timeUnit)
+            throws PDException {
+        this.store.putWithTTL(key, value, ttl, timeUnit);
+    }
+
+    @Override
+    public byte[] getWithTTL(byte[] key) throws PDException {
+        return this.store.getWithTTL(key);
+    }
+
+    @Override
+    public List<byte[]> getListWithTTL(byte[] key) throws PDException {
+        return this.store.getListWithTTL(key);
+    }
+
+    @Override
+    public void removeWithTTL(byte[] key) throws PDException {
+        this.store.removeWithTTL(key);
+    }
+
+    @Override
+    public List<KV> scanPrefix(byte[] prefix) throws PDException {
+        // TODO: switch to a native rocksdb prefix seek
+        try {
+            return this.store.scanPrefix(prefix);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public List<KV> scanRange(byte[] start, byte[] end) throws PDException {
+        return this.store.scanRange(start, end);
+    }
+
+    @Override
+    public <E> List<E> scanRange(Parser<E> parser, byte[] start, byte[] end) throws PDException {
+        List<E> stores = new LinkedList<>();
+        try {
+            List<KV> kvs = this.scanRange(start, end);
+            for (KV keyValue : kvs) {
+                stores.add(parser.parseFrom(keyValue.getValue()));
+            }
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+        return stores;
+    }
+
+    @Override
+    public <E> List<E> scanPrefix(Parser<E> parser, byte[] prefix) throws PDException {
+        List<E> stores = new LinkedList<>();
+        try {
+            List<KV> kvs = this.scanPrefix(prefix);
+            for (KV keyValue : kvs) {
+                stores.add(parser.parseFrom(keyValue.getValue()));
+            }
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+        return stores;
+    }
+
+    @Override
+    public boolean containsKey(byte[] key) throws PDException {
+        return !ArrayUtils.isEmpty(store.get(key));
+    }
+
+    @Override
+    public long remove(byte[] key) throws PDException {
+        try {
+            return this.store.remove(key);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_WRITE_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public long removeByPrefix(byte[] prefix) throws PDException {
+        try {
+            return this.store.removeByPrefix(prefix);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_WRITE_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public void clearAllCache() throws PDException {
+        this.store.clear();
+    }
+
+    @Override
+    public void close() {
+
+    }
+}
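The class above is the thin typed layer the rest of hg-pd-core builds on: MetadataKeyHelper composes the byte[] key and a protobuf Parser decodes the value. A minimal read sketch, not part of the patch, assuming it sits in the same org.apache.hugegraph.pd.meta package and receives an already wired-up store:

    import java.util.List;

    import org.apache.hugegraph.pd.common.PDException;
    import org.apache.hugegraph.pd.grpc.Metapb;

    class MetaReadSketch {
        static List<Metapb.Store> readStores(MetadataRocksDBStore meta) throws PDException {
            // point read under STORE/{storeId}; getOne() returns null for an absent key
            Metapb.Store one = meta.getOne(Metapb.Store.parser(),
                                           MetadataKeyHelper.getStoreInfoKey(1L));
            if (one == null) {
                // store 1 has not registered yet
            }
            // prefix scan: decode every record under STORE/
            return meta.scanPrefix(Metapb.Store.parser(), MetadataKeyHelper.getStorePrefix());
        }
    }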
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataStoreBase.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataStoreBase.java
new file mode 100644
index 0000000000..4cd9e1d364
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/MetadataStoreBase.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.grpc.Pdpb;
+import org.apache.hugegraph.pd.store.KV;
+
+import com.google.protobuf.Parser;
+
+public abstract class MetadataStoreBase {
+
+    // public long timeout = 3; // request timeout in seconds, three by default
+
+    public abstract byte[] getOne(byte[] key) throws PDException;
+
+    public abstract <E> E getOne(Parser<E> parser, byte[] key) throws PDException;
+
+    public abstract void put(byte[] key, byte[] value) throws PDException;
+
+    /**
+     * put with a time-to-live
+     */
+
+    public abstract void putWithTTL(byte[] key,
+                                    byte[] value,
+                                    long ttl) throws PDException;
+
+    public abstract void putWithTTL(byte[] key,
+                                    byte[] value,
+                                    long ttl, TimeUnit timeUnit) throws PDException;
+
+    public abstract byte[] getWithTTL(byte[] key) throws PDException;
+
+    public abstract List<byte[]> getListWithTTL(byte[] key) throws PDException;
+
+    public abstract void removeWithTTL(byte[] key) throws PDException;
+
+    /**
+     * prefix scan
+     *
+     * @param prefix key prefix to match
+     * @return matching key-value pairs
+     * @throws PDException when the read fails
+     */
+    public abstract List<KV> scanPrefix(byte[] prefix) throws PDException;
+
+    /**
+     * prefix scan, deserializing each value with the given parser
+     *
+     * @param prefix key prefix to match
+     * @return deserialized values
+     * @throws PDException when the read fails
+     */
+
+    public abstract <E> List<E> scanPrefix(Parser<E> parser, byte[] prefix) throws PDException;
+
+    public abstract List<KV> scanRange(byte[] start, byte[] end) throws PDException;
+
+    public abstract <E> List<E> scanRange(Parser<E> parser, byte[] start, byte[] end) throws
+                                                                                      PDException;
+
+    /**
+     * check whether a key exists
+     *
+     * @param key key to probe
+     * @return true if present
+     * @throws PDException when the read fails
+     */
+
+    public abstract boolean containsKey(byte[] key) throws PDException;
+
+    public abstract long remove(byte[] key) throws PDException;
+
+    public abstract long removeByPrefix(byte[] prefix) throws PDException;
+
+    public abstract void clearAllCache() throws PDException;
+
+    public abstract void close() throws IOException;
+
+    public <T> T getInstanceWithTTL(Parser<T> parser, byte[] key) throws PDException {
+        try {
+            byte[] withTTL = this.getWithTTL(key);
+            return parser.parseFrom(withTTL);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+    }
+
+    public <T> List<T> getInstanceListWithTTL(Parser<T> parser, byte[] key)
+            throws PDException {
+        try {
+            List<byte[]> withTTL = this.getListWithTTL(key);
+            LinkedList<T> ts = new LinkedList<>();
+            for (int i = 0; i < withTTL.size(); i++) {
+                ts.add(parser.parseFrom(withTTL.get(i)));
+            }
+            return ts;
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        }
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/PartitionMeta.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/PartitionMeta.java
new file mode 100644
index 0000000000..713a0046d7
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/PartitionMeta.java
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.common.PartitionCache;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Metapb;
+
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Partition metadata management
+ */
+@Slf4j
+public class PartitionMeta extends MetadataRocksDBStore {
+
+    static String CID_GRAPH_ID_KEY = "GraphID";
+    static int CID_GRAPH_ID_MAX = 0xFFFE;
+    private final PDConfig pdConfig;
+    private final PartitionCache cache;
+
+    public PartitionMeta(PDConfig pdConfig) {
+        super(pdConfig);
+        this.pdConfig = pdConfig;
+        //this.timeout = pdConfig.getEtcd().getTimeout();
+        this.cache = new PartitionCache();
+    }
+
+    /**
+     * Initialization: load all partitions
+     */
+    public void init() throws PDException {
+        loadShardGroups();
+        loadGraphs();
+    }
+
+    public void reload() throws PDException {
+        cache.clear();
+        loadShardGroups();
+        loadGraphs();
+    }
+
+    private void loadGraphs() throws PDException {
+        byte[] key = MetadataKeyHelper.getGraphPrefix();
+        List<Metapb.Graph> graphs = scanPrefix(Metapb.Graph.parser(), key);
+        for (Metapb.Graph graph : graphs) {
+            cache.updateGraph(graph);
+            loadPartitions(graph);
+        }
+    }
+
+    /**
+     * Partitions and shard groups are stored separately, so both need to be
+     * loaded back during init
+     *
+     * @throws PDException when the scan fails
+     */
+    private void loadShardGroups() throws PDException {
+        byte[] shardGroupPrefix = MetadataKeyHelper.getShardGroupPrefix();
+        for (var shardGroup : scanPrefix(Metapb.ShardGroup.parser(), shardGroupPrefix)) {
+            cache.updateShardGroup(shardGroup);
+        }
+    }
+
+    private void loadPartitions(Metapb.Graph graph) throws PDException {
+        byte[] prefix = MetadataKeyHelper.getPartitionPrefix(graph.getGraphName());
+        List<Metapb.Partition> partitions = scanPrefix(Metapb.Partition.parser(), prefix);
+        partitions.forEach(p -> {
+            cache.updatePartition(p);
+        });
+    }
+
+    /**
+     * Look up a partition by id, trying the cache first and falling back to the store
+     *
+     * @param graphName graph name
+     * @param partId partition id
+     * @return the partition, or null if absent
+     * @throws PDException when the read fails
+     */
+    public Metapb.Partition getPartitionById(String graphName, int partId) throws PDException {
+        var pair = cache.getPartitionById(graphName, partId);
+        Metapb.Partition partition;
+        if (pair == null) {
+            byte[] key = MetadataKeyHelper.getPartitionKey(graphName, partId);
+            partition = getOne(Metapb.Partition.parser(), key);
+            if (partition != null) {
+                cache.updatePartition(partition);
+            }
+        } else {
+            partition = pair.getKey();
+        }
+        return partition;
+    }
+
+    public List<Metapb.Partition> getPartitionById(int partId) throws PDException {
+        List<Metapb.Partition> partitions = new ArrayList<>();
+        cache.getGraphs().forEach(graph -> {
+            cache.getPartitions(graph.getGraphName()).forEach(partition -> {
+                if (partition.getId() == partId) {
+                    partitions.add(partition);
+                }
+            });
+        });
+        return partitions;
+    }
+
+    /**
+     * Look up a partition by hash code
+     */
+    public Metapb.Partition getPartitionByCode(String graphName, long code) throws PDException {
+        var pair = cache.getPartitionByCode(graphName, code);
+        if (pair != null) {
+            return pair.getKey();
+        }
+        return null;
+    }
+
+    public Metapb.Graph getAndCreateGraph(String graphName) throws PDException {
+        return getAndCreateGraph(graphName, pdConfig.getPartition().getTotalCount());
+    }
+
+    public Metapb.Graph getAndCreateGraph(String graphName, int partitionCount) throws PDException {
+
+        if (partitionCount > pdConfig.getPartition().getTotalCount()) {
+            partitionCount = pdConfig.getPartition().getTotalCount();
+        }
+
+        // system/management graphs have exactly one partition
+        if (graphName.endsWith("/s") || graphName.endsWith("/m")) {
+            partitionCount = 1;
+        }
+
+        Metapb.Graph graph = cache.getGraph(graphName);
+        if (graph == null) {
+            // persist the graph info
+            graph = Metapb.Graph.newBuilder()
+                                .setGraphName(graphName)
+                                .setPartitionCount(partitionCount)
+                                .setState(Metapb.PartitionState.PState_Normal)
+                                .build();
+            updateGraph(graph);
+        }
+        return graph;
+    }
+
+    /**
+     * Persist the partition info
+     *
+     * @param partition partition to save
+     * @return the saved partition
+     * @throws PDException when the write fails
+     */
+    public Metapb.Partition updatePartition(Metapb.Partition partition) throws PDException {
+        if (!cache.hasGraph(partition.getGraphName())) {
+            getAndCreateGraph(partition.getGraphName());
+        }
+        byte[] key = MetadataKeyHelper.getPartitionKey(partition.getGraphName(), partition.getId());
+        put(key, partition.toByteString().toByteArray());
+        cache.updatePartition(partition);
+        return partition;
+    }
+
+    /**
+     * Make sure the graph exists in the store, creating it if missing, and
+     * update the partition's version, conf version and shard list
+     *
+     * @param partition partition carrying the new versions
+     * @return the input partition
+     * @throws PDException when the write fails
+     */
+    public Metapb.Partition updateShardList(Metapb.Partition partition) throws PDException {
+        if (!cache.hasGraph(partition.getGraphName())) {
+            getAndCreateGraph(partition.getGraphName());
+        }
+
+        Metapb.Partition pt = getPartitionById(partition.getGraphName(), partition.getId());
+        // pt = pt.toBuilder().setVersion(partition.getVersion())
+        //                    .setConfVer(partition.getConfVer())
+        //                    .clearShards()
+        //                    .addAllShards(partition.getShardsList()).build();
+
+        byte[] key = MetadataKeyHelper.getPartitionKey(pt.getGraphName(), pt.getId());
+        put(key, pt.toByteString().toByteArray());
+        cache.updatePartition(pt);
+        return partition;
+    }
+
+    /**
+     * Remove all partitions of a graph
+     */
+    public long removeAllPartitions(String graphName) throws PDException {
+        cache.removeAll(graphName);
+        byte[] prefix = MetadataKeyHelper.getPartitionPrefix(graphName);
+        return removeByPrefix(prefix);
+    }
+
+    public long removePartition(String graphName, int id) throws PDException {
+        cache.remove(graphName, id);
+        byte[] key = MetadataKeyHelper.getPartitionKey(graphName, id);
+        return remove(key);
+    }
+
+    public void updatePartitionStats(Metapb.PartitionStats stats) throws PDException {
+        for (String graphName : stats.getGraphNameList()) {
+            byte[] prefix = MetadataKeyHelper.getPartitionStatusKey(graphName, stats.getId());
+            put(prefix, stats.toByteArray());
+        }
+    }
+
+    /**
+     * Get the statistics of one partition
+     */
+    public Metapb.PartitionStats getPartitionStats(String graphName, int id) throws PDException {
+        byte[] prefix = MetadataKeyHelper.getPartitionStatusKey(graphName, id);
+        return getOne(Metapb.PartitionStats.parser(), prefix);
+    }
+
+    /**
+     * Get the statistics of all partitions of a graph
+     */
+    public List<Metapb.PartitionStats> getPartitionStats(String graphName) throws PDException {
+        byte[] prefix = MetadataKeyHelper.getPartitionStatusPrefixKey(graphName);
+        return scanPrefix(Metapb.PartitionStats.parser(), prefix);
+    }
+
+    /**
+     * Update the graph info
+     *
+     * @param graph graph to save
+     * @return the saved graph
+     */
+    public Metapb.Graph updateGraph(Metapb.Graph graph) throws PDException {
+        log.info("updateGraph {}", graph);
+        byte[] key = MetadataKeyHelper.getGraphKey(graph.getGraphName());
+        // persist the graph info
+        put(key, graph.toByteString().toByteArray());
+        cache.updateGraph(graph);
+        return graph;
+    }
+
+    public List<Metapb.Partition> getPartitions() {
+        List<Metapb.Partition> partitions = new ArrayList<>();
+        List<Metapb.Graph> graphs = cache.getGraphs();
+        graphs.forEach(e -> {
+            partitions.addAll(cache.getPartitions(e.getGraphName()));
+        });
+        return partitions;
+    }
+
+    public List<Metapb.Partition> getPartitions(String graphName) {
+        return cache.getPartitions(graphName);
+    }
+
+    public List<Metapb.Graph> getGraphs() throws PDException {
+        byte[] key = MetadataKeyHelper.getGraphPrefix();
+        return scanPrefix(Metapb.Graph.parser(), key);
+    }
+
+    public Metapb.Graph getGraph(String graphName) throws PDException {
+        byte[] key = MetadataKeyHelper.getGraphKey(graphName);
+        return getOne(Metapb.Graph.parser(), key);
+    }
+
+    /**
+     * Remove the graph together with its graph id
+     */
+    public long removeGraph(String graphName) throws PDException {
+        byte[] key = MetadataKeyHelper.getGraphKey(graphName);
+        return remove(key);
+    }
+
+    public PartitionCache getPartitionCache() {
+        return cache;
+    }
+}
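PartitionMeta keeps a write-through PartitionCache in front of RocksDB: lookups try the cache and warm it on a miss, writes update both. A hedged sketch of the read path; the graph name and partition id are illustrative:

    // getAndCreateGraph() persists the graph on first contact; names ending in
    // "/s" or "/m" are clamped to a single partition by the code above
    Metapb.Partition readThrough(PartitionMeta meta) throws PDException {
        Metapb.Graph g = meta.getAndCreateGraph("hugegraph/g");
        return meta.getPartitionById(g.getGraphName(), 0); // cache first, then store
    }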
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/QueueStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/QueueStore.java
new file mode 100644
index 0000000000..e1b8437a48
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/QueueStore.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.util.List;
+
+import org.apache.hugegraph.pd.common.HgAssert;
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Metapb;
+import org.apache.hugegraph.pd.raft.RaftEngine;
+import org.apache.hugegraph.pd.store.RaftKVStore;
+
+public class QueueStore extends MetadataRocksDBStore {
+
+    QueueStore(PDConfig pdConfig) {
+        super(pdConfig);
+    }
+
+    public void addItem(Metapb.QueueItem queueItem) throws PDException {
+        HgAssert.isArgumentNotNull(queueItem, "queueItem");
+        byte[] key = MetadataKeyHelper.getQueueItemKey(queueItem.getItemId());
+        put(key, queueItem.toByteString().toByteArray());
+    }
+
+    public void removeItem(String itemId) throws PDException {
+        if (RaftEngine.getInstance().isLeader()) {
+            remove(MetadataKeyHelper.getQueueItemKey(itemId));
+        } else {
+            var store = getStore();
+            // TODO: delete the record via a client call instead of poking the raft store directly
+            if (store instanceof RaftKVStore) {
+                ((RaftKVStore) store).doRemove(MetadataKeyHelper.getQueueItemKey(itemId));
+            }
+        }
+    }
+
+    public List<Metapb.QueueItem> getQueue() throws PDException {
+        byte[] prefix = MetadataKeyHelper.getQueueItemPrefix();
+        return scanPrefix(Metapb.QueueItem.parser(), prefix);
+    }
+}
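A sketch of the intended queue round trip; the Metapb.QueueItem builder call is assumed from the getItemId() accessor used above. Note the asymmetry in removeItem(): the leader deletes through raft, while a follower falls back to the local doRemove():

    void drainQueue(QueueStore queue) throws PDException {
        Metapb.QueueItem item = Metapb.QueueItem.newBuilder()
                                                .setItemId("task-001") // illustrative id
                                                .build();
        queue.addItem(item);                            // lands under QUEUE/{itemId}
        for (Metapb.QueueItem it : queue.getQueue()) {  // prefix scan of QUEUE/
            queue.removeItem(it.getItemId());
        }
    }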
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/StoreInfoMeta.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/StoreInfoMeta.java
new file mode 100644
index 0000000000..2a50b0448c
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/StoreInfoMeta.java
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Metapb;
+
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Store metadata storage
+ */
+@Slf4j
+public class StoreInfoMeta extends MetadataRocksDBStore {
+
+    private final PDConfig pdConfig;
+
+    public StoreInfoMeta(PDConfig pdConfig) {
+        super(pdConfig);
+        this.pdConfig = pdConfig;
+        // this.timeout = pdConfig.getDiscovery().getHeartbeatOutTimes();
+    }
+
+    public static boolean shardGroupEquals(List<Metapb.Shard> g1, List<Metapb.Shard> g2) {
+        ListIterator<Metapb.Shard> e1 = g1.listIterator();
+        ListIterator<Metapb.Shard> e2 = g2.listIterator();
+        while (e1.hasNext() && e2.hasNext()) {
+            Metapb.Shard o1 = e1.next();
+            Metapb.Shard o2 = e2.next();
+            if (!(o1 == null ? o2 == null : o1.getStoreId() == o2.getStoreId())) {
+                return false;
+            }
+        }
+        return !(e1.hasNext() || e2.hasNext());
+    }
+
+    /**
+     * Update the store info
+     *
+     * @param store store to save
+     * @throws PDException when the write fails
+     */
+    public void updateStore(Metapb.Store store) throws PDException {
+        byte[] storeInfoKey = MetadataKeyHelper.getStoreInfoKey(store.getId());
+        put(storeInfoKey, store.toByteArray());
+    }
+
+    /**
+     * Refresh the keep-alive record of a store
+     *
+     * @param store store to keep alive
+     */
+    public void keepStoreAlive(Metapb.Store store) throws PDException {
+        byte[] activeStoreKey = MetadataKeyHelper.getActiveStoreKey(store.getId());
+        putWithTTL(activeStoreKey, store.toByteArray(), pdConfig.getStore().getKeepAliveTimeout());
+    }
+
+    public void removeActiveStore(Metapb.Store store) throws PDException {
+        byte[] activeStoreKey = MetadataKeyHelper.getActiveStoreKey(store.getId());
+        removeWithTTL(activeStoreKey);
+    }
+
+    public Metapb.Store getStore(Long storeId) throws PDException {
+        byte[] storeInfoKey = MetadataKeyHelper.getStoreInfoKey(storeId);
+        return getOne(Metapb.Store.parser(), storeInfoKey);
+    }
+
+    /**
+     * Get all stores (the graphName parameter is currently unused)
+     *
+     * @param graphName graph name
+     * @return all registered stores
+     * @throws PDException when the scan fails
+     */
+    public List<Metapb.Store> getStores(String graphName) throws PDException {
+        byte[] storePrefix = MetadataKeyHelper.getStorePrefix();
+        return scanPrefix(Metapb.Store.parser(), storePrefix);
+    }
+
+    /**
+     * Get the active stores (the graphName parameter is currently unused)
+     *
+     * @param graphName graph name
+     * @return stores whose keep-alive records have not expired
+     * @throws PDException when the scan fails
+     */
+    public List<Metapb.Store> getActiveStores(String graphName) throws PDException {
+        byte[] activePrefix = MetadataKeyHelper.getActiveStorePrefix();
+        return getInstanceListWithTTL(Metapb.Store.parser(), activePrefix);
+    }
+
+    public List<Metapb.Store> getActiveStores() throws PDException {
+        byte[] activePrefix = MetadataKeyHelper.getActiveStorePrefix();
+        return getInstanceListWithTTL(Metapb.Store.parser(), activePrefix);
+    }
+
+    /**
+     * Check whether the store id exists
+     *
+     * @param storeId store id
+     * @return true if present
+     */
+    public boolean storeExists(Long storeId) throws PDException {
+        byte[] storeInfoKey = MetadataKeyHelper.getStoreInfoKey(storeId);
+        return containsKey(storeInfoKey);
+    }
+
+    /**
+     * Update the runtime statistics of a store
+     *
+     * @param storeStats statistics to save
+     */
+    public Metapb.StoreStats updateStoreStats(Metapb.StoreStats storeStats) throws PDException {
+        byte[] storeStatusKey = MetadataKeyHelper.getStoreStatusKey(storeStats.getStoreId());
+
+        put(storeStatusKey, storeStats.toByteArray());
+        return storeStats;
+    }
+
+    public long removeStore(long storeId) throws PDException {
+        byte[] storeInfoKey = MetadataKeyHelper.getStoreInfoKey(storeId);
+        return remove(storeInfoKey);
+    }
+
+    public long removeAll() throws PDException {
+        byte[] storePrefix = MetadataKeyHelper.getStorePrefix();
+        return this.removeByPrefix(storePrefix);
+    }
+
+    public void updateShardGroup(Metapb.ShardGroup group) throws PDException {
+        byte[] shardGroupKey = MetadataKeyHelper.getShardGroupKey(group.getId());
+        put(shardGroupKey, group.toByteArray());
+    }
+
+    public void deleteShardGroup(int groupId) throws PDException {
+        byte[] shardGroupKey = MetadataKeyHelper.getShardGroupKey(groupId);
+        remove(shardGroupKey);
+    }
+
+    public Metapb.ShardGroup getShardGroup(int groupId) throws PDException {
+        byte[] shardGroupKey = MetadataKeyHelper.getShardGroupKey(groupId);
+        return getOne(Metapb.ShardGroup.parser(), shardGroupKey);
+    }
+
+    public int getShardGroupCount() throws PDException {
+        byte[] shardGroupPrefix = MetadataKeyHelper.getShardGroupPrefix();
+        return scanPrefix(Metapb.ShardGroup.parser(), shardGroupPrefix).size();
+    }
+
+    public List<Metapb.ShardGroup> getShardGroups() throws PDException {
+        byte[] shardGroupPrefix = MetadataKeyHelper.getShardGroupPrefix();
+        return scanPrefix(Metapb.ShardGroup.parser(), shardGroupPrefix);
+    }
+
+    public Metapb.StoreStats getStoreStats(long storeId) throws PDException {
+        byte[] storeStatusKey = MetadataKeyHelper.getStoreStatusKey(storeId);
+        return getOne(Metapb.StoreStats.parser(), storeStatusKey);
+    }
+
+    /**
+     * @return stores together with their status info
+     * @throws PDException when the read fails
+     */
+    public List<Metapb.Store> getStoreStatus(boolean isActive) throws PDException {
+        byte[] storePrefix = MetadataKeyHelper.getStorePrefix();
+        List<Metapb.Store> stores = isActive ? getActiveStores() :
+                                    scanPrefix(Metapb.Store.parser(), storePrefix);
+        LinkedList<Metapb.Store> list = new LinkedList<>();
+        for (int i = 0; i < stores.size(); i++) {
+            Metapb.Store store = stores.get(i);
+            Metapb.StoreStats stats = getStoreStats(store.getId());
+            if (stats != null) {
+                // reuse the stats fetched above instead of reading them a second time
+                store = Metapb.Store.newBuilder(store).setStats(stats).build();
+            }
+            list.add(store);
+        }
+        return list;
+    }
+}
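StoreInfoMeta splits liveness from registration: STORE/{id} is durable, while ACTIVESTORE/{id} is written with a TTL and must be refreshed by heartbeats, so stores that stop reporting simply age out of getActiveStores(). A sketch of that contract; only methods defined in the class above are used:

    void onStoreHeartbeat(StoreInfoMeta meta, Metapb.Store store) throws PDException {
        meta.updateStore(store);    // durable registration under STORE/{id}
        meta.keepStoreAlive(store); // TTL record under ACTIVESTORE/{id}
    }

    List<Metapb.Store> liveStores(StoreInfoMeta meta) throws PDException {
        return meta.getActiveStores(); // expired keep-alive entries have dropped out
    }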
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/TaskInfoMeta.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/TaskInfoMeta.java
new file mode 100644
index 0000000000..756be71e98
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/meta/TaskInfoMeta.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.meta;
+
+import java.util.List;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.MetaTask;
+import org.apache.hugegraph.pd.grpc.Metapb;
+import org.apache.hugegraph.pd.grpc.pulse.MovePartition;
+import org.apache.hugegraph.pd.grpc.pulse.SplitPartition;
+
+/**
+ * Task management
+ */
+public class TaskInfoMeta extends MetadataRocksDBStore {
+
+    public TaskInfoMeta(PDConfig pdConfig) {
+        super(pdConfig);
+    }
+
+    /**
+     * Add a partition split task
+     */
+    public void addSplitTask(int groupID, Metapb.Partition partition, SplitPartition splitPartition)
+            throws PDException {
+        byte[] key = MetadataKeyHelper.getSplitTaskKey(partition.getGraphName(), groupID);
+        MetaTask.Task task = MetaTask.Task.newBuilder()
+                                          .setType(MetaTask.TaskType.Split_Partition)
+                                          .setState(MetaTask.TaskState.Task_Doing)
+                                          .setStartTimestamp(System.currentTimeMillis())
+                                          .setPartition(partition)
+                                          .setSplitPartition(splitPartition)
+                                          .build();
+        put(key, task.toByteString().toByteArray());
+    }
+
+    public void updateSplitTask(MetaTask.Task task) throws PDException {
+        var partition = task.getPartition();
+        byte[] key = MetadataKeyHelper.getSplitTaskKey(partition.getGraphName(), partition.getId());
+        put(key, task.toByteString().toByteArray());
+    }
+
+    public MetaTask.Task getSplitTask(String graphName, int groupID) throws PDException {
+        byte[] key = MetadataKeyHelper.getSplitTaskKey(graphName, groupID);
+        return getOne(MetaTask.Task.parser(), key);
+    }
+
+    public List<MetaTask.Task> scanSplitTask(String graphName) throws PDException {
+        byte[] prefix = MetadataKeyHelper.getSplitTaskPrefix(graphName);
+        return scanPrefix(MetaTask.Task.parser(), prefix);
+    }
+
+    public void removeSplitTaskPrefix(String graphName) throws PDException {
+        byte[] key = MetadataKeyHelper.getSplitTaskPrefix(graphName);
+        removeByPrefix(key);
+    }
+
+    public boolean hasSplitTaskDoing() throws PDException {
+        byte[] key = MetadataKeyHelper.getAllSplitTaskPrefix();
+        return !scanPrefix(key).isEmpty();
+    }
+
+    public void addMovePartitionTask(Metapb.Partition partition, MovePartition movePartition)
+            throws PDException {
+        byte[] key = MetadataKeyHelper.getMoveTaskKey(partition.getGraphName(),
+                                                      movePartition.getTargetPartition().getId(),
+                                                      partition.getId());
+
+        MetaTask.Task task = MetaTask.Task.newBuilder()
+                                          .setType(MetaTask.TaskType.Move_Partition)
+                                          .setState(MetaTask.TaskState.Task_Doing)
+                                          .setStartTimestamp(System.currentTimeMillis())
+                                          .setPartition(partition)
+                                          .setMovePartition(movePartition)
+                                          .build();
+        put(key, task.toByteArray());
+    }
+
+    public void updateMovePartitionTask(MetaTask.Task task)
+            throws PDException {
+
+        byte[] key = MetadataKeyHelper.getMoveTaskKey(task.getPartition().getGraphName(),
+                                                      task.getMovePartition().getTargetPartition()
+                                                          .getId(),
+                                                      task.getPartition().getId());
+        put(key, task.toByteArray());
+    }
+
+    public MetaTask.Task getMovePartitionTask(String graphName, int targetId, int partId) throws
+                                                                                           PDException {
+        byte[] key = MetadataKeyHelper.getMoveTaskKey(graphName, targetId, partId);
+        return getOne(MetaTask.Task.parser(), key);
+    }
+
+    public List<MetaTask.Task> scanMoveTask(String graphName) throws PDException {
+        byte[] prefix = MetadataKeyHelper.getMoveTaskPrefix(graphName);
+        return scanPrefix(MetaTask.Task.parser(), prefix);
+    }
+
+    /**
+     * Remove move tasks by prefix, i.e. all tasks belonging to one round of grouping
+     *
+     * @param graphName graph name
+     * @throws PDException io error
+     */
+    public void removeMoveTaskPrefix(String graphName) throws PDException {
+        byte[] key = MetadataKeyHelper.getMoveTaskPrefix(graphName);
+        removeByPrefix(key);
+    }
+
+    public boolean hasMoveTaskDoing() throws PDException {
+        byte[] key = MetadataKeyHelper.getAllMoveTaskPrefix();
+        return !scanPrefix(key).isEmpty();
+    }
+
+}
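Because split tasks are keyed TASK_SPLIT/{graph}/{group} and move tasks TASK_MOVE/{graph}/{target}/{source}, one prefix scan is enough to tell whether any rebalancing is still in flight, which is how the two has*TaskDoing() checks above work. A small admission-check sketch using only methods from the class above:

    // gate new rebalancing work on the absence of in-flight tasks
    boolean rebalanceIdle(TaskInfoMeta tasks) throws PDException {
        return !tasks.hasSplitTaskDoing() && !tasks.hasMoveTaskDoing();
    }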
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/FutureClosureAdapter.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/FutureClosureAdapter.java
new file mode 100644
index 0000000000..d90c50c6c9
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/FutureClosureAdapter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+import java.util.concurrent.CompletableFuture;
+
+import com.alipay.sofa.jraft.Closure;
+import com.alipay.sofa.jraft.Status;
+
+public class FutureClosureAdapter<T> implements Closure {
+
+    public final CompletableFuture<T> future = new CompletableFuture<>();
+    private T resp;
+
+    public T getResponse() {
+        return this.resp;
+    }
+
+    public void setResponse(T resp) {
+        this.resp = resp;
+        future.complete(resp);
+        run(Status.OK());
+    }
+
+    public void failure(Throwable t) {
+        future.completeExceptionally(t);
+        run(new Status(-1, t.getMessage()));
+    }
+
+    @Override
+    public void run(Status status) {
+
+    }
+}
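The adapter bridges jraft's callback style to a CompletableFuture, which is exactly how RaftRpcClient consumes it below. A usage sketch; the producer side and the timeout value are illustrative:

    import java.util.concurrent.ExecutionException;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    String awaitResponse(FutureClosureAdapter<String> closure)
            throws InterruptedException, TimeoutException {
        // some async caller eventually invokes closure.setResponse(...) or closure.failure(...)
        try {
            return closure.future.get(3, TimeUnit.SECONDS);
        } catch (ExecutionException e) {
            // failure(t) surfaces here through completeExceptionally(t)
            return null;
        }
    }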
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/KVOperation.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/KVOperation.java
new file mode 100644
index 0000000000..9169a248dc
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/KVOperation.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.concurrent.TimeUnit;
+
+import com.alipay.sofa.jraft.util.BytesUtil;
+import com.alipay.sofa.jraft.util.Requires;
+import com.caucho.hessian.io.Hessian2Input;
+import com.caucho.hessian.io.Hessian2Output;
+
+import lombok.Data;
+
+@Data
+public class KVOperation {
+
+    /**
+     * Put operation
+     */
+    public static final byte PUT = 0x01;
+    /**
+     * Get operation
+     */
+    public static final byte GET = 0x02;
+    public static final byte DEL = 0x03;
+    public static final byte REMOVE_BY_PREFIX = 0x04;
+    public static final byte REMOVE = 0x05;
+    public static final byte PUT_WITH_TTL = 0x06;
+    public static final byte CLEAR = 0x07;
+    public static final byte PUT_WITH_TTL_UNIT = 0x08;
+    public static final byte REMOVE_WITH_TTL = 0x09;
+    /**
+     * Snapshot operation
+     */
+    public static final byte SAVE_SNAPSHOT = 0x10;
+    public static final byte LOAD_SNAPSHOT = 0x11;
+
+    private byte[] key;
+    private byte[] value;
+    private Object attach; // the original object, kept for local processing to save one deserialization
+    private Object arg;
+    private byte op;
+
+    public KVOperation() {
+
+    }
+
+    public KVOperation(byte[] key, byte[] value, Object attach, byte op) {
+        this.key = key;
+        this.value = value;
+        this.attach = attach;
+        this.op = op;
+    }
+
+    public KVOperation(byte[] key, byte[] value, Object attach, byte op, Object arg) {
+        this.key = key;
+        this.value = value;
+        this.attach = attach;
+        this.op = op;
+        this.arg = arg;
+    }
+
+    public static KVOperation fromByteArray(byte[] value) throws IOException {
+
+        try (ByteArrayInputStream bis = new ByteArrayInputStream(value, 1, value.length - 1)) {
+            Hessian2Input input = new Hessian2Input(bis);
+            KVOperation op = new KVOperation();
+            op.op = value[0];
+            op.key = input.readBytes();
+            op.value = input.readBytes();
+            op.arg = input.readObject();
+            input.close();
+            return op;
+        }
+    }
+
+    public static KVOperation createPut(final byte[] key, final byte[] value) {
+        Requires.requireNonNull(key, "key");
+        Requires.requireNonNull(value, "value");
+        return new KVOperation(key, value, null, PUT);
+    }
+
+    public static KVOperation createGet(final byte[] key) {
+        Requires.requireNonNull(key, "key");
+        return new KVOperation(key, BytesUtil.EMPTY_BYTES, null, GET);
+    }
+
+    public static KVOperation createPutWithTTL(byte[] key, byte[] value, long ttl) {
+        Requires.requireNonNull(key, "key");
+        Requires.requireNonNull(value, "value");
+        return new KVOperation(key, value, value, PUT_WITH_TTL,
+                               ttl);
+    }
+
+    public static KVOperation createPutWithTTL(byte[] key, byte[] value, long ttl,
+                                               TimeUnit timeUnit) {
+        Requires.requireNonNull(key, "key");
+        Requires.requireNonNull(value, "value");
+        return new KVOperation(key, value, value, PUT_WITH_TTL_UNIT,
+                               new Object[]{ttl, timeUnit});
+    }
+
+    public static KVOperation createRemoveWithTTL(byte[] key) {
+        Requires.requireNonNull(key, "key");
+        return new KVOperation(key, key, null, REMOVE_WITH_TTL);
+    }
+
+    public static KVOperation createRemoveByPrefix(byte[] key) {
+        Requires.requireNonNull(key,
"key"); + return new KVOperation(key, key, null, REMOVE_BY_PREFIX); + } + + public static KVOperation createRemove(byte[] key) { + Requires.requireNonNull(key, "key"); + return new KVOperation(key, key, null, REMOVE); + } + + public static KVOperation createClear() { + return new KVOperation(null, null, null, CLEAR); + } + + public static KVOperation createSaveSnapshot(String snapshotPath) { + Requires.requireNonNull(snapshotPath, "snapshotPath"); + return new KVOperation(null, null, snapshotPath, SAVE_SNAPSHOT); + } + + public static KVOperation createLoadSnapshot(String snapshotPath) { + Requires.requireNonNull(snapshotPath, "snapshotPath"); + return new KVOperation(null, null, snapshotPath, LOAD_SNAPSHOT); + } + + public byte[] toByteArray() throws IOException { + try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) { + bos.write(op); + Hessian2Output output = new Hessian2Output(bos); + output.writeObject(key); + output.writeObject(value); + output.writeObject(arg); + output.flush(); + return bos.toByteArray(); + } + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/KVStoreClosure.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/KVStoreClosure.java new file mode 100644 index 0000000000..cb5291703a --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/KVStoreClosure.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd.raft; + +import org.apache.hugegraph.pd.grpc.Pdpb; + +import com.alipay.sofa.jraft.Closure; + +public interface KVStoreClosure extends Closure { + + Pdpb.Error getError(); + + void setError(final Pdpb.Error error); + + Object getData(); + + void setData(final Object data); +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftEngine.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftEngine.java new file mode 100644 index 0000000000..9ed62b0e61 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftEngine.java @@ -0,0 +1,378 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd.raft; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.atomic.AtomicReference; + +import org.apache.hugegraph.pd.common.PDException; +import org.apache.hugegraph.pd.config.PDConfig; +import org.apache.hugegraph.pd.grpc.Metapb; +import org.apache.hugegraph.pd.grpc.Pdpb; + +import com.alipay.sofa.jraft.JRaftUtils; +import com.alipay.sofa.jraft.Node; +import com.alipay.sofa.jraft.RaftGroupService; +import com.alipay.sofa.jraft.ReplicatorGroup; +import com.alipay.sofa.jraft.Status; +import com.alipay.sofa.jraft.conf.Configuration; +import com.alipay.sofa.jraft.core.Replicator; +import com.alipay.sofa.jraft.entity.PeerId; +import com.alipay.sofa.jraft.entity.Task; +import com.alipay.sofa.jraft.error.RaftError; +import com.alipay.sofa.jraft.option.NodeOptions; +import com.alipay.sofa.jraft.option.RaftOptions; +import com.alipay.sofa.jraft.option.RpcOptions; +import com.alipay.sofa.jraft.rpc.RaftRpcServerFactory; +import com.alipay.sofa.jraft.rpc.RpcServer; +import com.alipay.sofa.jraft.util.Endpoint; +import com.alipay.sofa.jraft.util.ThreadId; +import com.alipay.sofa.jraft.util.internal.ThrowUtil; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class RaftEngine { + + private static final RaftEngine INSTANCE = new RaftEngine(); + private final RaftStateMachine stateMachine; + private PDConfig.Raft config; + private RaftGroupService raftGroupService; + private RpcServer rpcServer; + private Node raftNode; + private RaftRpcClient raftRpcClient; + + public RaftEngine() { + this.stateMachine = new RaftStateMachine(); + } + + public static RaftEngine getInstance() { + return INSTANCE; + } + + public boolean init(PDConfig.Raft config) { + if (this.raftNode != null) { + return false; + } + this.config = config; + + raftRpcClient = new RaftRpcClient(); + raftRpcClient.init(new RpcOptions()); + + String groupId = "pd_raft"; + String raftPath = config.getDataPath() + "/" + groupId; + new File(raftPath).mkdirs(); + + new File(config.getDataPath()).mkdirs(); + Configuration initConf = new Configuration(); + initConf.parse(config.getPeersList()); + if (config.isEnable() && config.getPeersList().length() < 3) { + log.error("The RaftEngine parameter is incorrect." 
+ + " When RAFT is enabled, the number of peers " + "cannot be less than 3"); + } + // 设置 Node 参数,包括日志存储路径和状态机实例 + NodeOptions nodeOptions = new NodeOptions(); + nodeOptions.setFsm(stateMachine); + nodeOptions.setEnableMetrics(true); + // 日志路径 + nodeOptions.setLogUri(raftPath + "/log"); + // raft 元数据路径 + nodeOptions.setRaftMetaUri(raftPath + "/meta"); + // 快照路径 + nodeOptions.setSnapshotUri(raftPath + "/snapshot"); + // 初始集群 + nodeOptions.setInitialConf(initConf); + // 快照时间间隔 + nodeOptions.setSnapshotIntervalSecs(config.getSnapshotInterval()); + + nodeOptions.setRpcConnectTimeoutMs(config.getRpcTimeout()); + nodeOptions.setRpcDefaultTimeout(config.getRpcTimeout()); + nodeOptions.setRpcInstallSnapshotTimeout(config.getRpcTimeout()); + // 设置 raft 配置 + RaftOptions raftOptions = nodeOptions.getRaftOptions(); + + nodeOptions.setEnableMetrics(true); + + final PeerId serverId = JRaftUtils.getPeerId(config.getAddress()); + + rpcServer = createRaftRpcServer(config.getAddress()); + // 构建 raft 组并启动 raft + this.raftGroupService = + new RaftGroupService(groupId, serverId, nodeOptions, rpcServer, true); + this.raftNode = raftGroupService.start(false); + log.info("RaftEngine start successfully: id = {}, peers list = {}", groupId, + nodeOptions.getInitialConf().getPeers()); + return this.raftNode != null; + } + + /** + * 创建 raft rpc server,用于 pd 之间通讯 + */ + private RpcServer createRaftRpcServer(String raftAddr) { + Endpoint endpoint = JRaftUtils.getEndPoint(raftAddr); + RpcServer rpcServer = RaftRpcServerFactory.createRaftRpcServer(endpoint); + RaftRpcProcessor.registerProcessor(rpcServer, this); + rpcServer.init(null); + return rpcServer; + } + + public void shutDown() { + if (this.raftGroupService != null) { + this.raftGroupService.shutdown(); + try { + this.raftGroupService.join(); + } catch (final InterruptedException e) { + this.raftNode = null; + ThrowUtil.throwException(e); + } + this.raftGroupService = null; + } + if (this.rpcServer != null) { + this.rpcServer.shutdown(); + this.rpcServer = null; + } + if (this.raftNode != null) { + this.raftNode.shutdown(); + } + this.raftNode = null; + } + + public boolean isLeader() { + return this.raftNode.isLeader(true); + } + + /** + * 添加 Raft 任务,grpc 通过该接口给 raft 发送数据 + */ + public void addTask(Task task) { + if (!isLeader()) { + KVStoreClosure closure = (KVStoreClosure) task.getDone(); + closure.setError(Pdpb.Error.newBuilder().setType(Pdpb.ErrorType.NOT_LEADER).build()); + closure.run(new Status(RaftError.EPERM, "Not leader")); + return; + } + this.raftNode.apply(task); + } + + public void addStateListener(RaftStateListener listener) { + this.stateMachine.addStateListener(listener); + } + + public void addTaskHandler(RaftTaskHandler handler) { + this.stateMachine.addTaskHandler(handler); + } + + public PDConfig.Raft getConfig() { + return this.config; + } + + public PeerId getLeader() { + return raftNode.getLeaderId(); + } + + /** + * 向 leader 发消息,获取 grpc 地址; + */ + public String getLeaderGrpcAddress() throws ExecutionException, InterruptedException { + if (isLeader()) { + return config.getGrpcAddress(); + } + + if (raftNode.getLeaderId() == null) { + waitingForLeader(10000); + } + + return raftRpcClient.getGrpcAddress(raftNode.getLeaderId().getEndpoint().toString()).get() + .getGrpcAddress(); + } + + public Metapb.Member getLocalMember() { + Metapb.Member.Builder builder = Metapb.Member.newBuilder(); + builder.setClusterId(config.getClusterId()); + builder.setRaftUrl(config.getAddress()); + builder.setDataPath(config.getDataPath()); + 
builder.setGrpcUrl(config.getGrpcAddress()); + builder.setState(Metapb.StoreState.Up); + return builder.build(); + } + + public List getMembers() { + List members = new ArrayList<>(); + + List peers = raftNode.listPeers(); + peers.addAll(raftNode.listLearners()); + var learners = new HashSet<>(raftNode.listLearners()); + + for (PeerId peerId : peers) { + Metapb.Member.Builder builder = Metapb.Member.newBuilder(); + builder.setClusterId(config.getClusterId()); + CompletableFuture future = + raftRpcClient.getGrpcAddress(peerId.getEndpoint().toString()); + + Metapb.ShardRole role = Metapb.ShardRole.Follower; + if (peerEquals(peerId, raftNode.getLeaderId())) { + role = Metapb.ShardRole.Leader; + } else if (learners.contains(peerId)) { + role = Metapb.ShardRole.Learner; + var state = getReplicatorState(peerId); + if (state != null) { + builder.setReplicatorState(state.name()); + } + } + + builder.setRole(role); + + try { + if (future.isCompletedExceptionally()) { + log.error("failed to getGrpcAddress of {}", peerId.getEndpoint().toString()); + builder.setState(Metapb.StoreState.Offline); + builder.setRaftUrl(peerId.getEndpoint().toString()); + members.add(builder.build()); + } else { + RaftRpcProcessor.GetMemberResponse response = future.get(); + builder.setState(Metapb.StoreState.Up); + builder.setRaftUrl(response.getRaftAddress()); + builder.setDataPath(response.getDatePath()); + builder.setGrpcUrl(response.getGrpcAddress()); + builder.setRestUrl(response.getRestAddress()); + members.add(builder.build()); + } + } catch (Exception e) { + log.error("failed to getGrpcAddress of {}.", peerId.getEndpoint().toString(), e); + builder.setState(Metapb.StoreState.Offline); + builder.setRaftUrl(peerId.getEndpoint().toString()); + members.add(builder.build()); + } + + } + return members; + } + + public Status changePeerList(String peerList) { + AtomicReference result = new AtomicReference<>(); + try { + String[] peers = peerList.split(",", -1); + if ((peers.length & 1) != 1) { + throw new PDException(-1, "the number of peer list must be odd."); + } + Configuration newPeers = new Configuration(); + newPeers.parse(peerList); + CountDownLatch latch = new CountDownLatch(1); + this.raftNode.changePeers(newPeers, status -> { + result.set(status); + latch.countDown(); + }); + latch.await(); + } catch (Exception e) { + log.error("failed to changePeerList to {}", peerList, e); + result.set(new Status(-1, e.getMessage())); + } + return result.get(); + } + + public PeerId waitingForLeader(long timeOut) { + PeerId leader = getLeader(); + if (leader != null) { + return leader; + } + + synchronized (this) { + leader = getLeader(); + long start = System.currentTimeMillis(); + while ((System.currentTimeMillis() - start < timeOut) && (leader == null)) { + try { + this.wait(1000); + } catch (InterruptedException e) { + log.error("Raft wait for leader exception", e); + } + leader = getLeader(); + } + return leader; + } + + } + + public Node getRaftNode() { + return raftNode; + } + + private boolean peerEquals(PeerId p1, PeerId p2) { + if (p1 == null && p2 == null) { + return true; + } + if (p1 == null || p2 == null) { + return false; + } + return Objects.equals(p1.getIp(), p2.getIp()) && Objects.equals(p1.getPort(), p2.getPort()); + } + + private Replicator.State getReplicatorState(PeerId peerId) { + var replicateGroup = getReplicatorGroup(); + if (replicateGroup == null) { + return null; + } + + ThreadId threadId = replicateGroup.getReplicator(peerId); + if (threadId == null) { + return null; + } else { + Replicator r = 
+    private Replicator.State getReplicatorState(PeerId peerId) {
+        var replicateGroup = getReplicatorGroup();
+        if (replicateGroup == null) {
+            return null;
+        }
+
+        ThreadId threadId = replicateGroup.getReplicator(peerId);
+        if (threadId == null) {
+            return null;
+        } else {
+            Replicator r = (Replicator) threadId.lock();
+            if (r == null) {
+                return Replicator.State.Probe;
+            }
+            Replicator.State result = getState(r);
+            threadId.unlock();
+            return result;
+        }
+    }
+
+    private ReplicatorGroup getReplicatorGroup() {
+        var clz = this.raftNode.getClass();
+        try {
+            var f = clz.getDeclaredField("replicatorGroup");
+            f.setAccessible(true);
+            var group = (ReplicatorGroup) f.get(this.raftNode);
+            f.setAccessible(false);
+            return group;
+        } catch (NoSuchFieldException | IllegalAccessException e) {
+            log.info("getReplicatorGroup: error {}", e.getMessage());
+            return null;
+        }
+    }
+
+    private Replicator.State getState(Replicator r) {
+        var clz = r.getClass();
+        try {
+            var f = clz.getDeclaredField("state");
+            f.setAccessible(true);
+            // The "state" field lives on the replicator instance itself
+            var state = (Replicator.State) f.get(r);
+            f.setAccessible(false);
+            return state;
+        } catch (NoSuchFieldException | IllegalAccessException e) {
+            log.info("getState: error {}", e.getMessage());
+            return null;
+        }
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftRpcClient.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftRpcClient.java
new file mode 100644
index 0000000000..6e47ce4e59
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftRpcClient.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+import java.util.concurrent.CompletableFuture;
+
+import com.alipay.sofa.jraft.JRaftUtils;
+import com.alipay.sofa.jraft.Status;
+import com.alipay.sofa.jraft.option.RpcOptions;
+import com.alipay.sofa.jraft.rpc.InvokeCallback;
+import com.alipay.sofa.jraft.rpc.InvokeContext;
+import com.alipay.sofa.jraft.rpc.RaftRpcFactory;
+import com.alipay.sofa.jraft.rpc.RpcClient;
+import com.alipay.sofa.jraft.util.Endpoint;
+import com.alipay.sofa.jraft.util.RpcFactoryHelper;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class RaftRpcClient {
+
+    protected volatile RpcClient rpcClient;
+    private RpcOptions rpcOptions;
+
+    public synchronized boolean init(final RpcOptions rpcOptions) {
+        this.rpcOptions = rpcOptions;
+        final RaftRpcFactory factory = RpcFactoryHelper.rpcFactory();
+        this.rpcClient =
+                factory.createRpcClient(factory.defaultJRaftClientConfigHelper(this.rpcOptions));
+        return this.rpcClient.init(null);
+    }
+
+    /**
+     * Request the member info (gRPC address etc.) of the peer at the given raft address
+     */
+    public CompletableFuture<RaftRpcProcessor.GetMemberResponse> getGrpcAddress(
+            final String address) {
+        RaftRpcProcessor.GetMemberRequest request = new RaftRpcProcessor.GetMemberRequest();
+        FutureClosureAdapter<RaftRpcProcessor.GetMemberResponse> response =
+                new FutureClosureAdapter<>();
+        internalCallAsyncWithRpc(JRaftUtils.getEndPoint(address), request, response);
+        return response.future;
+    }
+
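+    /**
+     * Bridge jraft's callback-style invokeAsync to the FutureClosureAdapter: a successful
+     * reply completes the adapter's future with the typed response, while an error from the
+     * callback (or a synchronous throw) fails the future through the closure.
+     */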
+    private <V extends RaftRpcProcessor.BaseResponse> void internalCallAsyncWithRpc(
+            final Endpoint endpoint, final RaftRpcProcessor.BaseRequest request,
+            final FutureClosureAdapter<V> closure) {
+        final InvokeContext invokeCtx = new InvokeContext();
+        final InvokeCallback invokeCallback = new InvokeCallback() {
+
+            @Override
+            public void complete(final Object result, final Throwable err) {
+                if (err == null) {
+                    final RaftRpcProcessor.BaseResponse response =
+                            (RaftRpcProcessor.BaseResponse) result;
+                    closure.setResponse((V) response);
+                } else {
+                    closure.failure(err);
+                    closure.run(new Status(-1, err.getMessage()));
+                }
+            }
+        };
+
+        try {
+            this.rpcClient.invokeAsync(endpoint, request, invokeCtx, invokeCallback,
+                                       this.rpcOptions.getRpcDefaultTimeout());
+        } catch (final Throwable t) {
+            log.error("failed to call rpc to {}. {}", endpoint, t.getMessage());
+            closure.failure(t);
+            closure.run(new Status(-1, t.getMessage()));
+        }
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftRpcProcessor.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftRpcProcessor.java
new file mode 100644
index 0000000000..ed950a4ee1
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftRpcProcessor.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+import java.io.Serializable;
+
+import com.alipay.sofa.jraft.rpc.RpcContext;
+import com.alipay.sofa.jraft.rpc.RpcProcessor;
+import com.alipay.sofa.jraft.rpc.RpcServer;
+
+import lombok.Data;
+
+public class RaftRpcProcessor<T extends RaftRpcProcessor.BaseRequest>
+        implements RpcProcessor<T> {
+
+    private final Class<T> requestClass;
+    private final RaftEngine raftEngine;
+
+    public RaftRpcProcessor(Class<T> requestClass, RaftEngine raftEngine) {
+        this.requestClass = requestClass;
+        this.raftEngine = raftEngine;
+    }
+
+    public static void registerProcessor(final RpcServer rpcServer, RaftEngine raftEngine) {
+        rpcServer.registerProcessor(new RaftRpcProcessor<>(GetMemberRequest.class, raftEngine));
+    }
+
+    @Override
+    public void handleRequest(RpcContext rpcCtx, T request) {
+        if (request.magic() == BaseRequest.GET_GRPC_ADDRESS) {
+            rpcCtx.sendResponse(getGrpcAddress());
+        }
+    }
+
+    @Override
+    public String interest() {
+        return this.requestClass.getName();
+    }
+
+    private GetMemberResponse getGrpcAddress() {
+        GetMemberResponse rep = new GetMemberResponse();
+        rep.setGrpcAddress(raftEngine.getConfig().getGrpcAddress());
+        rep.setClusterId(raftEngine.getConfig().getClusterId());
+        rep.setDataPath(raftEngine.getConfig().getDataPath());
+        rep.setRaftAddress(raftEngine.getConfig().getAddress());
+        rep.setRestAddress(
+                raftEngine.getConfig().getHost() + ":" + raftEngine.getConfig().getPort());
+        rep.setStatus(Status.OK);
+        return rep;
+    }
+
+    public enum Status implements Serializable {
+        UNKNOWN(-1, "unknown"),
+        OK(0, "ok"),
+        COMPLETE(0, "Transmission completed"),
+        INCOMPLETE(1, "Incomplete transmission"),
+        NO_PARTITION(10, "Partition not found"),
+        IO_ERROR(11, "io error"),
+        EXCEPTION(12, "exception"),
+        ABORT(100, "Transmission aborted");
+
+        private final int code;
+        private String msg;
+
+        Status(int code, String msg) {
+            this.code = code;
+            this.msg = msg;
+        }
+
+        public int getCode() {
+            return this.code;
+        }
+
+        public Status setMsg(String msg) {
+            this.msg = msg;
+            return this;
+        }
+
+        public boolean isOK() {
+            return this.code == 0;
+        }
+    }
+
+    public abstract static class BaseRequest implements Serializable {
+
+        public static final byte GET_GRPC_ADDRESS = 0x01;
+
+        public abstract byte magic();
+    }
+
+    @Data
+    public abstract static class BaseResponse implements Serializable {
+
+        private Status status;
+    }
+
+    @Data
+    public static class GetMemberRequest extends BaseRequest {
+
+        @Override
+        public byte magic() {
+            return GET_GRPC_ADDRESS;
+        }
+    }
+
+    @Data
+    public static class GetMemberResponse extends BaseResponse {
+
+        private long clusterId;
+        private String raftAddress;
+        private String grpcAddress;
+        private String dataPath;
+        private String restAddress;
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftStateListener.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftStateListener.java
new file mode 100644
index 0000000000..56f39e3ad4
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftStateListener.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+public interface RaftStateListener {
+
+    void onRaftLeaderChanged();
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftStateMachine.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftStateMachine.java
new file mode 100644
index 0000000000..e747518668
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftStateMachine.java
@@ -0,0 +1,329 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.zip.Checksum;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.grpc.Pdpb;
+import org.springframework.util.CollectionUtils;
+
+import com.alipay.sofa.jraft.Closure;
+import com.alipay.sofa.jraft.Iterator;
+import com.alipay.sofa.jraft.Status;
+import com.alipay.sofa.jraft.conf.Configuration;
+import com.alipay.sofa.jraft.core.StateMachineAdapter;
+import com.alipay.sofa.jraft.entity.LeaderChangeContext;
+import com.alipay.sofa.jraft.entity.LocalFileMetaOutter;
+import com.alipay.sofa.jraft.error.RaftError;
+import com.alipay.sofa.jraft.error.RaftException;
+import com.alipay.sofa.jraft.storage.snapshot.SnapshotReader;
+import com.alipay.sofa.jraft.storage.snapshot.SnapshotWriter;
+import com.alipay.sofa.jraft.util.CRC64;
+import com.alipay.sofa.jraft.util.Utils;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class RaftStateMachine extends StateMachineAdapter {
+
+    private static final String SNAPSHOT_DIR_NAME = "snapshot";
+    private static final String SNAPSHOT_ARCHIVE_NAME = "snapshot.zip";
+    private final AtomicLong leaderTerm = new AtomicLong(-1);
+    private final List<RaftTaskHandler> taskHandlers;
+    private final List<RaftStateListener> stateListeners;
+
+    public RaftStateMachine() {
+        this.taskHandlers = new CopyOnWriteArrayList<>();
+        this.stateListeners = new CopyOnWriteArrayList<>();
+    }
+
+    public void addTaskHandler(RaftTaskHandler handler) {
+        taskHandlers.add(handler);
+    }
+
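+    /**
+     * Listeners registered here are notified asynchronously (see onLeaderStart and
+     * onStartFollowing below) whenever the raft leadership changes.
+     */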
+    public void addStateListener(RaftStateListener listener) {
+        stateListeners.add(listener);
+    }
+
+    public boolean isLeader() {
+        return this.leaderTerm.get() > 0;
+    }
+
+    @Override
+    public void onApply(Iterator iter) {
+        while (iter.hasNext()) {
+            final RaftClosureAdapter done = (RaftClosureAdapter) iter.done();
+            try {
+                KVOperation kvOp;
+                if (done != null) {
+                    kvOp = done.op;
+                } else {
+                    kvOp = KVOperation.fromByteArray(iter.getData().array());
+                }
+                for (RaftTaskHandler taskHandler : taskHandlers) {
+                    taskHandler.invoke(kvOp, done);
+                }
+                if (done != null) {
+                    done.run(Status.OK());
+                }
+            } catch (Throwable t) {
+                log.error("StateMachine meets a critical error:", t);
+                if (done != null) {
+                    done.run(new Status(RaftError.EINTERNAL, t.getMessage()));
+                }
+            }
+            iter.next();
+        }
+    }
+
+    @Override
+    public void onError(final RaftException e) {
+        log.error("Raft StateMachine on error", e);
+    }
+
+    @Override
+    public void onShutdown() {
+        super.onShutdown();
+    }
+
+    @Override
+    public void onLeaderStart(final long term) {
+        this.leaderTerm.set(term);
+        super.onLeaderStart(term);
+
+        log.info("Raft becomes leader");
+        Utils.runInThread(() -> {
+            if (!CollectionUtils.isEmpty(stateListeners)) {
+                stateListeners.forEach(listener -> {
+                    listener.onRaftLeaderChanged();
+                });
+            }
+        });
+    }
+
+    @Override
+    public void onLeaderStop(final Status status) {
+        this.leaderTerm.set(-1);
+        super.onLeaderStop(status);
+        log.info("Raft lost leadership");
+    }
+
+    @Override
+    public void onStartFollowing(final LeaderChangeContext ctx) {
+        super.onStartFollowing(ctx);
+        Utils.runInThread(() -> {
+            if (!CollectionUtils.isEmpty(stateListeners)) {
+                stateListeners.forEach(listener -> {
+                    listener.onRaftLeaderChanged();
+                });
+            }
+        });
+    }
+
+    @Override
+    public void onStopFollowing(final LeaderChangeContext ctx) {
+        super.onStopFollowing(ctx);
+    }
+
+    @Override
+    public void onConfigurationCommitted(final Configuration conf) {
+        log.info("Raft onConfigurationCommitted {}", conf);
+    }
+
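+    /**
+     * Snapshot save flow: every task handler dumps its state into a common snapshot
+     * directory, the directory is zipped into a single archive, and the archive is
+     * registered with the snapshot writer together with a CRC64 checksum that is
+     * verified again on load.
+     */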
{}", e.toString()); + done.run(new Status(RaftError.EIO, e.toString())); + return; + } + + // compress + try { + compressSnapshot(writer); + FileUtils.deleteDirectory(new File(snapshotDir)); + } catch (Exception e) { + log.error("Failed to delete snapshot directory {}, {}", snapshotDir, e.toString()); + done.run(new Status(RaftError.EIO, e.toString())); + return; + } + done.run(Status.OK()); + } + + @Override + public boolean onSnapshotLoad(final SnapshotReader reader) { + if (isLeader()) { + log.warn("Leader is not supposed to load snapshot"); + return false; + } + String snapshotDir = reader.getPath() + File.separator + SNAPSHOT_DIR_NAME; + String snapshotArchive = reader.getPath() + File.separator + SNAPSHOT_ARCHIVE_NAME; + // 2. decompress snapshot archive + try { + decompressSnapshot(reader); + } catch (PDException e) { + log.error("Failed to delete snapshot directory {}, {}", snapshotDir, e.toString()); + return true; + } + + CountDownLatch latch = new CountDownLatch(taskHandlers.size()); + for (RaftTaskHandler taskHandler : taskHandlers) { + try { + KVOperation op = KVOperation.createLoadSnapshot(snapshotDir); + taskHandler.invoke(op, null); + log.info("Raft onSnapshotLoad success"); + latch.countDown(); + } catch (PDException e) { + log.error("Raft onSnapshotLoad failed. {}", e.toString()); + return false; + } + } + try { + latch.await(); + } catch (InterruptedException e) { + log.error("Raft onSnapshotSave failed. {}", e.toString()); + return false; + } + + try { + // TODO: remove file from meta + // SnapshotReader 沒有提供刪除文件的接口 + FileUtils.deleteDirectory(new File(snapshotDir)); + File file = new File(snapshotArchive); + if (file.exists()) { + FileUtils.forceDelete(file); + } + } catch (IOException e) { + log.error("Failed to delete snapshot directory {} and file {}", snapshotDir, + snapshotArchive); + return false; + } + + return true; + } + + private void compressSnapshot(final SnapshotWriter writer) throws PDException { + final Checksum checksum = new CRC64(); + final String snapshotArchive = writer.getPath() + File.separator + SNAPSHOT_ARCHIVE_NAME; + try { + ZipUtils.compress(writer.getPath(), SNAPSHOT_DIR_NAME, snapshotArchive, checksum); + LocalFileMetaOutter.LocalFileMeta.Builder metaBuild = + LocalFileMetaOutter.LocalFileMeta.newBuilder(); + metaBuild.setChecksum(Long.toHexString(checksum.getValue())); + if (!writer.addFile(SNAPSHOT_ARCHIVE_NAME, metaBuild.build())) { + throw new PDException(Pdpb.ErrorType.ROCKSDB_SAVE_SNAPSHOT_ERROR_VALUE, + "failed to add file to LocalFileMeta"); + } + } catch (IOException e) { + throw new PDException(Pdpb.ErrorType.ROCKSDB_SAVE_SNAPSHOT_ERROR_VALUE, e); + } + } + + private void decompressSnapshot(final SnapshotReader reader) throws PDException { + final LocalFileMetaOutter.LocalFileMeta meta = + (LocalFileMetaOutter.LocalFileMeta) reader.getFileMeta(SNAPSHOT_ARCHIVE_NAME); + final Checksum checksum = new CRC64(); + final String snapshotArchive = reader.getPath() + File.separator + SNAPSHOT_ARCHIVE_NAME; + try { + ZipUtils.decompress(snapshotArchive, new File(reader.getPath()), checksum); + if (meta.hasChecksum()) { + if (!meta.getChecksum().equals(Long.toHexString(checksum.getValue()))) { + throw new PDException(Pdpb.ErrorType.ROCKSDB_LOAD_SNAPSHOT_ERROR_VALUE, + "Snapshot checksum failed"); + } + } + } catch (IOException e) { + throw new PDException(Pdpb.ErrorType.ROCKSDB_LOAD_SNAPSHOT_ERROR_VALUE, e); + } + } + + public static class RaftClosureAdapter implements KVStoreClosure { + + private final KVOperation op; + private final 
+    public static class RaftClosureAdapter implements KVStoreClosure {
+
+        private final KVOperation op;
+        private final KVStoreClosure closure;
+
+        public RaftClosureAdapter(KVOperation op, KVStoreClosure closure) {
+            this.op = op;
+            this.closure = closure;
+        }
+
+        public KVStoreClosure getClosure() {
+            return closure;
+        }
+
+        @Override
+        public void run(Status status) {
+            closure.run(status);
+        }
+
+        @Override
+        public Pdpb.Error getError() {
+            return null;
+        }
+
+        @Override
+        public void setError(Pdpb.Error error) {
+        }
+
+        @Override
+        public Object getData() {
+            return null;
+        }
+
+        @Override
+        public void setData(Object data) {
+        }
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftTaskHandler.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftTaskHandler.java
new file mode 100644
index 0000000000..ec8120cc83
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/RaftTaskHandler.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.raft;
+
+import org.apache.hugegraph.pd.common.PDException;
+
+/**
+ * Receives the data replicated through raft; invoked when a log entry is applied
+ */
+public interface RaftTaskHandler {
+
+    boolean invoke(final KVOperation op, KVStoreClosure response) throws PDException;
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/ZipUtils.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/ZipUtils.java
new file mode 100644
index 0000000000..a570e0ba93
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/raft/ZipUtils.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hugegraph.pd.raft; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.zip.CheckedInputStream; +import java.util.zip.CheckedOutputStream; +import java.util.zip.Checksum; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.output.NullOutputStream; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public final class ZipUtils { + + public static void compress(final String rootDir, final String sourceDir, + final String outputFile, final Checksum checksum) throws + IOException { + try (final FileOutputStream fos = new FileOutputStream(outputFile); + final CheckedOutputStream cos = new CheckedOutputStream(fos, checksum); + final ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(cos))) { + ZipUtils.compressDirectoryToZipFile(rootDir, sourceDir, zos); + zos.flush(); + fos.getFD().sync(); + } + } + + private static void compressDirectoryToZipFile(final String rootDir, final String sourceDir, + final ZipOutputStream zos) throws IOException { + final String dir = Paths.get(rootDir, sourceDir).toString(); + final File[] files = new File(dir).listFiles(); + for (final File file : files) { + final String child = Paths.get(sourceDir, file.getName()).toString(); + if (file.isDirectory()) { + compressDirectoryToZipFile(rootDir, child, zos); + } else { + zos.putNextEntry(new ZipEntry(child)); + try (final FileInputStream fis = new FileInputStream(file); + final BufferedInputStream bis = new BufferedInputStream(fis)) { + IOUtils.copy(bis, zos); + } + } + } + } + + public static void decompress(final String sourceFile, final File outputDir, + final Checksum checksum) throws IOException { + try (final FileInputStream fis = new FileInputStream(sourceFile); + final CheckedInputStream cis = new CheckedInputStream(fis, checksum); + final ZipInputStream zis = new ZipInputStream(new BufferedInputStream(cis))) { + ZipEntry entry; + while ((entry = zis.getNextEntry()) != null) { + final String fileName = entry.getName(); + final File entryFile = new File(outputDir, fileName); + if (!entryFile.toPath().normalize().startsWith(outputDir.toPath())) { + throw new IOException("Bad zip entry"); + } + FileUtils.forceMkdir(entryFile.getParentFile()); + try (final FileOutputStream fos = new FileOutputStream(entryFile); + final BufferedOutputStream bos = new BufferedOutputStream(fos)) { + IOUtils.copy(zis, bos); + bos.flush(); + fos.getFD().sync(); + } + } + IOUtils.copy(cis, NullOutputStream.NULL_OUTPUT_STREAM); + } + } +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/BaseKVStoreClosure.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/BaseKVStoreClosure.java new file mode 100644 index 0000000000..3cc4dbb54a --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/BaseKVStoreClosure.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hugegraph.pd.store; + +import org.apache.hugegraph.pd.grpc.Pdpb; +import org.apache.hugegraph.pd.raft.KVStoreClosure; + +public abstract class BaseKVStoreClosure implements KVStoreClosure { + + private Pdpb.Error error; + private Object data; + + @Override + public Pdpb.Error getError() { + return error; + } + + @Override + public void setError(Pdpb.Error error) { + this.error = error; + } + + @Override + public Object getData() { + return data; + } + + @Override + public void setData(Object data) { + this.data = data; + } + +} diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/HgKVStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/HgKVStore.java new file mode 100644 index 0000000000..263cb70b28 --- /dev/null +++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/HgKVStore.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hugegraph.pd.store;
+
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+
+public interface HgKVStore {
+
+    void init(PDConfig config);
+
+    void put(byte[] key, byte[] value) throws PDException;
+
+    byte[] get(byte[] key) throws PDException;
+
+    List<KV> scanPrefix(byte[] prefix);
+
+    long remove(byte[] bytes) throws PDException;
+
+    long removeByPrefix(byte[] bytes) throws PDException;
+
+    void putWithTTL(byte[] key, byte[] value, long ttl) throws PDException;
+
+    void putWithTTL(byte[] key, byte[] value, long ttl, TimeUnit timeUnit) throws PDException;
+
+    byte[] getWithTTL(byte[] key) throws PDException;
+
+    void removeWithTTL(byte[] key) throws PDException;
+
+    List<byte[]> getListWithTTL(byte[] key) throws PDException;
+
+    void clear() throws PDException;
+
+    void saveSnapshot(String snapshotPath) throws PDException;
+
+    void loadSnapshot(String snapshotPath) throws PDException;
+
+    List<KV> scanRange(byte[] start, byte[] end);
+
+    void close();
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/HgKVStoreImpl.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/HgKVStoreImpl.java
new file mode 100644
index 0000000000..bd2e7a9e22
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/HgKVStoreImpl.java
@@ -0,0 +1,343 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.store;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Pdpb;
+import org.rocksdb.Checkpoint;
+import org.rocksdb.Options;
+import org.rocksdb.ReadOptions;
+import org.rocksdb.RocksDB;
+import org.rocksdb.RocksDBException;
+import org.rocksdb.RocksIterator;
+import org.rocksdb.Slice;
+
+import com.alipay.sofa.jraft.util.Utils;
+import com.google.common.cache.CacheBuilder;
+import com.google.common.primitives.Bytes;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class HgKVStoreImpl implements HgKVStore {
+
+    // In-memory TTL cache: store key -> (key -> value), with per-entry expiry
+    private static final ConcurrentHashMap<String, ConcurrentMap<String, Object>> CACHE =
+            new ConcurrentHashMap<>();
+    // DB reads/writes share the read lock; the write lock is reserved for the snapshot
+    // save/load operations, which close and swap the underlying DB
+    private final ReadWriteLock readWriteLock = new ReentrantReadWriteLock();
+    private RocksDB db;
+    private String dbPath;
+    private Options dbOptions;
+
+    @Override
+    public void init(PDConfig config) {
+        dbOptions = new Options().setCreateIfMissing(true);
+
+        final Lock writeLock = this.readWriteLock.writeLock();
+        writeLock.lock();
+        try {
+            this.dbPath = config.getDataPath() + "/rocksdb/";
+            File file = new File(this.dbPath);
+            if (!file.exists()) {
+                try {
+                    FileUtils.forceMkdir(file);
+                } catch (IOException e) {
+                    log.warn("Failed to create data path {}", this.dbPath, e);
+                }
+            }
+            openRocksDB(dbPath);
+        } catch (PDException e) {
+            log.error("Failed to open RocksDB at {}", this.dbPath, e);
+        } finally {
+            writeLock.unlock();
+        }
+    }
+
+    @Override
+    public void put(byte[] key, byte[] value) throws PDException {
+        final Lock readLock = this.readWriteLock.readLock();
+        readLock.lock();
+        try {
+            db.put(key, value);
+        } catch (RocksDBException e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_WRITE_ERROR_VALUE, e);
+        } finally {
+            readLock.unlock();
+        }
+    }
+
+    @Override
+    public byte[] get(byte[] key) throws PDException {
+        final Lock readLock = this.readWriteLock.readLock();
+        readLock.lock();
+        try {
+            return db.get(key);
+        } catch (RocksDBException e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_READ_ERROR_VALUE, e);
+        } finally {
+            readLock.unlock();
+        }
+    }
+
+    @Override
+    public List<KV> scanPrefix(byte[] prefix) {
+        final Lock readLock = this.readWriteLock.readLock();
+        readLock.lock();
+        try (ReadOptions options = new ReadOptions().setIterateLowerBound(new Slice(prefix));
+             RocksIterator iterator = db.newIterator(options)) {
+            List<KV> kvs = new ArrayList<>();
+            iterator.seekToFirst();
+            while (iterator.isValid() && 0 == Bytes.indexOf(iterator.key(), prefix)) {
+                kvs.add(new KV(iterator.key(), iterator.value()));
+                iterator.next();
+            }
+            return kvs;
+        } finally {
+            readLock.unlock();
+        }
+    }
+
+    @Override
+    public long remove(byte[] key) throws PDException {
+        final Lock readLock = this.readWriteLock.readLock();
+        readLock.lock();
+        try {
+            db.delete(key);
+        } catch (RocksDBException e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_DEL_ERROR_VALUE, e);
+        } finally {
+            readLock.unlock();
+        }
+        return 0;
+    }
+
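+    /**
+     * Prefix deletion: iterate from the prefix as the lower bound and delete every key until
+     * the first key that no longer starts with the prefix.
+     */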
+    @Override
+    public long removeByPrefix(byte[] prefix) throws PDException {
+        final Lock readLock = this.readWriteLock.readLock();
+        readLock.lock();
+        try (ReadOptions options = new ReadOptions().setIterateLowerBound(new Slice(prefix));
+             RocksIterator iterator = db.newIterator(options)) {
+            iterator.seekToFirst();
+            while (iterator.isValid()) {
+                if (0 == Bytes.indexOf(iterator.key(), prefix)) {
+                    db.delete(iterator.key());
+                } else {
+                    break;
+                }
+                iterator.next();
+            }
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_WRITE_ERROR_VALUE, e);
+        } finally {
+            readLock.unlock();
+        }
+        return 0;
+    }
+
+    @Override
+    public void clear() throws PDException {
+        CACHE.clear();
+    }
+
+    @Override
+    public List<byte[]> getListWithTTL(byte[] key) throws PDException {
+        String storeKey = new String(key, Charset.defaultCharset());
+        LinkedList<byte[]> ts = new LinkedList<>();
+        CACHE.keySet().forEach((cacheKey) -> {
+            if (cacheKey.startsWith(storeKey)) {
+                ConcurrentMap<String, Object> map;
+                if ((map = CACHE.get(cacheKey)) == null) {
+                    return;
+                }
+                map.values().forEach((element) -> {
+                    ts.add((byte[]) element);
+                });
+            }
+        });
+        return ts;
+    }
+
+    @Override
+    public byte[] getWithTTL(byte[] key) throws PDException {
+        ConcurrentMap<String, Object> map;
+        String storeKey = new String(key, Charset.defaultCharset());
+        if ((map = CACHE.get(storeKey)) == null) {
+            return null;
+        }
+        Object value = map.get(storeKey);
+        return value == null ? null : (byte[]) value;
+    }
+
+    @Override
+    public void removeWithTTL(byte[] key) throws PDException {
+        ConcurrentMap<String, Object> map;
+        String storeKey = new String(key, Charset.defaultCharset());
+        if ((map = CACHE.get(storeKey)) == null) {
+            return;
+        }
+        map.remove(storeKey);
+    }
+
+    @Override
+    public void putWithTTL(byte[] key, byte[] value, long ttl) throws PDException {
+        this.putWithTTL(key, value, ttl, TimeUnit.SECONDS);
+    }
+
+    @Override
+    public void putWithTTL(byte[] key, byte[] value, long ttl, TimeUnit timeUnit)
+            throws PDException {
+        try {
+            // A guava cache with per-write expiry backs the TTL entries; these values are
+            // kept only in memory and are not persisted to RocksDB
+            ConcurrentMap<String, Object> spaceNode = CacheBuilder.newBuilder()
+                                                                  .initialCapacity(200)
+                                                                  .expireAfterWrite(ttl, timeUnit)
+                                                                  .<String, Object>build()
+                                                                  .asMap();
+            String storeKey = new String(key, Charset.defaultCharset());
+            ConcurrentMap<String, Object> space = CACHE.putIfAbsent(storeKey, spaceNode);
+            if (space == null) {
+                space = spaceNode;
+            }
+            space.put(storeKey, value);
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_WRITE_ERROR_VALUE, e);
+        }
+    }
+
+    @Override
+    public void saveSnapshot(String snapshotPath) throws PDException {
+        log.info("begin save snapshot at {}", snapshotPath);
+        final Lock writeLock = this.readWriteLock.writeLock();
+        writeLock.lock();
+        try (final Checkpoint checkpoint = Checkpoint.create(this.db)) {
+            final String tempPath = Paths.get(snapshotPath) + "_temp";
+            final File tempFile = new File(tempPath);
+            FileUtils.deleteDirectory(tempFile);
+            checkpoint.createCheckpoint(tempPath);
+            final File snapshotFile = new File(snapshotPath);
+            FileUtils.deleteDirectory(snapshotFile);
+            if (!Utils.atomicMoveFile(tempFile, snapshotFile, true)) {
+                log.error("Fail to rename {} to {}", tempPath, snapshotPath);
+                throw new PDException(Pdpb.ErrorType.ROCKSDB_SAVE_SNAPSHOT_ERROR_VALUE,
+                                      String.format("Fail to rename %s to %s", tempPath,
+                                                    snapshotPath));
+            }
+        } catch (final PDException e) {
+            throw e;
+        } catch (final Exception e) {
+            log.error("Fail to write snapshot at path: {}", snapshotPath, e);
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_SAVE_SNAPSHOT_ERROR_VALUE, e);
+        } finally {
+            writeLock.unlock();
+        }
+        log.info("saved snapshot into {}", snapshotPath);
+    }
+
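+    /**
+     * Snapshot loading closes the live DB, atomically swaps the snapshot directory into the
+     * data path and reopens RocksDB; the write lock keeps reads and writes out while the
+     * swap is in progress.
+     */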
+    @Override
+    public void loadSnapshot(String snapshotPath) throws PDException {
+        log.info("begin load snapshot from {}", snapshotPath);
+        final Lock writeLock = this.readWriteLock.writeLock();
+        writeLock.lock();
+        try {
+            final File snapshotFile = new File(snapshotPath);
+            if (!snapshotFile.exists()) {
+                log.error("Snapshot file {} not exists.", snapshotPath);
+                return;
+            }
+            // close DB
+            closeRocksDB();
+            // replace the rocksdb data with the snapshot data
+            final File dbFile = new File(this.dbPath);
+            FileUtils.deleteDirectory(dbFile);
+            if (!Utils.atomicMoveFile(snapshotFile, dbFile, true)) {
+                log.error("Fail to rename {} to {}", snapshotPath, this.dbPath);
+                throw new PDException(Pdpb.ErrorType.ROCKSDB_LOAD_SNAPSHOT_ERROR_VALUE,
+                                      String.format("Fail to rename %s to %s", snapshotPath,
+                                                    this.dbPath));
+            }
+            // reopen the db
+            openRocksDB(this.dbPath);
+        } catch (final PDException e) {
+            throw e;
+        } catch (final Exception e) {
+            log.error("Failed to load snapshot from {}", snapshotPath, e);
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_LOAD_SNAPSHOT_ERROR_VALUE, e);
+        } finally {
+            writeLock.unlock();
+        }
+        log.info("loaded snapshot from {}", snapshotPath);
+    }
+
+    @Override
+    public List<KV> scanRange(byte[] start, byte[] end) {
+        final Lock readLock = this.readWriteLock.readLock();
+        readLock.lock();
+        try (ReadOptions options = new ReadOptions().setIterateLowerBound(new Slice(start))
+                                                    .setIterateUpperBound(new Slice(end));
+             RocksIterator iterator = db.newIterator(options)) {
+            List<KV> kvs = new ArrayList<>();
+            iterator.seekToFirst();
+            while (iterator.isValid()) {
+                kvs.add(new KV(iterator.key(), iterator.value()));
+                iterator.next();
+            }
+            return kvs;
+        } finally {
+            readLock.unlock();
+        }
+    }
+
+    @Override
+    public void close() {
+        closeRocksDB();
+    }
+
+    private void closeRocksDB() {
+        if (this.db != null) {
+            this.db.close();
+            this.db = null;
+        }
+    }
+
+    private void openRocksDB(String dbPath) throws PDException {
+        try {
+            this.db = RocksDB.open(dbOptions, dbPath);
+        } catch (RocksDBException e) {
+            log.error("Failed to open RocksDB from {}", dbPath, e);
+            throw new PDException(Pdpb.ErrorType.ROCKSDB_LOAD_SNAPSHOT_ERROR_VALUE, e);
+        }
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/KV.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/KV.java
new file mode 100644
index 0000000000..35dce065b5
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/KV.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.store;
+
+public class KV {
+
+    private byte[] key;
+    private byte[] value;
+
+    public KV(byte[] key, byte[] value) {
+        this.key = key;
+        this.value = value;
+    }
+
+    public byte[] getKey() {
+        return key;
+    }
+
+    public void setKey(byte[] key) {
+        this.key = key;
+    }
+
+    public byte[] getValue() {
+        return value;
+    }
+
+    public void setValue(byte[] value) {
+        this.value = value;
+    }
+}
diff --git a/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/RaftKVStore.java b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/RaftKVStore.java
new file mode 100644
index 0000000000..ed97d13f71
--- /dev/null
+++ b/hugegraph-pd/hg-pd-core/src/main/java/org/apache/hugegraph/pd/store/RaftKVStore.java
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hugegraph.pd.store;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hugegraph.pd.common.PDException;
+import org.apache.hugegraph.pd.config.PDConfig;
+import org.apache.hugegraph.pd.grpc.Pdpb;
+import org.apache.hugegraph.pd.raft.KVOperation;
+import org.apache.hugegraph.pd.raft.KVStoreClosure;
+import org.apache.hugegraph.pd.raft.RaftEngine;
+import org.apache.hugegraph.pd.raft.RaftStateMachine;
+import org.apache.hugegraph.pd.raft.RaftTaskHandler;
+
+import com.alipay.sofa.jraft.Status;
+import com.alipay.sofa.jraft.entity.Task;
+import com.alipay.sofa.jraft.error.RaftError;
+
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class RaftKVStore implements HgKVStore, RaftTaskHandler {
+
+    private final RaftEngine engine;
+    private final HgKVStore store;
+
+    public RaftKVStore(RaftEngine engine, HgKVStore store) {
+        this.engine = engine;
+        this.store = store;
+    }
+
+    @Override
+    public void init(PDConfig config) {
+        this.store.init(config);
+        this.engine.addTaskHandler(this);
+    }
+
+    private BaseKVStoreClosure createClosure() {
+        return new BaseKVStoreClosure() {
+            @Override
+            public void run(Status status) {
+                if (!status.isOk()) {
+                    log.error("An error occurred while applying the raft task: {}",
+                              status.getErrorMsg());
+                } else {
+                    log.info("Raft task applied successfully");
+                }
+            }
+        };
+    }
+
+    @Override
+    public void put(byte[] key, byte[] value) throws PDException {
+        KVOperation operation = KVOperation.createPut(key, value);
+        try {
+            applyOperation(operation).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+    }
+
+    /**
+     * Reads do not have to go through raft; serve them directly from the local store
+     */
+    @Override
+    public byte[] get(byte[] key) throws PDException {
+        return store.get(key);
+    }
+
+    @Override
+    public List<KV> scanPrefix(byte[] prefix) {
+        return store.scanPrefix(prefix);
+    }
+
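+    // All mutating operations below follow the same pattern as put(): the KVOperation is
+    // proposed through the raft log via applyOperation() and the call blocks until the
+    // entry has been applied (or has failed) on this node.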
+    @Override
+    public long remove(byte[] bytes) throws PDException {
+        try {
+            applyOperation(KVOperation.createRemove(bytes)).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+        return 0;
+    }
+
+    @Override
+    public long removeByPrefix(byte[] bytes) throws PDException {
+        try {
+            applyOperation(KVOperation.createRemoveByPrefix(bytes)).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+        return 0;
+    }
+
+    @Override
+    public void clear() throws PDException {
+        try {
+            applyOperation(KVOperation.createClear()).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+    }
+
+    @Override
+    public void putWithTTL(byte[] key, byte[] value, long ttl) throws PDException {
+        try {
+            applyOperation(KVOperation.createPutWithTTL(key, value, ttl)).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+    }
+
+    @Override
+    public void putWithTTL(byte[] key, byte[] value, long ttl, TimeUnit timeUnit)
+            throws PDException {
+        try {
+            applyOperation(KVOperation.createPutWithTTL(key, value, ttl, timeUnit)).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+    }
+
+    @Override
+    public List<byte[]> getListWithTTL(byte[] key) throws PDException {
+        return store.getListWithTTL(key);
+    }
+
+    @Override
+    public byte[] getWithTTL(byte[] key) throws PDException {
+        return store.getWithTTL(key);
+    }
+
+    @Override
+    public void removeWithTTL(byte[] key) throws PDException {
+        try {
+            applyOperation(KVOperation.createRemoveWithTTL(key)).get();
+        } catch (Exception e) {
+            throw new PDException(Pdpb.ErrorType.UNKNOWN_VALUE, e.getMessage());
+        }
+    }
+
+    @Override
+    public void saveSnapshot(String snapshotPath) throws PDException {
+        store.saveSnapshot(snapshotPath);
+    }
+
+    @Override
+    public void loadSnapshot(String snapshotPath) throws PDException {
+        store.loadSnapshot(snapshotPath);
+    }
+
+    @Override
+    public List<KV> scanRange(byte[] start, byte[] end) {
+        return store.scanRange(start, end);
+    }
+
+    @Override
+    public void close() {
+        store.close();
+    }
+
+    /**
+     * The actual store operations. These doXxx methods are executed by the state machine
+     * (through invoke()) once the corresponding raft log entry has been applied.
+     */
+    private void doPut(byte[] key, byte[] value) throws PDException {
+        store.put(key, value);
+    }
+
+    public long doRemove(byte[] bytes) throws PDException {
+        return this.store.remove(bytes);
+    }
+
+    public long doRemoveByPrefix(byte[] bytes) throws PDException {
+        return this.store.removeByPrefix(bytes);
+    }
+
+    public void doRemoveWithTTL(byte[] key) throws PDException {
+        this.store.removeWithTTL(key);
+    }
+
+    public void doClear() throws PDException {
+        this.store.clear();
+    }
+
+    public void doPutWithTTL(byte[] key, byte[] value, long ttl) throws PDException {
+        this.store.putWithTTL(key, value, ttl);
+    }
+
+    public void doPutWithTTL(byte[] key, byte[] value, long ttl, TimeUnit timeUnit)
+            throws PDException {
+        this.store.putWithTTL(key, value, ttl, timeUnit);
+    }
+
+    public void doSaveSnapshot(String snapshotPath) throws PDException {
+        this.store.saveSnapshot(snapshotPath);
+    }
+
+    public void doLoadSnapshot(String snapshotPath) throws PDException {
+        this.store.loadSnapshot(snapshotPath);
+    }
+
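+    /**
+     * Wrap a KVOperation into a jraft Task and propose it to the raft group. The returned
+     * future completes when the state machine has applied the entry; a non-OK status is
+     * mapped to a PDException (NOT_LEADER for EPERM, UNKNOWN otherwise). For example,
+     * applyOperation(KVOperation.createPut(key, value)).get() blocks until the put has
+     * been replicated and applied locally (illustrative; see put() above).
+     */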
+    private <T> CompletableFuture<T> applyOperation(final KVOperation op) throws PDException {
+        CompletableFuture<T> future = new CompletableFuture<>();
+        try {
+            final Task task = new Task();
+            task.setData(ByteBuffer.wrap(op.toByteArray()));
+            task.setDone(new RaftStateMachine.RaftClosureAdapter(op, new KVStoreClosure() {
+                Object data;
+                Pdpb.Error error;
+
+                @Override
+                public Pdpb.Error getError() {
+                    return error;
+                }
+
+                @Override
+                public void setError(Pdpb.Error error) {
+                    this.error = error;
+                }
+
+                @Override
+                public Object getData() {
+                    return data;
+                }
+
+                @Override
+                public void setData(Object data) {
+                    this.data = data;
+                }
+
+                @Override
+                public void run(Status status) {
+                    if (status.isOk()) {
+                        future.complete((T) data);
+                    } else {
+                        RaftError raftError = status.getRaftError();
+                        Pdpb.ErrorType type;
+                        if (RaftError.EPERM.equals(raftError)) {
+                            type = Pdpb.ErrorType.NOT_LEADER;
+                        } else {
+                            type = Pdpb.ErrorType.UNKNOWN;
+                        }
+                        error = Pdpb.Error.newBuilder().setType(type)
+                                          .setMessage(status.getErrorMsg())
+                                          .build();
+                        future.completeExceptionally(new PDException(error.getTypeValue()));
+                    }
+                }
+            }));
+            this.engine.addTask(task);
+            return future;
+        } catch (Exception e) {
+            future.completeExceptionally(e);
+            return future;
+        }
+    }
+
+    private boolean isLeader() {
+        return this.engine.isLeader();
+    }
+
+    @Override
+    public boolean invoke(KVOperation op, KVStoreClosure response) throws PDException {
+        switch (op.getOp()) {
+            case KVOperation.GET:
+                // reads are served locally and never replicated
+                break;
+            case KVOperation.PUT:
+                doPut(op.getKey(), op.getValue());
+                break;
+            case KVOperation.REMOVE:
+                doRemove(op.getKey());
+                break;
+            case KVOperation.PUT_WITH_TTL:
+                doPutWithTTL(op.getKey(), op.getValue(), (long) op.getArg());
+                break;
+            case KVOperation.PUT_WITH_TTL_UNIT:
+                Object[] arg = (Object[]) op.getArg();
+                doPutWithTTL(op.getKey(), op.getValue(), (long) arg[0], (TimeUnit) arg[1]);
+                break;
+            case KVOperation.REMOVE_BY_PREFIX:
+                doRemoveByPrefix(op.getKey());
+                break;
+            case KVOperation.REMOVE_WITH_TTL:
+                doRemoveWithTTL(op.getKey());
+                break;
+            case KVOperation.CLEAR:
+                doClear();
+                break;
+            case KVOperation.SAVE_SNAPSHOT:
+                doSaveSnapshot((String) op.getAttach());
+                break;
+            case KVOperation.LOAD_SNAPSHOT:
+                doLoadSnapshot((String) op.getAttach());
+                break;
+            default:
+                log.error("Unknown operation {}", op.getOp());
+        }
+        return false;
+    }
+}
diff --git a/hugegraph-pd/hg-pd-test/pom.xml b/hugegraph-pd/hg-pd-test/pom.xml
index 31c0fd889d..f2e187cbe1 100644
--- a/hugegraph-pd/hg-pd-test/pom.xml
+++ b/hugegraph-pd/hg-pd-test/pom.xml
@@ -99,6 +99,16 @@
             <artifactId>hg-pd-common</artifactId>
             <version>${revision}</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.hugegraph</groupId>
+            <artifactId>hg-pd-client</artifactId>
+            <version>${revision}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hugegraph</groupId>
+            <artifactId>hg-pd-core</artifactId>
+            <version>${revision}</version>
+        </dependency>
         <dependency>
             <groupId>com.google.code.gson</groupId>
diff --git a/hugegraph-pd/pom.xml b/hugegraph-pd/pom.xml
index 6253cfd443..8647775d3c 100644
--- a/hugegraph-pd/pom.xml
+++ b/hugegraph-pd/pom.xml
@@ -36,11 +36,10 @@
         <module>hg-pd-common</module>
         <module>hg-pd-client</module>
         <module>hg-pd-test</module>
+        <module>hg-pd-core</module>
-
-