Skip to content

Commit

Permalink
HDDS-11243. SCM SafeModeRule Support EC.
Browse files Browse the repository at this point in the history
  • Loading branch information
slfan1989 committed Aug 4, 2024
1 parent 2dd4e15 commit fcdcf76
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,16 @@ public synchronized double getCurrentECContainerThreshold() {
}

/**
 * Returns the EC max-container value held in {@code ecMaxContainer}.
 *
 * <p>When the stored value is zero, 1 is returned instead, so that code
 * dividing by this result (e.g. when computing a percentage) never
 * divides by zero.
 *
 * @return {@code ecMaxContainer}, or 1 when it is zero
 */
public synchronized double getEcMaxContainer() {
  // A zero count is replaced by 1 to keep this value usable as a divisor.
  return ecMaxContainer == 0 ? 1 : ecMaxContainer;
}

/**
 * Returns the Ratis max-container value held in {@code ratisMaxContainer}.
 *
 * <p>When the stored value is zero, 1 is returned instead, so that code
 * dividing by this result (e.g. when computing a percentage) never
 * divides by zero.
 *
 * @return {@code ratisMaxContainer}, or 1 when it is zero
 */
private synchronized double getRatisMaxContainer() {
  // A zero count is replaced by 1 to keep this value usable as a divisor.
  return ratisMaxContainer == 0 ? 1 : ratisMaxContainer;
}

Expand All @@ -171,11 +177,11 @@ protected synchronized void process(
ratisContainerDNsMap.get(containerID).add(datanodeUUID);
if (!reportedConatinerIDSet.contains(containerID)) {
Set<UUID> uuids = ratisContainerDNsMap.get(containerID);
if (uuids != null && uuids.size() > 1) {
if (uuids != null && uuids.size() >= 1) {
ratisContainerWithMinReplicas.getAndAdd(1);
reportedConatinerIDSet.add(containerID);
getSafeModeMetrics()
.incCurrentContainersWithOneReplicaReportedCount();
.incCurrentContainersWithOneReplicaReportedCount();
}
}
}
Expand Down Expand Up @@ -205,8 +211,8 @@ protected synchronized void process(
SCMSafeModeManager.getLogger().info(
"SCM in safe mode. {} % containers [Ratis] have at least one"
+ " reported replica, {} % containers [EC] have at N reported replica.",
((ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer) * 100),
((ecContainerWithMinReplicas.doubleValue() / ecMaxContainer) * 100)
((ratisContainerWithMinReplicas.doubleValue() / getRatisMaxContainer()) * 100),
((ecContainerWithMinReplicas.doubleValue() / getEcMaxContainer()) * 100)
);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -828,10 +828,13 @@ public static Pipeline getRandomPipeline() {
*/
public static List<ContainerInfo> getContainerInfo(int numContainers) {
List<ContainerInfo> containerInfoList = new ArrayList<>();
RatisReplicationConfig ratisReplicationConfig =
RatisReplicationConfig.getInstance(ReplicationFactor.THREE);
for (int i = 0; i < numContainers; i++) {
ContainerInfo.Builder builder = new ContainerInfo.Builder();
containerInfoList.add(builder
.setContainerID(RandomUtils.nextLong())
.setReplicationConfig(ratisReplicationConfig)
.build());
}
return containerInfoList;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.apache.hadoop.hdds.scm.pipeline.PipelineProvider;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManagerImpl;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer;
import org.apache.hadoop.hdds.server.events.EventHandler;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.hdds.server.events.EventQueue;
Expand Down Expand Up @@ -138,8 +139,10 @@ private void testSafeMode(int numContainers) throws Exception {

assertTrue(scmSafeModeManager.getInSafeMode());
validateRuleStatus("DatanodeSafeModeRule", "registered datanodes 0");
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
HddsTestUtils.createNodeRegistrationContainerReport(containers));
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(containers);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport);
queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);

long cutOff = (long) Math.ceil(numContainers * config.getDouble(
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
Expand Down Expand Up @@ -180,7 +183,7 @@ public void testSafeModeExitRule() throws Exception {

assertTrue(scmSafeModeManager.getInSafeMode());
validateRuleStatus("ContainerSafeModeRule",
"% of containers with at least one reported");
"0.00% of [Ratis] Containers(0 / 100) with at least one reported");
testContainerThreshold(containers.subList(0, 25), 0.25);
assertEquals(25, scmSafeModeManager.getSafeModeMetrics()
.getCurrentContainersWithOneReplicaReportedCount().value());
Expand Down Expand Up @@ -331,6 +334,16 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(

List<Pipeline> pipelines = pipelineManager.getPipelines();

for (Pipeline pipeline : pipelines) {
List<DatanodeDetails> nodes = pipeline.getNodes();
for (DatanodeDetails node : nodes) {
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
new SCMDatanodeProtocolServer.NodeRegistrationContainerReport(node, null);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
nodeRegistrationContainerReport);
}
}

int healthyPipelineThresholdCount =
scmSafeModeManager.getHealthyPipelineSafeModeRule()
.getHealthyPipelineThresholdCount();
Expand Down Expand Up @@ -522,8 +535,10 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {

// Register all DataNodes except last one and assert SCM is in safe mode.
for (int i = 0; i < numOfDns - 1; i++) {
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
HddsTestUtils.createNodeRegistrationContainerReport(containers));
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(containers);
queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationContainerReport);
assertTrue(scmSafeModeManager.getInSafeMode());
assertEquals(1, scmSafeModeManager.getCurrentContainerThreshold());
}
Expand All @@ -543,8 +558,12 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {
private void testContainerThreshold(List<ContainerInfo> dnContainers,
double expectedThreshold)
throws Exception {
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(dnContainers);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
HddsTestUtils.createNodeRegistrationContainerReport(dnContainers));
nodeRegistrationContainerReport);
queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT,
nodeRegistrationContainerReport);
GenericTestUtils.waitFor(() -> {
double threshold = scmSafeModeManager.getCurrentContainerThreshold();
return threshold == expectedThreshold;
Expand Down Expand Up @@ -589,10 +608,18 @@ public void testSafeModePipelineExitRule() throws Exception {
config, containers, null, pipelineManager, queue, serviceManager,
scmContext);

queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
HddsTestUtils.createNodeRegistrationContainerReport(containers));
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(containers);
queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);

assertTrue(scmSafeModeManager.getInSafeMode());

for (DatanodeDetails datanodeDetail : pipeline.getNodes()) {
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationReport =
new SCMDatanodeProtocolServer.NodeRegistrationContainerReport(datanodeDetail, null);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, nodeRegistrationReport);
}

firePipelineEvent(pipelineManager, pipeline);

GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
Expand Down Expand Up @@ -647,8 +674,9 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {

// Register all DataNodes except last one and assert SCM is in safe mode.
for (int i = 0; i < numOfDns - 1; i++) {
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
HddsTestUtils.createNodeRegistrationContainerReport(containers));
SCMDatanodeProtocolServer.NodeRegistrationContainerReport nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(containers);
queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, nodeRegistrationContainerReport);
assertTrue(scmSafeModeManager.getInSafeMode());
assertFalse(scmSafeModeManager.getPreCheckComplete());
}
Expand Down

0 comments on commit fcdcf76

Please sign in to comment.