Skip to content

Commit

Permalink
[Platform] Health-check script generates an alert for 2.5.x when usin…
Browse files Browse the repository at this point in the history
…g Python 3 #7082

Summary:
 - Fixed error in 'cluster_health.py' for execution with Python 3.x;
 - Added logic to mark all active health-check alerts as RESOLVED once a health-check succeeded;
 - Updated text of emails for alerts - previously state was absent there;
 - Updated junit tests.

Test Plan:
Test scenario 1.
  Check that health-check works without problems when Python 3 is used.

Test scenario 2.
  1. Initiate the health-check problem by any reason. You should receive an email with the alert. The alert should appear on the 'Alerts' page of YW UI.
  2. Resolve the problem from step 1. Ensure that another email is received where the alert state is mentioned as 'RESOLVED'. The alert should disappear from the 'Alerts' page of YW UI.

Reviewers: daniel

Reviewed By: daniel

Subscribers: jenkins-bot, yugaware

Differential Revision: https://phabricator.dev.yugabyte.com/D10543
  • Loading branch information
SergeyPotachev committed Feb 5, 2021
1 parent daafee2 commit 0858c82
Show file tree
Hide file tree
Showing 8 changed files with 184 additions and 44 deletions.
9 changes: 6 additions & 3 deletions managed/devops/bin/cluster_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def check_output(cmd, env):
return 'Error executing command {}: timeout occurred'.format(cmd)

output, stderr = command.communicate()
return output.decode('utf-8').encode("ascii", "ignore")
return output.decode('utf-8').encode("ascii", "ignore").decode("ascii")
except subprocess.CalledProcessError as e:
return 'Error executing command {}: {}'.format(
cmd, e.output.decode("utf-8").encode("ascii", "ignore"))
Expand Down Expand Up @@ -249,6 +249,9 @@ def check_disk_utilization(self):

# Do not process the headers.
lines = output.split('\n')
if len(lines) < 2:
return e.fill_and_return_entry([output], True)

msgs.append(lines[0])
for line in lines[1:]:
msgs.append(line)
Expand Down Expand Up @@ -480,11 +483,11 @@ def run(self):
checks_remaining += 1
sleep_interval = self.retry_interval_secs if check.tries > 0 else 0

check_func_name = check.__name__ if PY3 else check.func_name
if check.tries > 0:
logging.info("Retry # " + str(check.tries) +
" for check " + check.func_name)
" for check " + check_func_name)

check_func_name = check.__name__ if PY3 else check.func_name
if check.yb_process is None:
check.result = self.pool.apply_async(
multithreaded_caller,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

import java.util.ArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;

import javax.mail.MessagingException;

import java.util.Date;
Expand All @@ -31,6 +33,7 @@
import com.yugabyte.yw.forms.CustomerRegisterFormData.SmtpData;
import com.yugabyte.yw.models.AccessKey;
import com.yugabyte.yw.models.Alert;
import com.yugabyte.yw.models.Alert.State;
import com.yugabyte.yw.models.Customer;
import com.yugabyte.yw.models.CustomerTask;
import com.yugabyte.yw.models.CustomerConfig;
Expand Down Expand Up @@ -67,7 +70,8 @@ public class HealthChecker {
public static final String kCheckLabel = "check_name";
public static final String kNodeLabel = "node";

private static final String ALERT_ERROR_CODE = "HEALTH_CHECKER_FAILURE";
@VisibleForTesting
static final String ALERT_ERROR_CODE = "HEALTH_CHECKER_FAILURE";

play.Configuration config;

Expand All @@ -90,9 +94,11 @@ public class HealthChecker {

private final CollectorRegistry promRegistry;

private HealthCheckerReport healthCheckerReport;
private final HealthCheckerReport healthCheckerReport;

private EmailHelper emailHelper;
private final EmailHelper emailHelper;

private final AlertManager alertManager;

@VisibleForTesting
HealthChecker(
Expand All @@ -102,14 +108,16 @@ public class HealthChecker {
HealthManager healthManager,
CollectorRegistry promRegistry,
HealthCheckerReport healthCheckerReport,
EmailHelper emailHelper) {
EmailHelper emailHelper,
AlertManager alertManager) {
this.actorSystem = actorSystem;
this.config = config;
this.executionContext = executionContext;
this.healthManager = healthManager;
this.promRegistry = promRegistry;
this.healthCheckerReport = healthCheckerReport;
this.emailHelper = emailHelper;
this.alertManager = alertManager;

this.initialize();
}
Expand All @@ -122,10 +130,11 @@ public HealthChecker(
ExecutionContext executionContext,
HealthManager healthManager,
HealthCheckerReport healthCheckerReport,
EmailHelper emailHelper) {
EmailHelper emailHelper,
AlertManager alertManager) {
this(actorSystem, config, executionContext, healthManager,
CollectorRegistry.defaultRegistry, healthCheckerReport,
emailHelper);
emailHelper, alertManager);
}

private void initialize() {
Expand Down Expand Up @@ -189,6 +198,10 @@ private void processResults(Customer c, Universe u, String response, long durati
LOG.info("Health check for universe {} reported {}. [ {} ms ]", u.name,
(hasErrors ? "errors" : " success"), durationMs);

if (!hasErrors) {
resolveHealthCheckAlerts(c, u);
}

} catch (Exception e) {
LOG.warn("Failed to convert health check response to prometheus metrics " + e.getMessage());
createAlert(c, u,
Expand All @@ -210,6 +223,23 @@ private void processResults(Customer c, Universe u, String response, long durati
}
}

/**
* Updates states of all active health-check alerts to RESOLVED.
*
* @param c Customer
* @param u Universe
*/
private void resolveHealthCheckAlerts(Customer c, Universe u) {
List<Alert> activeAlerts = Alert.list(c.uuid, ALERT_ERROR_CODE, u.universeUUID).stream()
.filter(alert -> alert.state == State.ACTIVE || alert.state == State.CREATED)
.collect(Collectors.toList());
LOG.debug("Resetting health-check alerts, count: " + activeAlerts.size());
for (Alert alert : activeAlerts) {
alert.setState(State.RESOLVED);
alert.save();
}
}

private String sendEmailReport(Universe u, Customer c, SmtpData smtpData,
String emailDestinations, String subject, JsonNode report, boolean reportOnlyErrors) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import com.yugabyte.yw.commissioner.tasks.params.NodeTaskParams;
import com.yugabyte.yw.common.NodeManager;
import com.yugabyte.yw.common.CallHomeManager.CollectionLevel;
import com.yugabyte.yw.common.ShellProcessHandler;
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams;
import com.yugabyte.yw.common.ShellResponse;
import com.yugabyte.yw.models.helpers.NodeDetails;
Expand Down Expand Up @@ -99,7 +98,7 @@ public void run(Universe universe) {

// Create new alert or update existing alert with current node name if alert already exists.
if (Alert.exists(alertErrCode, universe.universeUUID)) {
Alert cronAlert = Alert.get(cust.uuid, alertErrCode, universe.universeUUID);
Alert cronAlert = Alert.list(cust.uuid, alertErrCode, universe.universeUUID).get(0);
List<String> failedNodesList = universe.getNodes().stream()
.map(nodeDetail -> nodeDetail.nodeName)
.collect(Collectors.toList());
Expand Down
24 changes: 13 additions & 11 deletions managed/src/main/java/com/yugabyte/yw/common/AlertManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,13 @@ public void sendEmail(Alert alert, String state) {
? Universe.find.byId(alert.targetUUID)
: null;
if (universe != null) {
content = String.format("Common failure for universe '%s':\n%s.", universe.name,
alert.message);
content = String.format(
"Common failure for universe '%s', state: %s\nFailure details:\n\n%s.",
universe.name, state, alert.message);
} else {
content = String.format("Common failure for customer '%s':\n%s.", customer.name,
alert.message);
content = String.format(
"Common failure for customer '%s', state: %s\nFailure details:\n\n%s.",
customer.name, state, alert.message);
}
}

Expand All @@ -105,17 +107,17 @@ public Alert transitionAlert(Alert alert) {
switch (alert.state) {
case CREATED:
LOG.info("Transitioning alert {} to active", alert.uuid);
sendEmail(alert, "firing");
alert.state = Alert.State.ACTIVE;
sendEmail(alert, "FIRING");
alert.setState(Alert.State.ACTIVE);
break;
case ACTIVE:
LOG.info("Transitioning alert {} to resolved", alert.uuid);
sendEmail(alert, "resolved");
alert.state = Alert.State.RESOLVED;
LOG.info("Transitioning alert {} to resolved (with email)", alert.uuid);
sendEmail(alert, "RESOLVED");
alert.setState(Alert.State.RESOLVED);
break;
case RESOLVED:
LOG.info("Transitioning alert {} to resolved", alert.uuid);
alert.state = Alert.State.RESOLVED;
LOG.info("Transitioning alert {} to resolved (no email)", alert.uuid);
alert.setState(Alert.State.RESOLVED);
break;
}

Expand Down
16 changes: 11 additions & 5 deletions managed/src/main/java/com/yugabyte/yw/models/Alert.java
Original file line number Diff line number Diff line change
Expand Up @@ -163,11 +163,11 @@ public JsonNode toJson() {
}

public static Boolean exists(String errCode) {
return find.query().where().eq("errCode", errCode).findCount() != 0;
return find.query().where().eq("err_code", errCode).findCount() != 0;
}

public static Boolean exists(String errCode, UUID targetUUID) {
return find.query().where().eq("errCode", errCode)
return find.query().where().eq("err_code", errCode)
.eq("target_uuid", targetUUID).findCount() != 0;
}

Expand Down Expand Up @@ -213,10 +213,12 @@ public static List<Alert> listActiveCustomerAlerts(UUID customerUUID) {
.findList();
}

public static Alert get(UUID customerUUID, String errCode, UUID targetUUID) {
public static List<Alert> list(UUID customerUUID, String errCode, UUID targetUUID) {
return find.query().where().eq("customer_uuid", customerUUID)
.eq("errCode", errCode)
.eq("target_uuid", targetUUID).findOne();
.eq("err_code", errCode)
.eq("target_uuid", targetUUID)
.orderBy("create_time desc")
.findList();
}

public static Alert get(UUID alertUUID) {
Expand All @@ -230,4 +232,8 @@ public static Alert get(UUID customerUUID, UUID targetUUID, TargetType targetTyp
.eq("target_type", targetType)
.findOne();
}

public void setState(State state) {
this.state = state;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import akka.actor.ActorSystem;
import akka.actor.Scheduler;

import com.yugabyte.yw.common.AlertManager;
import com.yugabyte.yw.common.ApiUtils;
import com.yugabyte.yw.common.EmailFixtures;
import com.yugabyte.yw.common.EmailHelper;
Expand All @@ -19,6 +20,8 @@
import com.yugabyte.yw.forms.UniverseDefinitionTaskParams.Cluster;
import com.yugabyte.yw.models.AccessKey;
import com.yugabyte.yw.models.Alert;
import com.yugabyte.yw.models.Alert.State;
import com.yugabyte.yw.models.Alert.TargetType;
import com.yugabyte.yw.models.Customer;
import com.yugabyte.yw.models.CustomerConfig;
import com.yugabyte.yw.models.HealthCheck;
Expand Down Expand Up @@ -88,7 +91,10 @@ public class HealthCheckerTest extends FakeDBApplication {
private HealthCheckerReport report;

@Mock
private EmailHelper emailHelper;
private EmailHelper mockEmailHelper;

@Mock
private AlertManager mockAlertManager;

@Before
public void setUp() {
Expand Down Expand Up @@ -118,7 +124,8 @@ public void setUp() {
mockHealthManager,
testRegistry,
report,
emailHelper
mockEmailHelper,
mockAlertManager
);
}

Expand Down Expand Up @@ -533,14 +540,15 @@ public void run(Universe univ) {
});
setupAlertingData(YB_ALERT_TEST_EMAIL, false, false);
// Imitate error while sending the email.
doThrow(new MessagingException("TestException")).when(emailHelper).sendEmail(any(), any(),
doThrow(new MessagingException("TestException")).when(mockEmailHelper).sendEmail(any(), any(),
any(), any(), any());
when(emailHelper.getSmtpData(defaultCustomer.uuid)).thenReturn(EmailFixtures.createSmtpData());
when(mockEmailHelper.getSmtpData(defaultCustomer.uuid))
.thenReturn(EmailFixtures.createSmtpData());

assertEquals(0, Alert.list(defaultCustomer.uuid).size());
healthChecker.checkSingleUniverse(u, defaultCustomer, true, false, YB_ALERT_TEST_EMAIL);

verify(emailHelper, times(1)).sendEmail(any(), any(), any(), any(), any());
verify(mockEmailHelper, times(1)).sendEmail(any(), any(), any(), any(), any());
// To check that alert is created.
List<Alert> alerts = Alert.list(defaultCustomer.uuid);
assertNotEquals(0, alerts.size());
Expand All @@ -550,11 +558,48 @@ public void run(Universe univ) {
@Test
public void testEmailSentWithTwoContentTypes() throws MessagingException {
Universe u = setupUniverse("test");
when(emailHelper.getSmtpData(defaultCustomer.uuid)).thenReturn(EmailFixtures.createSmtpData());
when(mockEmailHelper.getSmtpData(defaultCustomer.uuid))
.thenReturn(EmailFixtures.createSmtpData());
healthChecker.checkSingleUniverse(u, defaultCustomer, true, false, YB_ALERT_TEST_EMAIL);

verify(emailHelper, times(1)).sendEmail(any(), any(), any(), any(), any());
verify(mockEmailHelper, times(1)).sendEmail(any(), any(), any(), any(), any());
verify(report, times(1)).asHtml(eq(u), any(), anyBoolean());
verify(report, times(1)).asPlainText(any(), anyBoolean());
}

private void mockGoodHealthResponse() {
ShellResponse dummyShellResponse = ShellResponse.create(0,
("{''error'': false, ''data'': [ {''node'':''" + dummyNode
+ "'', ''has_error'': false, ''message'':''" + dummyCheck + "'' } ] }").replace("''",
"\""));
when(mockHealthManager.runCommand(any(), any(), any())).thenReturn(dummyShellResponse);
}

@Test
public void testEmailSent_RightAlertsResetted() throws MessagingException {
Universe u = setupUniverse("test");
when(mockEmailHelper.getSmtpData(defaultCustomer.uuid))
.thenReturn(EmailFixtures.createSmtpData());
setupAlertingData(YB_ALERT_TEST_EMAIL, false, false);
mockGoodHealthResponse();

// alert1 is in state ACTIVE.
Alert alert1 = Alert.create(defaultCustomer.uuid, u.universeUUID, TargetType.UniverseType,
HealthChecker.ALERT_ERROR_CODE, "Warning", "Test 1");
alert1.setState(State.ACTIVE);
alert1.save();
// alert1 is in state CREATED.
Alert alert2 = Alert.create(defaultCustomer.uuid, u.universeUUID, TargetType.UniverseType,
HealthChecker.ALERT_ERROR_CODE, "Warning", "Test 2");
// alert3 should not be updated as it has another errCode.
Alert alert3 = Alert.create(defaultCustomer.uuid, u.universeUUID, TargetType.UniverseType,
"Another Error Code", "Warning", "Test 3");

healthChecker.checkSingleUniverse(u, defaultCustomer, true, false, YB_ALERT_TEST_EMAIL);

assertEquals(State.RESOLVED, Alert.get(alert1.uuid).state);
assertEquals(State.RESOLVED, Alert.get(alert2.uuid).state);
// Alert3 is not related to health-check, so it should not be updated.
assertNotEquals(State.RESOLVED, Alert.get(alert3.uuid).state);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
@RunWith(MockitoJUnitRunner.class)
public class AlertManagerTest extends FakeDBApplication {

private static final String TEST_STATE = "test state";

private static final String ALERT_TEST_MESSAGE = "Test message";

private Customer defaultCustomer;
Expand All @@ -45,15 +47,17 @@ public void setUp() {

@Test
public void testSendEmail_DoesntFail_UniverseRemoved() throws MessagingException {
doTestSendEmail(UUID.randomUUID(), String.format("Common failure for customer '%s':\n%s.",
defaultCustomer.name, ALERT_TEST_MESSAGE));
doTestSendEmail(UUID.randomUUID(),
String.format("Common failure for customer '%s', state: %s\nFailure details:\n\n%s.",
defaultCustomer.name, TEST_STATE, ALERT_TEST_MESSAGE));
}

@Test
public void testSendEmail_UniverseExists() throws MessagingException {
Universe u = ModelFactory.createUniverse();
doTestSendEmail(u.universeUUID,
String.format("Common failure for universe '%s':\n%s.", u.name, ALERT_TEST_MESSAGE));
String.format("Common failure for universe '%s', state: %s\nFailure details:\n\n%s.",
u.name, TEST_STATE, ALERT_TEST_MESSAGE));
}

private void doTestSendEmail(UUID universeUUID, String expectedContent)
Expand All @@ -68,7 +72,7 @@ private void doTestSendEmail(UUID universeUUID, String expectedContent)
.thenReturn(Collections.singletonList(destination));
when(emailHelper.getSmtpData(defaultCustomer.uuid)).thenReturn(smtpData);

am.sendEmail(alert, "test state");
am.sendEmail(alert, TEST_STATE);

verify(emailHelper, times(1)).sendEmail(eq(defaultCustomer), anyString(), eq(destination),
eq(smtpData),
Expand Down
Loading

0 comments on commit 0858c82

Please sign in to comment.