Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[mongo] gracefully fail on operation samples collection when node is in recovering mode #19080

Merged
merged 3 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions mongo/changelog.d/19080.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix crash in DBM operation samples collection when a node is in recovering mode.

18 changes: 14 additions & 4 deletions mongo/datadog_checks/mongo/dbm/operation_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import List, Optional

from bson import json_util
from pymongo.errors import NotPrimaryError

from datadog_checks.mongo.dbm.utils import (
format_key_name,
Expand Down Expand Up @@ -107,6 +108,9 @@ def _should_collect_operation_samples(self) -> bool:
if isinstance(deployment, ReplicaSetDeployment) and deployment.is_arbiter:
self._check.log.debug("Skipping operation samples collection on arbiter node")
return False
elif isinstance(deployment, ReplicaSetDeployment) and deployment.replset_state == 3:
self._check.log.debug("Skipping operation samples collection on node in recovering state")
return False
return True

def _get_operation_samples(self, now, databases_monitored: List[str]):
Expand Down Expand Up @@ -149,10 +153,16 @@ def _get_operation_samples(self, now, databases_monitored: List[str]):
continue

# Yields each operation returned by api_client.current_op(), logging every one at debug level.
# NOTE(review): this span is a rendered diff without +/- markers; the four statements right
# after the signature look like the pre-change body and the try/except below like its
# replacement -- confirm against the merged file.
def _get_current_op(self):
operations = self._check.api_client.current_op()
for operation in operations:
self._check.log.debug("Found operation: %s", operation)
yield operation
try:
operations = self._check.api_client.current_op()
for operation in operations:
self._check.log.debug("Found operation: %s", operation)
yield operation
except NotPrimaryError as e:
# $currentOp cannot be run when the node is neither primary nor secondary (for example,
# while it is in the recovering state), so no operation samples are collected here.
# A warning is logged and the exception details go to the debug log.
self._check.log.warning("Could not collect operation samples, node is not primary or secondary")
self._check.log.debug("Error details: %s", e)

def _should_include_operation(self, operation: dict, databases_monitored: List[str]) -> bool:
# Skip operations from db that are not configured to be monitored
Expand Down
34 changes: 34 additions & 0 deletions mongo/tests/test_dbm_operation_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import json
import os

import mock
import pytest
from pymongo.errors import NotPrimaryError

from . import common
from .common import HERE
Expand Down Expand Up @@ -105,3 +107,35 @@ def test_mongo_operation_samples_arbiter(aggregator, instance_arbiter, check, dd

assert len(dbm_samples) == 0
assert len(dbm_activities) == 0


@mock_now(1715911398.1112723)
@common.shard
def test_mongo_operation_samples_not_primary(
    aggregator, instance_integration_cluster_autodiscovery, check, dd_run_check
):
    """Check that operation samples collection survives a ``NotPrimaryError`` from ``current_op``.

    Two phases are exercised against the same check instance:

    1. ``MongoApi.current_op`` is patched to raise ``NotPrimaryError``; the check run must still
       emit an activity event, and that event must carry no ``mongodb_activity`` entries.
    2. ``replset_state`` is forced to 3 on the check's deployment type; the check run must emit
       no activity event at all.
    """
    instance_integration_cluster_autodiscovery['dbm'] = True
    instance_integration_cluster_autodiscovery['operation_samples'] = {'enabled': True, 'run_sync': True}
    instance_integration_cluster_autodiscovery['slow_operations'] = {'enabled': False}
    instance_integration_cluster_autodiscovery['schemas'] = {'enabled': False}

    def _activity_events():
        # Keep only the events of type 'activity' from the dbm-activity track.
        events = aggregator.get_event_platform_events("dbm-activity")
        return [event for event in events if event['dbm_type'] == 'activity']

    mongo_check = check(instance_integration_cluster_autodiscovery)
    with mock_pymongo("standalone"):
        # PropertyMock makes the attribute lookup of ``current_op`` itself raise.
        with mock.patch(
            'datadog_checks.mongo.api.MongoApi.current_op', new_callable=mock.PropertyMock
        ) as mock_current_op:
            mock_current_op.side_effect = NotPrimaryError("node is recovering")

            # Phase 1: current_op fails -> an activity event exists but holds no operations.
            aggregator.reset()
            run_check_once(mongo_check, dd_run_check)

            activity_samples = _activity_events()
            # A list comprehension is never None, so assert on emptiness instead; this also
            # turns a missing event into an assertion failure rather than an IndexError below.
            assert len(activity_samples) > 0
            assert len(activity_samples[0]['mongodb_activity']) == 0

            # Phase 2: state 3 is the value the sampler logs as "recovering" and skips on.
            aggregator.reset()
            mongo_check.deployment_type.replset_state = 3
            run_check_once(mongo_check, dd_run_check)

            activity_samples = _activity_events()
            assert len(activity_samples) == 0
Loading