From 9dbb2792230ab11f1e4f4a2915d61df2fff6d227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Mon, 27 May 2024 19:41:59 -0300 Subject: [PATCH 01/10] Fix juju_unit and juju_application labels --- lib/charms/nrpe_exporter/v0/nrpe_exporter.py | 29 +++++++++++++++++--- src/charm.py | 2 +- tests/unit/test_relation_monitors.py | 2 +- tox.ini | 2 +- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/lib/charms/nrpe_exporter/v0/nrpe_exporter.py b/lib/charms/nrpe_exporter/v0/nrpe_exporter.py index 7ea3e8b..44e1dda 100644 --- a/lib/charms/nrpe_exporter/v0/nrpe_exporter.py +++ b/lib/charms/nrpe_exporter/v0/nrpe_exporter.py @@ -441,10 +441,14 @@ def _generate_data(self, relation) -> Tuple[list, list]: ) id = re.sub(r"^juju[-_]", "", id) + nagios_host_context = relation.data[unit].get("nagios_host_context", "") + alerts.append(self._generate_alert(relation, cmd, id, unit)) nrpe_endpoints.append( - self._generate_prometheus_job(relation, unit, cmd, exporter_address, id) + self._generate_prometheus_job( + relation, unit, cmd, exporter_address, id, nagios_host_context + ) ) else: logger.debug("No NRPE check is defined.") @@ -485,13 +489,17 @@ def _generate_alert(self, relation, cmd, id, unit) -> dict: }, } - def _generate_prometheus_job(self, relation, unit, cmd, exporter_address, id) -> dict: + def _generate_prometheus_job( + self, relation, unit, cmd, exporter_address, id, nagios_host_context + ) -> dict: """Generate an on-the-fly Prometheus scrape job.""" # IP address could be 'target-address' OR 'target_address' addr = relation.data[unit].get("target-address", "") or relation.data[unit].get( "target_address", "" ) + nagios_host_context = nagios_host_context + "-" if nagios_host_context else "" + return { "app_name": relation.app.name, "target": { @@ -511,8 +519,21 @@ def _generate_prometheus_job(self, relation, unit, cmd, exporter_address, id) -> }, { "target_label": "juju_unit", - # Turn sql-foo-0 or redis_bar_1 into sql-foo/0 or redis-bar/1 - "replacement": re.sub(r"^(.*?)[-_](\d+)$", r"\1/\2", id.replace("_", "-")), + # Turn nagios-host-context-sql-foo-0 into sql-foo/0 + "replacement": re.sub( + r"^(.*?)[-_](\d+)$", + r"\1/\2", + id.replace("_", "-").replace(nagios_host_context, ""), + ), + }, + { + "target_label": "juju_application", + # Turn nagios-host-context-sql-foo-0 into sql-foo + "replacement": re.sub( + r"^(.*?)[-_](\d+)$", + r"\1", + id.replace("_", "-").replace(nagios_host_context, ""), + ), }, ], "updates": { diff --git a/src/charm.py b/src/charm.py index 13619e0..ac4c011 100755 --- a/src/charm.py +++ b/src/charm.py @@ -130,7 +130,7 @@ def __init__(self, *args): self.metrics_aggregator = MetricsEndpointAggregator(self, resolve_addresses=True) self.cos_agent = COSAgentProvider( self, - scrape_configs=self._get_scrape_configs, + scrape_configs=self._get_scrape_configs(), metrics_rules_dir=RULES_DIR, dashboard_dirs=[COS_PROXY_DASHBOARDS_DIR, DASHBOARDS_DIR], refresh_events=[ diff --git a/tests/unit/test_relation_monitors.py b/tests/unit/test_relation_monitors.py index cf1f14a..6a5fd6a 100644 --- a/tests/unit/test_relation_monitors.py +++ b/tests/unit/test_relation_monitors.py @@ -97,7 +97,7 @@ def test_prometheus(self): # THEN alert rules are transferred to prometheus over relation data app_data = self.harness.get_relation_data(rel_id_prom, "cos-proxy") - self.assertIn("alert_rules", app_data) + self.assertIn("alert_rules", app_data) # pyright: ignore # AND status is "active" self.assertIsInstance( diff --git a/tox.ini b/tox.ini index 73829bc..fedc783 
100644 --- a/tox.ini +++ b/tox.ini @@ -44,7 +44,7 @@ deps = codespell commands = codespell {[vars]lib_path} - codespell . --skip .git --skip .tox --skip build --skip lib --skip venv* --skip .mypy_cache + codespell . --skip .git --skip .tox --skip build --skip lib --skip venv* --skip .mypy_cache -L assertIn ruff {[vars]all_path} black --check --diff {[vars]all_path} From 7239298f5cc36b807d9eb7a03f2afdae6e30f640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Mon, 27 May 2024 20:58:51 -0300 Subject: [PATCH 02/10] increase LIBPATCH --- lib/charms/nrpe_exporter/v0/nrpe_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/nrpe_exporter/v0/nrpe_exporter.py b/lib/charms/nrpe_exporter/v0/nrpe_exporter.py index 44e1dda..51d231e 100644 --- a/lib/charms/nrpe_exporter/v0/nrpe_exporter.py +++ b/lib/charms/nrpe_exporter/v0/nrpe_exporter.py @@ -50,7 +50,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 5 +LIBPATCH = 6 logger = logging.getLogger(__name__) From 63f153c659ed16eccdaa0d70d58c804de53b0cae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 28 May 2024 10:44:21 -0300 Subject: [PATCH 03/10] add asserts in unit tests --- tests/unit/test_relation_monitors.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/test_relation_monitors.py b/tests/unit/test_relation_monitors.py index 6a5fd6a..37a28e0 100644 --- a/tests/unit/test_relation_monitors.py +++ b/tests/unit/test_relation_monitors.py @@ -1,3 +1,4 @@ +import json import tempfile import unittest from pathlib import Path @@ -24,6 +25,7 @@ def setUp(self): "private-address": "10.41.168.226", "target-address": "10.41.168.226", "target-id": "ubuntu-0", + "nagios_host_context": "my-nagios-host-context", } for p in [ @@ -97,6 +99,7 @@ def test_prometheus(self): # THEN alert rules are transferred to prometheus over relation data app_data = self.harness.get_relation_data(rel_id_prom, "cos-proxy") + self.assertIn("alert_rules", app_data) # pyright: ignore # AND status is "active" @@ -104,3 +107,10 @@ def test_prometheus(self): self.harness.model.unit.status, ActiveStatus, ) + # AND relabel configs are ok (we are removing nagios_host_context) + scrape_jobs = json.loads(app_data["scrape_jobs"]) + for job in scrape_jobs: + static_configs = job["static_configs"] + for config in static_configs: + self.assertEquals(config["labels"]["juju_application"], "nrpe") + self.assertEquals(config["labels"]["juju_unit"], "nrpe/0") From 08fd7dd6250d973db74f4cf08ed09992ace83c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 28 May 2024 12:08:17 -0300 Subject: [PATCH 04/10] fix unit test --- tests/unit/test_relation_monitors.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_relation_monitors.py b/tests/unit/test_relation_monitors.py index 37a28e0..7d9a3df 100644 --- a/tests/unit/test_relation_monitors.py +++ b/tests/unit/test_relation_monitors.py @@ -110,7 +110,10 @@ def test_prometheus(self): # AND relabel configs are ok (we are removing nagios_host_context) scrape_jobs = json.loads(app_data["scrape_jobs"]) for job in scrape_jobs: - static_configs = job["static_configs"] - for config in static_configs: - self.assertEquals(config["labels"]["juju_application"], "nrpe") - self.assertEquals(config["labels"]["juju_unit"], "nrpe/0") + relabel_configs = job["relabel_configs"] + for config in 
relabel_configs: + if target_level := config.get("target_label"): + if target_level is "juju_application": + self.assertEquals(target_level, "ubuntu") + elif target_level is "juju_unit": + self.assertEquals(target_level, "ubuntu/0") From 93b42a44d05e474f74ed421a4b7819978d1de238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 28 May 2024 12:28:15 -0300 Subject: [PATCH 05/10] linting --- tests/unit/test_relation_monitors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_relation_monitors.py b/tests/unit/test_relation_monitors.py index 7d9a3df..b7a4152 100644 --- a/tests/unit/test_relation_monitors.py +++ b/tests/unit/test_relation_monitors.py @@ -113,7 +113,7 @@ def test_prometheus(self): relabel_configs = job["relabel_configs"] for config in relabel_configs: if target_level := config.get("target_label"): - if target_level is "juju_application": + if target_level == "juju_application": self.assertEquals(target_level, "ubuntu") - elif target_level is "juju_unit": + elif target_level == "juju_unit": self.assertEquals(target_level, "ubuntu/0") From e34e44ad7a85daa6a3aafba18cebf54957bee090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 28 May 2024 12:28:53 -0300 Subject: [PATCH 06/10] update libs --- lib/charms/grafana_agent/v0/cos_agent.py | 109 ++++++++---------- .../prometheus_k8s/v0/prometheus_scrape.py | 21 ++-- 2 files changed, 57 insertions(+), 73 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 5bf9a3d..870ba62 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -211,14 +211,14 @@ def __init__(self, *args): from collections import namedtuple from itertools import chain from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Union +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union import pydantic from cosl import GrafanaDashboard, JujuTopology from cosl.rules import AlertRules from ops.charm import RelationChangedEvent from ops.framework import EventBase, EventSource, Object, ObjectEvents -from ops.model import Relation, Unit +from ops.model import Relation from ops.testing import CharmType if TYPE_CHECKING: @@ -234,7 +234,7 @@ class _MetricsEndpointDict(TypedDict): LIBID = "dc15fa84cef84ce58155fb84f6c6213a" LIBAPI = 0 -LIBPATCH = 7 +LIBPATCH = 8 PYDEPS = ["cosl", "pydantic < 2"] @@ -251,15 +251,16 @@ class _MetricsEndpointDict(TypedDict): class CosAgentProviderUnitData(pydantic.BaseModel): """Unit databag model for `cos-agent` relation.""" - class Config: - arbitrary_types_allowed = True + # The following entries are the same for all units of the same principal. # Note that the same grafana agent subordinate may be related to several apps. # this needs to make its way to the gagent leader metrics_alert_rules: dict log_alert_rules: dict dashboards: List[GrafanaDashboard] - subordinate: Optional[bool] + # subordinate is no longer used but we should keep it until we bump the library to ensure + # we don't break compatibility. + subordinate: Optional[bool] = None # The following entries may vary across units of the same principal app.
# this data does not need to be forwarded to the gagent leader @@ -274,14 +275,13 @@ class Config: class CosAgentPeersUnitData(pydantic.BaseModel): """Unit databag model for `peers` cos-agent machine charm peer relation.""" - class Config: - arbitrary_types_allowed = True + # We need the principal unit name and relation metadata to be able to render identifiers # (e.g. topology) on the leader side, after all the data moves into peer data (the grafana # agent leader can only see its own principal, because it is a subordinate charm). - principal_unit_name: str - principal_relation_id: str - principal_relation_name: str + unit_name: str + relation_id: str + relation_name: str # The only data that is forwarded to the leader is data that needs to go into the app databags # of the outgoing o11y relations. @@ -301,7 +301,7 @@ def app_name(self) -> str: TODO: Switch to using `model_post_init` when pydantic v2 is released? https://github.com/pydantic/pydantic/issues/1729#issuecomment-1300576214 """ - return self.principal_unit_name.split("/")[0] + return self.unit_name.split("/")[0] class COSAgentProvider(Object): @@ -377,7 +377,6 @@ def _on_refresh(self, event): dashboards=self._dashboards, metrics_scrape_jobs=self._scrape_jobs, log_slots=self._log_slots, - subordinate=self._charm.meta.subordinate, ) relation.data[self._charm.unit][data.KEY] = data.json() except ( @@ -470,12 +469,6 @@ class COSAgentRequirerEvents(ObjectEvents): validation_error = EventSource(COSAgentValidationError) -class MultiplePrincipalsError(Exception): - """Custom exception for when there are multiple principal applications.""" - - pass - - class COSAgentRequirer(Object): """Integration endpoint wrapper for the Requirer side of the cos_agent interface.""" @@ -561,13 +554,13 @@ def _on_relation_data_changed(self, event: RelationChangedEvent): if not (provider_data := self._validated_provider_data(raw)): return - # Copy data from the principal relation to the peer relation, so the leader could + # Copy data from the cos_agent relation to the peer relation, so the leader could # follow up. # Save the originating unit name, so it could be used for topology later on by the leader. data = CosAgentPeersUnitData( # peer relation databag model - principal_unit_name=event.unit.name, - principal_relation_id=str(event.relation.id), - principal_relation_name=event.relation.name, + unit_name=event.unit.name, + relation_id=str(event.relation.id), + relation_name=event.relation.name, metrics_alert_rules=provider_data.metrics_alert_rules, log_alert_rules=provider_data.log_alert_rules, dashboards=provider_data.dashboards, @@ -594,39 +587,7 @@ def trigger_refresh(self, _): self.on.data_changed.emit() # pyright: ignore @property - def _principal_unit(self) -> Optional[Unit]: - """Return the principal unit for a relation. - - Assumes that the relation is of type subordinate. - Relies on the fact that, for subordinate relations, the only remote unit visible to - *this unit* is the principal unit that this unit is attached to. 
- """ - if relations := self._principal_relations: - # Technically it's a list, but for subordinates there can only be one relation - principal_relation = next(iter(relations)) - if units := principal_relation.units: - # Technically it's a list, but for subordinates there can only be one - return next(iter(units)) - - return None - - @property - def _principal_relations(self): - relations = [] - for relation in self._charm.model.relations[self._relation_name]: - if not json.loads(relation.data[next(iter(relation.units))]["config"]).get( - ["subordinate"], False - ): - relations.append(relation) - if len(relations) > 1: - logger.error( - "Multiple applications claiming to be principal. Update the cos-agent library in the client application charms." - ) - raise MultiplePrincipalsError("Multiple principal applications.") - return relations - - @property - def _remote_data(self) -> List[CosAgentProviderUnitData]: + def _remote_data(self) -> List[Tuple[CosAgentProviderUnitData, JujuTopology]]: """Return a list of remote data from each of the related units. Assumes that the relation is of type subordinate. @@ -643,7 +604,15 @@ def _remote_data(self) -> List[CosAgentProviderUnitData]: continue if not (provider_data := self._validated_provider_data(raw)): continue - all_data.append(provider_data) + + topology = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=unit.app.name, + unit=unit.name, + ) + + all_data.append((provider_data, topology)) return all_data @@ -713,7 +682,7 @@ def metrics_alerts(self) -> Dict[str, Any]: def metrics_jobs(self) -> List[Dict]: """Parse the relation data contents and extract the metrics jobs.""" scrape_jobs = [] - for data in self._remote_data: + for data, topology in self._remote_data: for job in data.metrics_scrape_jobs: # In #220, relation schema changed from a simplified dict to the standard # `scrape_configs`. @@ -729,6 +698,22 @@ def metrics_jobs(self) -> List[Dict]: "tls_config": {"insecure_skip_verify": True}, } + # Apply labels to the scrape jobs + for static_config in job.get("static_configs", []): + topo_as_dict = topology.as_dict(excluded_keys=["charm_name"]) + static_config["labels"] = { + # Be sure to keep labels from static_config + **static_config.get("labels", {}), + # TODO: We should add a new method in juju_topology.py + # that like `as_dict` method, returns the keys with juju_ prefix + # https://github.com/canonical/cos-lib/issues/18 + **{ + "juju_{}".format(key): value + for key, value in topo_as_dict.items() + if value + }, + } + scrape_jobs.append(job) return scrape_jobs @@ -737,7 +722,7 @@ def metrics_jobs(self) -> List[Dict]: def snap_log_endpoints(self) -> List[SnapEndpoint]: """Fetch logging endpoints exposed by related snaps.""" plugs = [] - for data in self._remote_data: + for data, _ in self._remote_data: targets = data.log_slots if targets: for target in targets: @@ -777,7 +762,7 @@ def logs_alerts(self) -> Dict[str, Any]: model=self._charm.model.name, model_uuid=self._charm.model.uuid, application=app_name, - # For the topology unit, we could use `data.principal_unit_name`, but that unit + # For the topology unit, we could use `data.unit_name`, but that unit # name may not be very stable: `_gather_peer_data` de-duplicates by app name so # the exact unit name that turns up first in the iterator may vary from time to # time. So using the grafana-agent unit name instead. 
@@ -810,9 +795,9 @@ def dashboards(self) -> List[Dict[str, str]]: dashboards.append( { - "relation_id": data.principal_relation_id, + "relation_id": data.relation_id, # We have the remote charm name - use it for the identifier - "charm": f"{data.principal_relation_name}-{app_name}", + "charm": f"{data.relation_name}-{app_name}", "content": content, "title": title, } diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 665af88..e3d35c6 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -178,7 +178,7 @@ def __init__(self, *args): - `scrape_timeout` - `proxy_url` - `relabel_configs` -- `metrics_relabel_configs` +- `metric_relabel_configs` - `sample_limit` - `label_limit` - `label_name_length_limit` @@ -362,7 +362,7 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 44 +LIBPATCH = 47 PYDEPS = ["cosl"] @@ -377,7 +377,7 @@ def _on_scrape_targets_changed(self, event): "scrape_timeout", "proxy_url", "relabel_configs", - "metrics_relabel_configs", + "metric_relabel_configs", "sample_limit", "label_limit", "label_name_length_limit", @@ -521,8 +521,8 @@ def expand_wildcard_targets_into_individual_jobs( # for such a target. Therefore labeling with Juju topology, excluding the # unit name. non_wildcard_static_config["labels"] = { - **non_wildcard_static_config.get("labels", {}), **topology.label_matcher_dict, + **non_wildcard_static_config.get("labels", {}), } non_wildcard_static_configs.append(non_wildcard_static_config) @@ -547,9 +547,9 @@ def expand_wildcard_targets_into_individual_jobs( if topology: # Add topology labels modified_static_config["labels"] = { - **modified_static_config.get("labels", {}), **topology.label_matcher_dict, **{"juju_unit": unit_name}, + **modified_static_config.get("labels", {}), } # Instance relabeling for topology should be last in order. @@ -1537,12 +1537,11 @@ def set_scrape_job_spec(self, _=None): relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) - if alert_rules_as_dict: - # Update relation data with the string representation of the rule file. - # Juju topology is already included in the "scrape_metadata" field above. - # The consumer side of the relation uses this information to name the rules file - # that is written to the filesystem. - relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + # Update relation data with the string representation of the rule file. + # Juju topology is already included in the "scrape_metadata" field above. + # The consumer side of the relation uses this information to name the rules file + # that is written to the filesystem. + relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) def _set_unit_ip(self, _=None): """Set unit host address. 
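Note (not part of the patch series): a minimal sketch of why PATCH 06/10 flips the label merge order in the static_config handling of prometheus_scrape.py above. In a Python dict literal, later ** spreads override earlier ones, so placing the static_config labels after the topology labels lets labels already carried by the scrape job (for example a juju_unit derived from an NRPE target-id) take precedence over the topology defaults. The label values below are illustrative assumptions, not data taken from the patch.

# Illustrative only: topology defaults vs. labels carried in the scrape job's static_config.
topology_labels = {"juju_application": "cos-proxy", "juju_unit": "cos-proxy/0"}
static_config_labels = {"juju_application": "nrpe", "juju_unit": "nrpe/0"}

# Later ** spreads win, so the static_config labels override the topology defaults.
merged = {**topology_labels, **static_config_labels}
assert merged == {"juju_application": "nrpe", "juju_unit": "nrpe/0"}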
From f9163f1dd8f3881f7be6ec46660e7b4cf95ba5ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Tue, 28 May 2024 13:17:09 -0300 Subject: [PATCH 07/10] fix unit tests --- tests/unit/test_relation_monitors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_relation_monitors.py b/tests/unit/test_relation_monitors.py index b7a4152..47334f1 100644 --- a/tests/unit/test_relation_monitors.py +++ b/tests/unit/test_relation_monitors.py @@ -114,6 +114,6 @@ def test_prometheus(self): for config in relabel_configs: if target_level := config.get("target_label"): if target_level == "juju_application": - self.assertEquals(target_level, "ubuntu") + self.assertEquals(config["replacement"], "ubuntu") elif target_level == "juju_unit": - self.assertEquals(target_level, "ubuntu/0") + self.assertEquals(config["replacement"], "ubuntu/0") From 4cac17dee31248b9ca41f154cf5b1f3d4d39c6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Wed, 29 May 2024 12:09:42 -0300 Subject: [PATCH 08/10] downgrade cos_agent.py until we resolve pydantic versions --- lib/charms/grafana_agent/v0/cos_agent.py | 109 +++++++++++++---------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 870ba62..5bf9a3d 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -211,14 +211,14 @@ def __init__(self, *args): from collections import namedtuple from itertools import chain from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Union import pydantic from cosl import GrafanaDashboard, JujuTopology from cosl.rules import AlertRules from ops.charm import RelationChangedEvent from ops.framework import EventBase, EventSource, Object, ObjectEvents -from ops.model import Relation +from ops.model import Relation, Unit from ops.testing import CharmType if TYPE_CHECKING: @@ -234,7 +234,7 @@ class _MetricsEndpointDict(TypedDict): LIBID = "dc15fa84cef84ce58155fb84f6c6213a" LIBAPI = 0 -LIBPATCH = 8 +LIBPATCH = 7 PYDEPS = ["cosl", "pydantic < 2"] @@ -251,16 +251,15 @@ class _MetricsEndpointDict(TypedDict): class CosAgentProviderUnitData(pydantic.BaseModel): """Unit databag model for `cos-agent` relation.""" - + class Config: + arbitrary_types_allowed = True # The following entries are the same for all units of the same principal. # Note that the same grafana agent subordinate may be related to several apps. # this needs to make its way to the gagent leader metrics_alert_rules: dict log_alert_rules: dict dashboards: List[GrafanaDashboard] - # subordinate is no longer used but we should keep it until we bump the library to ensure - # we don't break compatibility. - subordinate: Optional[bool] = None + subordinate: Optional[bool] # The following entries may vary across units of the same principal app. # this data does not need to be forwarded to the gagent leader @@ -275,13 +274,14 @@ class CosAgentProviderUnitData(pydantic.BaseModel): class CosAgentPeersUnitData(pydantic.BaseModel): """Unit databag model for `peers` cos-agent machine charm peer relation.""" - + class Config: + arbitrary_types_allowed = True # We need the principal unit name and relation metadata to be able to render identifiers # (e.g.
topology) on the leader side, after all the data moves into peer data (the grafana # agent leader can only see its own principal, because it is a subordinate charm). - unit_name: str - relation_id: str - relation_name: str + principal_unit_name: str + principal_relation_id: str + principal_relation_name: str # The only data that is forwarded to the leader is data that needs to go into the app databags # of the outgoing o11y relations. @@ -301,7 +301,7 @@ def app_name(self) -> str: TODO: Switch to using `model_post_init` when pydantic v2 is released? https://github.com/pydantic/pydantic/issues/1729#issuecomment-1300576214 """ - return self.unit_name.split("/")[0] + return self.principal_unit_name.split("/")[0] class COSAgentProvider(Object): @@ -377,6 +377,7 @@ def _on_refresh(self, event): dashboards=self._dashboards, metrics_scrape_jobs=self._scrape_jobs, log_slots=self._log_slots, + subordinate=self._charm.meta.subordinate, ) relation.data[self._charm.unit][data.KEY] = data.json() except ( @@ -469,6 +470,12 @@ class COSAgentRequirerEvents(ObjectEvents): validation_error = EventSource(COSAgentValidationError) +class MultiplePrincipalsError(Exception): + """Custom exception for when there are multiple principal applications.""" + + pass + + class COSAgentRequirer(Object): """Integration endpoint wrapper for the Requirer side of the cos_agent interface.""" @@ -554,13 +561,13 @@ def _on_relation_data_changed(self, event: RelationChangedEvent): if not (provider_data := self._validated_provider_data(raw)): return - # Copy data from the cos_agent relation to the peer relation, so the leader could + # Copy data from the principal relation to the peer relation, so the leader could # follow up. # Save the originating unit name, so it could be used for topology later on by the leader. data = CosAgentPeersUnitData( # peer relation databag model - unit_name=event.unit.name, - relation_id=str(event.relation.id), - relation_name=event.relation.name, + principal_unit_name=event.unit.name, + principal_relation_id=str(event.relation.id), + principal_relation_name=event.relation.name, metrics_alert_rules=provider_data.metrics_alert_rules, log_alert_rules=provider_data.log_alert_rules, dashboards=provider_data.dashboards, @@ -587,7 +594,39 @@ def trigger_refresh(self, _): self.on.data_changed.emit() # pyright: ignore @property - def _remote_data(self) -> List[Tuple[CosAgentProviderUnitData, JujuTopology]]: + def _principal_unit(self) -> Optional[Unit]: + """Return the principal unit for a relation. + + Assumes that the relation is of type subordinate. + Relies on the fact that, for subordinate relations, the only remote unit visible to + *this unit* is the principal unit that this unit is attached to. + """ + if relations := self._principal_relations: + # Technically it's a list, but for subordinates there can only be one relation + principal_relation = next(iter(relations)) + if units := principal_relation.units: + # Technically it's a list, but for subordinates there can only be one + return next(iter(units)) + + return None + + @property + def _principal_relations(self): + relations = [] + for relation in self._charm.model.relations[self._relation_name]: + if not json.loads(relation.data[next(iter(relation.units))]["config"]).get( + ["subordinate"], False + ): + relations.append(relation) + if len(relations) > 1: + logger.error( + "Multiple applications claiming to be principal. Update the cos-agent library in the client application charms." 
+ ) + raise MultiplePrincipalsError("Multiple principal applications.") + return relations + + @property + def _remote_data(self) -> List[CosAgentProviderUnitData]: """Return a list of remote data from each of the related units. Assumes that the relation is of type subordinate. @@ -604,15 +643,7 @@ def _remote_data(self) -> List[Tuple[CosAgentProviderUnitData, JujuTopology]]: continue if not (provider_data := self._validated_provider_data(raw)): continue - - topology = JujuTopology( - model=self._charm.model.name, - model_uuid=self._charm.model.uuid, - application=unit.app.name, - unit=unit.name, - ) - - all_data.append((provider_data, topology)) + all_data.append(provider_data) return all_data @@ -682,7 +713,7 @@ def metrics_alerts(self) -> Dict[str, Any]: def metrics_jobs(self) -> List[Dict]: """Parse the relation data contents and extract the metrics jobs.""" scrape_jobs = [] - for data, topology in self._remote_data: + for data in self._remote_data: for job in data.metrics_scrape_jobs: # In #220, relation schema changed from a simplified dict to the standard # `scrape_configs`. @@ -698,22 +729,6 @@ def metrics_jobs(self) -> List[Dict]: "tls_config": {"insecure_skip_verify": True}, } - # Apply labels to the scrape jobs - for static_config in job.get("static_configs", []): - topo_as_dict = topology.as_dict(excluded_keys=["charm_name"]) - static_config["labels"] = { - # Be sure to keep labels from static_config - **static_config.get("labels", {}), - # TODO: We should add a new method in juju_topology.py - # that like `as_dict` method, returns the keys with juju_ prefix - # https://github.com/canonical/cos-lib/issues/18 - **{ - "juju_{}".format(key): value - for key, value in topo_as_dict.items() - if value - }, - } - scrape_jobs.append(job) return scrape_jobs @@ -722,7 +737,7 @@ def metrics_jobs(self) -> List[Dict]: def snap_log_endpoints(self) -> List[SnapEndpoint]: """Fetch logging endpoints exposed by related snaps.""" plugs = [] - for data, _ in self._remote_data: + for data in self._remote_data: targets = data.log_slots if targets: for target in targets: @@ -762,7 +777,7 @@ def logs_alerts(self) -> Dict[str, Any]: model=self._charm.model.name, model_uuid=self._charm.model.uuid, application=app_name, - # For the topology unit, we could use `data.unit_name`, but that unit + # For the topology unit, we could use `data.principal_unit_name`, but that unit # name may not be very stable: `_gather_peer_data` de-duplicates by app name so # the exact unit name that turns up first in the iterator may vary from time to # time. So using the grafana-agent unit name instead. 
@@ -795,9 +810,9 @@ def dashboards(self) -> List[Dict[str, str]]: dashboards.append( { - "relation_id": data.relation_id, + "relation_id": data.principal_relation_id, # We have the remote charm name - use it for the identifier - "charm": f"{data.relation_name}-{app_name}", + "charm": f"{data.principal_relation_name}-{app_name}", "content": content, "title": title, } From 979b3667dae44fb9cf0bbc43206ebd5ae2a8996d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Wed, 29 May 2024 12:48:39 -0300 Subject: [PATCH 09/10] tidy up codespell --- pyproject.toml | 4 ++++ tox.ini | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8d112b7..4b0f882 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,3 +43,7 @@ pythonPlatfrom = "All" [tool.pytest.ini_options] minversion = "6.0" log_cli_level = "INFO" + +[tool.codespell] +skip = ".git,.tox,build,lib,venv*,.mypy_cache" +ignore-words-list = "assertIn" diff --git a/tox.ini b/tox.ini index 829ef4b..bfb2358 100644 --- a/tox.ini +++ b/tox.ini @@ -44,7 +44,7 @@ deps = codespell<2.3.0 # https://github.com/codespell-project/codespell/issues/3430 commands = codespell {[vars]lib_path} - codespell . --skip .git --skip .tox --skip build --skip lib --skip venv* --skip .mypy_cache -L assertIn + codespell . ruff {[vars]all_path} black --check --diff {[vars]all_path} From 0d9f2954420545aaa978f340dbf293febc1dd1f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jose=20C=2E=20Mass=C3=B3n?= Date: Wed, 29 May 2024 14:36:02 -0300 Subject: [PATCH 10/10] comment added explaining why we need nagios_host_context --- lib/charms/nrpe_exporter/v0/nrpe_exporter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/charms/nrpe_exporter/v0/nrpe_exporter.py b/lib/charms/nrpe_exporter/v0/nrpe_exporter.py index 0f80a6a..e36e5fb 100644 --- a/lib/charms/nrpe_exporter/v0/nrpe_exporter.py +++ b/lib/charms/nrpe_exporter/v0/nrpe_exporter.py @@ -498,6 +498,8 @@ def _generate_prometheus_job( "target_address", "" ) + # "nagios_host_context" is needed so that we can strip it from the "id" parameter (target-id) + # and correctly relabel juju_application and juju_unit. nagios_host_context = nagios_host_context + "-" if nagios_host_context else "" return {
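Note (not part of the patch series): a minimal sketch of how the relabel "replacement" values built by _generate_prometheus_job turn an NRPE target-id back into juju_unit and juju_application once the Nagios host context is stripped, mirroring the regular expressions introduced in PATCH 01/10. The host context and target-id values below are illustrative assumptions, and the earlier "juju[-_]" prefix stripping is omitted for brevity.

import re

nagios_host_context = "my-nagios-host-context"
target_id = "my-nagios-host-context-ubuntu-0"  # the "id" passed into _generate_prometheus_job

# The context gets a trailing dash so it can be removed as a prefix of the id.
prefix = nagios_host_context + "-" if nagios_host_context else ""
stripped = target_id.replace("_", "-").replace(prefix, "")

# Turn "ubuntu-0" into "ubuntu/0" (juju_unit) and "ubuntu" (juju_application).
juju_unit = re.sub(r"^(.*?)[-_](\d+)$", r"\1/\2", stripped)
juju_application = re.sub(r"^(.*?)[-_](\d+)$", r"\1", stripped)

assert juju_unit == "ubuntu/0"
assert juju_application == "ubuntu"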