-
Notifications
You must be signed in to change notification settings - Fork 4
/
cos_integration.py
528 lines (429 loc) · 19.1 KB
/
cos_integration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
"""Utilities for testing COS integration with charms."""
import logging
from pathlib import Path
from typing import Any, Dict, Set
from urllib.parse import urlparse
import yaml
from juju.action import Action
from juju.application import Application
from juju.model import Model
from juju.relation import Relation
from juju.unit import Unit
# Module-level logger shared by the helpers in this file.
log = logging.getLogger(__name__)

# Application name used when deploying the Grafana agent, and the agent-side endpoint
# names used when relating it to the application under test.
GRAFANA_AGENT_APP = "grafana-agent-k8s"
GRAFANA_AGENT_METRICS_ENDPOINT = "metrics-endpoint"
GRAFANA_AGENT_GRAFANA_DASHBOARD = "grafana-dashboards-consumer"
GRAFANA_AGENT_LOGGING_PROVIDER = "logging-provider"

# Note(rgildein): Grafana-agent-k8s does not currently configure this and it comes as a
# default value from upstream.
# Upstream documentation https://grafana.com/docs/agent/latest/static/configuration/flags/#server
# Related bug https://github.com/canonical/grafana-agent-k8s-operator/issues/308
GRAFANA_AGENT_API = "localhost:12345/agent/api/v1"
# Targets endpoint that is queried with curl from the Grafana agent unit.
GRAFANA_AGENT_API_TARGETS = f"{GRAFANA_AGENT_API}/metrics/targets"

# Endpoint names on the side of the application under test.
APP_METRICS_ENDPOINT = "metrics-endpoint"
APP_GRAFANA_DASHBOARD = "grafana-dashboard"
APP_LOGGING = "logging"

# Note(rgildein): We use an idle_period of 60 so we can be sure that the targets have already
# been scraped.
WAIT_IDLE_PERIOD = 60
# Timeout, in seconds, passed to `model.wait_for_idle` after deploying the Grafana agent.
WAIT_TIMEOUT = 5 * 60
# Note(rgildein, dnplas): The grafana agent charm will go to BlockedStatus if it is not
# related to any consumer (e.g. prometheus-k8s, grafana-k8s).
WAIT_STATUS = "blocked"

# Default directories (relative to the current working directory) that are scanned for
# alert rule files and Grafana dashboard templates.
ALERT_RULES_DIRECTORY = Path("./src/prometheus_alert_rules")
GRAFANA_DASHBOARDS_DIRECTORY = Path("./src/grafana_dashboards")

# Valid values of the `side` argument used to select one side of a relation.
PROVIDES = "provides"
REQUIRES = "requires"
async def deploy_and_assert_grafana_agent(
    model: Model,
    app: str,
    channel: str = "latest/stable",
    metrics: bool = False,
    logging: bool = False,
    dashboard: bool = False,
    idle_period: int = WAIT_IDLE_PERIOD,
) -> None:
    """Deploy grafana-agent-k8s and relate it with the given application.

    The Grafana agent is deployed from the requested channel, the requested relations are
    created (dashboard first, then metrics, then logging) and finally the function waits
    until the Grafana agent application settles in the expected status.

    Args:
        model (juju.model.Model): Juju model object.
        app (str): Name of the application the Grafana agent should be related with.
        channel (str): Channel name for grafana-agent-k8s. Defaults to latest/stable.
        metrics (bool): Whether the <app>:metrics-endpoint
            grafana-agent-k8s:metrics-endpoint relation is created. Defaults to False.
        logging (bool): Whether the <app>:logging
            grafana-agent-k8s:logging-provider relation is created. Defaults to False.
        dashboard (bool): Whether the <app>:grafana-dashboard
            grafana-agent-k8s:grafana-dashboards-consumer relation is created.
            Defaults to False.
        idle_period (int): How long, in seconds, the agent statuses of all Grafana agent
            units need to be `idle`.

    Raises:
        AssertionError: If `app` is not deployed in `model`.
    """
    assert app in model.applications, f"application {app} was not found in model {model.name}"

    log.info("deploying %s from %s channel", GRAFANA_AGENT_APP, channel)
    await model.deploy(GRAFANA_AGENT_APP, channel=channel)

    # (flag, endpoint of the app, endpoint of the agent), in the order relations are added
    requested_relations = (
        (dashboard, APP_GRAFANA_DASHBOARD, GRAFANA_AGENT_GRAFANA_DASHBOARD),
        (metrics, APP_METRICS_ENDPOINT, GRAFANA_AGENT_METRICS_ENDPOINT),
        (logging, APP_LOGGING, GRAFANA_AGENT_LOGGING_PROVIDER),
    )
    for enabled, app_endpoint, agent_endpoint in requested_relations:
        # only the literal True enables a relation, truthy values do not
        if enabled is not True:
            continue

        log.info(
            "Adding relation: %s:%s and %s:%s",
            app,
            app_endpoint,
            GRAFANA_AGENT_APP,
            agent_endpoint,
        )
        await model.integrate(
            f"{app}:{app_endpoint}",
            f"{GRAFANA_AGENT_APP}:{agent_endpoint}",
        )

    await model.wait_for_idle(
        apps=[GRAFANA_AGENT_APP],
        status=WAIT_STATUS,
        timeout=WAIT_TIMEOUT,
        idle_period=idle_period,
    )
def _check_url(url: str, port: int, path: str) -> bool:
"""Return False if port and path are not defined in url, True otherwise.
Check that the expected port and path are in the url after parsing it.
"""
output = urlparse(url)
return output.port == port and output.path == path
async def _get_targets_from_grafana_agent(
    app: Application, port: int, path: str
) -> Dict[str, Any]:
    """Return a dict with data if the charm is listed in the targets; otherwise an empty dict.

    This method makes a request to the grafana-agent-k8s targets endpoint to retrieve the state
    and data of the application under test and returns this data as a dictionary. Targets that
    do not carry the `juju_application` label are ignored.

    Example of Grafana agent API output:
    $ curl localhost:12345/agent/api/v1/metrics/targets
    {
      "status": "success",
      "data": [
        {
          "target_group": "integrations/agent",
          ...
        },
        {
          "target_group": "juju_kubeflow_34eea852_dex-auth_prometheus_scrape-0",
          "endpoint": "http://10.1.23.239:5558/metrics",
          "state": "up",
          "labels": {
            "instance": "kubeflow_c8c8_dex-auth_dex-auth/0",
            "job": "juju_kubeflow_34eea852_dex-auth_prometheus_scrape-0",
            "juju_application": "dex-auth",
            "juju_charm": "dex-auth",
            "juju_model": "kubeflow",
            "juju_model_uuid": "c8c8",
            "juju_unit": "dex-auth/0"
          },
          "discovered_labels": {
            "__address__": "10.1.23.239:5558",
            "__metrics_path__": "/metrics",
            "__scheme__": "http",
            "__scrape_interval__": "1m",
            "__scrape_timeout__": "10s",
            "job": "juju_kubeflow_34eea852_dex-auth_prometheus_scrape-0",
            "juju_application": "dex-auth",
            "juju_charm": "dex-auth",
            "juju_model": "kubeflow",
            "juju_model_uuid": "c8c8",
            "juju_unit": "dex-auth/0"
          },
          "last_scrape": "2024-06-28T12:04:58.60872202Z",
          "scrape_duration_ms": 1,
          "scrape_error": ""
        }
      ]
    }

    Args:
        app (Application): Juju Application object of the application under test.
        port (int): Port expected in the target endpoint.
        path (str): Path expected in the target endpoint.

    Returns:
        dict: Data of the first target that belongs to `app` and whose endpoint has the
            expected port and path; an empty dict if there is no such target.
    """
    cmd = f"curl -m 5 -sS {GRAFANA_AGENT_API_TARGETS}"
    grafana_agent_unit = app.model.applications[GRAFANA_AGENT_APP].units[0]
    log.debug("testing metrics endpoint with cmd: `%s`", cmd)
    output = await _run_on_unit(grafana_agent_unit, cmd)
    # Note: the API returns JSON, which the YAML parser reads as well
    targets = yaml.safe_load(output.results["stdout"])
    log.debug("metrics targets definened at %s:\n%s", grafana_agent_unit.name, targets)

    # an empty response or a null "data" is treated as "no targets"
    for data in (targets or {}).get("data") or []:
        # not every target has Juju topology labels, so a plain lookup could raise KeyError
        labels = data.get("labels") or {}
        if labels.get("juju_application") != app.name:
            continue

        # log only the matching target instead of the whole payload once more
        log.debug(
            "metrics targets definened at %s for %s:\n%s",
            grafana_agent_unit.name,
            app.name,
            data,
        )
        if _check_url(data["endpoint"], port, path):
            return data

    log.warning("no target data found for %s and %s:%s", app.name, port, path)
    return {}
async def _get_charm_name(app: Application) -> str:
    """Read the charm name of the application from its metadata.yaml.

    The name is taken from the metadata file on the first unit, because locally built
    charms report `<charm_name>-<revision>` in `app.charm_name`,
    e.g. seldon-controller-manager-0.

    Args:
        app (Application): Juju Application object.

    Returns:
        str: Value of the `name` key of the charm metadata.
    """
    action = await _run_on_unit(app.units[0], "cat metadata.yaml")
    metadata = yaml.safe_load(action.results["stdout"])
    return metadata["name"]
async def _get_relation(app: Application, endpoint_name: str) -> Relation:
    """Find the single relation of the application that uses the given endpoint.

    Args:
        app (Application): Juju Application object.
        endpoint_name (str): Name of the endpoint to look for.

    Returns:
        Relation: The only relation having an endpoint named `endpoint_name`.

    Raises:
        AssertionError: If there is no such relation or more than one.
    """
    matching = []
    for candidate in app.relations:
        if any(endpoint.name == endpoint_name for endpoint in candidate.endpoints):
            matching.append(candidate)

    log.info("found relations %s for %s:%s", matching, app.name, endpoint_name)
    assert len(matching) != 0, f"{endpoint_name} is missing"
    assert len(matching) <= 1, f"too many relations with {endpoint_name} endpoint"
    return matching[0]
def _get_app_from_relation(relation: Relation, side: str) -> Application:
    """Return the application sitting on the requested side of the relation.

    Args:
        relation (Relation): Juju Relation object.
        side (str): Either PROVIDES or REQUIRES.

    Returns:
        Application: Application of the selected relation side.

    Raises:
        ValueError: If `side` is neither PROVIDES nor REQUIRES.
    """
    if side not in (PROVIDES, REQUIRES):
        raise ValueError(f"{side} is invalid side of relation.")

    endpoint = relation.provides if side == PROVIDES else relation.requires
    return endpoint.application
async def _get_app_relation_data(
    app: Application, endpoint_name: str, side: str
) -> Dict[str, Any]:
    """Fetch the application data bag of the relation that uses the given endpoint.

    Args:
        app (Application): Juju Application object.
        endpoint_name (str): Name of the endpoint identifying the relation.
        side (str): Relation side (PROVIDES or REQUIRES) whose application data is read.

    Returns:
        dict: Parsed YAML output of `relation-get --app` for the selected application.

    Raises:
        AssertionError: If the application on the selected side has no units.
    """
    relation = await _get_relation(app, endpoint_name)

    # Note(rgildein): The application is taken from the requested side of the relation, so
    # the data can be read from that side. This way the tested application can be used
    # instead of grafana-agent-k8s to get relation data from the provides side of a relation.
    # For example, a logging endpoint is defined at the provides side.
    side_app = _get_app_from_relation(relation, side)

    # Note(rgildein): any unit can serve application data, so the first one is used
    assert len(side_app.units) > 0, f"application {side_app.name} has no units"
    first_unit = side_app.units[0]

    action = await _run_on_unit(
        first_unit,
        f"relation-get --format=yaml -r {relation.entity_id} --app - {side_app.name}",
    )
    return yaml.safe_load(action.results["stdout"])
async def _get_unit_relation_data(
    app: Application, endpoint_name: str, side: str
) -> Dict[str, Dict[str, Any]]:
    """Fetch the unit data bags of the relation that uses the given endpoint.

    Args:
        app (Application): Juju Application object.
        endpoint_name (str): Name of the endpoint identifying the relation.
        side (str): Relation side (PROVIDES or REQUIRES) whose units are queried.

    Returns:
        dict: Mapping of unit name to the parsed YAML output of `relation-get` for that unit.
    """
    relation = await _get_relation(app, endpoint_name)

    # Note(rgildein): The application is taken from the requested side of the relation, so
    # the data can be read from that side. This way the tested application can be used
    # instead of grafana-agent-k8s to get relation data from the provides side of a relation.
    # For example, a logging endpoint is defined at the provides side.
    side_app = _get_app_from_relation(relation, side)

    units_data: Dict[str, Dict[str, Any]] = {}
    for member in side_app.units:
        action = await _run_on_unit(
            member, f"relation-get --format=yaml -r {relation.entity_id} - {member.name}"
        )
        units_data[member.name] = yaml.safe_load(action.results["stdout"])

    return units_data
def _get_alert_rules(data: str) -> Set[str]:
    """Get all alert rules from string, e.g. file content or relation data.

    Only alerting rules are collected; recording rules (entries with `record` instead of
    `alert`) inside groups are skipped. Empty content yields an empty set.

    Example of relations data of metrics-endpoint would be:
    ```Python
    'alert_rules': '{"groups": [{"rules": [{"alert": "my-alert", ...
    ```

    Example of rule file with single alert rule:
    ```yaml
    alert: my-alert
    expr: up < 1
    for: 5m
    ...
    ```

    Example of rule file with multiple alert rules:
    ```yaml
    groups:
    - name: my-group
      rules:
      - alert: my-alert
        ...
    ```

    Args:
        data (str): YAML (or JSON) document with alert rules.

    Returns:
        set[str]: Names of all alert rules found in `data`.

    Raises:
        KeyError: If a single-rule document has no `alert` key.
    """
    alert_rules = yaml.safe_load(data)
    if not alert_rules:
        # an empty document is parsed as None, so there is nothing to collect
        return set()

    if "groups" in alert_rules:
        return {
            rule["alert"]
            for group in alert_rules["groups"] or []
            for rule in group.get("rules") or []
            if "alert" in rule  # groups may also contain recording rules
        }

    return {alert_rules["alert"]}
def _get_dashboard_template(data: str) -> Dict[str, Dict[str, Any]]:
    """Get all templates from relation data, where it's defined as string.

    This function parses the templates defined as a YAML string and returns only the relevant
    part of them: the file name as key and a dictionary with the charm and the Juju topology
    as value.

    Example of relations data of grafana-dashboard would be:
    ```Python
    'dashboards': '{"templates": {"file:jupyter-notebook-controller.json": {"content": ...
    ```

    Args:
        data (str): Value of the `dashboards` key of the relation data.

    Returns:
        dict: Mapping of file name to {"charm": ..., "juju_topology": ...}; empty if `data`
            holds no templates.
    """
    prefix = "file:"  # template key is defined as 'file:<file_name>'
    parsed = yaml.safe_load(data) or {}
    templates_raw = parsed.get("templates") or {}

    templates = {}
    for key, value in templates_raw.items():
        # strip only the leading marker, a file name may itself contain "file:"
        file_name = key[len(prefix):] if key.startswith(prefix) else key
        templates[file_name] = {"charm": value["charm"], "juju_topology": value["juju_topology"]}

    return templates
def _get_metrics_endpoint(data: str) -> Set[str]:
    """Get set of metrics endpoints from string.

    This function is expecting data defined as string.
    ```json
    [
        {
            "metrics_path": "/metrics",
            "static_configs": [
                {
                    "targets": [
                        "*:5000",
                        "*:8000"
                    ]
                }
            ]
        }
    ]
    ```

    A job without `metrics_path` uses the Prometheus default "/metrics". Jobs without
    `static_configs` contribute no endpoints and empty data yields an empty set.

    Args:
        data (str): Scrape jobs serialized as JSON/YAML.

    Returns:
        set[str]: Endpoints in the form `<target><metrics_path>`, e.g. "*:5000/metrics".
    """
    metrics_endpoints = set()
    scrape_jobs = yaml.safe_load(data) or []
    for job in scrape_jobs:
        # metrics_path is optional in a scrape job and defaults to /metrics
        path = job.get("metrics_path", "/metrics")
        metrics_endpoints |= {
            f"{target}{path}"
            for config in job.get("static_configs") or []
            for target in config.get("targets") or []
        }

    return metrics_endpoints
async def _run_on_unit(unit: Unit, cmd: str) -> Action:
    """Execute a command on the unit and return the finished action.

    Args:
        unit (Unit): Juju Unit object to run the command on.
        cmd (str): Command to run.

    Returns:
        Action: Completed action; its `results` hold the command output.

    Raises:
        AssertionError: If the command return code is not 0.
    """
    log.info("running cmd `%s` on unit %s", cmd, unit.name)
    # Note(rgildein): block=True makes the call wait for the results
    action = await unit.run(cmd, block=True)

    return_code = action.results["return-code"]
    assert return_code == 0, f"cmd `{cmd}` failed with error `{action.results.get('stderr')}`"
    return action
def get_alert_rules(path: Path = ALERT_RULES_DIRECTORY) -> Set[str]:
    """Collect the names of all alert rules defined in rule files of a directory.

    Files matching `*.rule` and `*.rules` directly inside `path` are read.

    Args:
        path (Path): Path of alert rules directory. Defaults to "./src/prometheus_alert_rules".

    Returns:
        set[str]: Set with all alert rules.
    """
    collected = set()
    for pattern in ("*.rule", "*.rules"):
        for rule_file in path.glob(pattern):
            content = rule_file.read_text()
            collected = collected | _get_alert_rules(content)

    return collected
def get_grafana_dashboards(path: Path = GRAFANA_DASHBOARDS_DIRECTORY) -> Set[str]:
    """Collect the names of all Grafana dashboard templates of a directory.

    Files matching `*.json.tmpl` directly inside `path` are listed and ".tmpl" is removed
    from their names.

    Args:
        path (Path): Path of Grafana dashboards directory. Defaults to
            "./src/grafana_dashboards".

    Returns:
        set[str]: Set with all Grafana dashboards.
    """
    dashboards = set()
    for template in path.glob("*.json.tmpl"):
        dashboards.add(template.name.replace(".tmpl", ""))

    return dashboards
async def assert_alert_rules(app: Application, alert_rules: Set[str]) -> None:
    """Verify the alert rules published in the relation data bag.

    The alert rules found on the provides side of the APP_METRICS_ENDPOINT relation data bag
    are compared with the provided alert rules, e.g. {"my-alert1", "my-alert2"}.

    Args:
        app (Application): Juju Application object.
        alert_rules (set[str]): Set of expected alert rules.

    Raises:
        AssertionError: If `alert_rules` is missing in the relation data or the rules differ.
    """
    databag = await _get_app_relation_data(app, APP_METRICS_ENDPOINT, side=PROVIDES)
    assert "alert_rules" in databag, f"{APP_METRICS_ENDPOINT} relation is missing 'alert_rules'"

    published_rules = _get_alert_rules(databag["alert_rules"])
    assert published_rules == alert_rules, f"{published_rules}\n!=\n{alert_rules}"
async def assert_metrics_endpoint(app: Application, metrics_port: int, metrics_path: str) -> None:
    """Check the endpoint in the relation data bag and verify its accessibility.

    This function compares the metrics endpoints defined on the provides side of the
    APP_METRICS_ENDPOINT relation data bag with the provided metrics endpoint,
    e.g. `metrics_port=5000, metrics_path="/metrics"`.
    At the same time it checks that the Grafana agent lists such an endpoint among its
    targets, that the target is up and that it is labeled with the current model and
    application.

    Args:
        app (Application): Juju Application object.
        metrics_port (int): Metrics port to verify.
        metrics_path (str): Metrics path to verify, including the leading slash.

    Raises:
        AssertionError: If any of the checks fails.
    """
    relation_data = await _get_app_relation_data(app, APP_METRICS_ENDPOINT, side=PROVIDES)
    assert (
        "scrape_jobs" in relation_data
    ), f"{APP_METRICS_ENDPOINT} relation is missing 'scrape_jobs'"

    relation_metrics_endpoints = _get_metrics_endpoint(relation_data["scrape_jobs"])
    log.info("found endpoints: %s", relation_metrics_endpoints)
    # Note(rgildein): adding // to endpoint so urlparser can parse it properly
    assert any(
        _check_url(f"//{endpoint}", metrics_port, metrics_path)
        for endpoint in relation_metrics_endpoints
    ), f":{metrics_port}{metrics_path} was not found in any {relation_metrics_endpoints}"

    # check that port and path is also defined in Grafana agent targets
    target_data = await _get_targets_from_grafana_agent(app, metrics_port, metrics_path)
    # metrics_path already starts with a slash, so none is added between port and path
    assert target_data, f"no target found for {app.name} and :{metrics_port}{metrics_path}"
    # report the actual state next to the expected one
    assert (
        target_data["state"] == "up"
    ), f"target for {app.name} is not up, current state: {target_data['state']}"

    labels = target_data["labels"]
    assert labels["juju_model"] == app.model.name, (
        "label juju_model does not correspond to current model, "
        f"{labels['juju_model']} != {app.model.name}"
    )
    assert labels["juju_application"] == app.name, (
        "label juju_application do not correspond with app name, "
        f"{labels['juju_application']} != {app.name}"
    )
async def assert_logging(app: Application) -> None:
    """Verify the logging settings published in the relation data bag.

    This function checks that `endpoint` is defined on the provides side of the logging
    relation, in the unit relation data bag and not in the application one, e.g.
    ```yaml
    related-units:
      grafana-agent-k8s/0:
        in-scope: true
        data:
          endpoint: |
            '{"url": "http://grafana-agent-k8s-0.grafana-agent-k8s-endpoints.
            my-model.svc.cluster.local:3500/loki/api/v1/push"}'
          ...
    ```

    Args:
        app (Application): Juju Application object.

    Raises:
        AssertionError: If the data bag of any unit is missing `endpoint`.
    """
    per_unit_data = await _get_unit_relation_data(app, APP_LOGGING, side=PROVIDES)
    for name in per_unit_data:
        databag = per_unit_data[name]
        assert (
            "endpoint" in databag
        ), f"{APP_LOGGING} unit '{name}' relation data are missing 'endpoint'"
async def assert_grafana_dashboards(app: Application, dashboards: Set[str]) -> None:
    """Verify the Grafana dashboards published in the relation data bag.

    The dashboards found in the APP_GRAFANA_DASHBOARD relation data bag are compared with the
    provided dashboards, e.g. {"my-dashboard-1.json", "my-dashboard-2.json"}. In addition,
    the charm name and the Juju topology (model and application) of every template are
    checked against the application.

    Args:
        app (Application): Juju Application object.
        dashboards (set[str]): Set of expected dashboard files.

    Raises:
        AssertionError: If `dashboards` is missing in the relation data, the dashboard sets
            differ, or a template carries an unexpected charm name or topology.
    """
    databag = await _get_app_relation_data(app, APP_GRAFANA_DASHBOARD, side=PROVIDES)
    assert "dashboards" in databag, f"{APP_GRAFANA_DASHBOARD} relation data is missing 'dashboards'"

    templates = _get_dashboard_template(databag["dashboards"])

    # dashboards: the templates are keyed by file name
    published = set(templates)
    assert published == dashboards, f"\n{published}\n!=\n{dashboards}"

    # charm name and Juju topology of each template
    expected_charm = await _get_charm_name(app)
    for template in templates.values():
        assert template["charm"] == expected_charm, f"{template['charm']} != {expected_charm}"

        topology = template["juju_topology"]
        assert topology["model"] == app.model.name, f"{topology['model']} != {app.model.name}"
        assert topology["application"] == app.name, f"{topology['application']} != {app.name}"