From 3215c0d5be802e3e123518fb67499e570797aefa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9lian=20Raimbault?= <161456554+CelianR@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:37:31 +0200 Subject: [PATCH 1/9] Revert "[gitlab-use-module] Use gitlab python module instead of raw http requests (#24070)" (#24651) This reverts commit 0ba7f945968854166dd7bb0a6cd01246b476836e. This is causing timeouts in jobs that trigger child pipelines. --- .github/workflows/label-analysis.yml | 2 +- tasks/kernel_matrix_testing/ci.py | 55 +-- tasks/kmt.py | 2 +- tasks/libs/ciproviders/github_api.py | 3 + tasks/libs/ciproviders/gitlab.py | 545 +++++++++++++++++++++++++++ tasks/libs/ciproviders/gitlab_api.py | 243 ------------ tasks/libs/common/remote_api.py | 123 ++++++ tasks/libs/pipeline/data.py | 59 ++- tasks/libs/pipeline/notifications.py | 19 +- tasks/libs/pipeline/stats.py | 6 +- tasks/libs/pipeline/tools.py | 177 ++++----- tasks/libs/types/types.py | 18 +- tasks/linter.py | 19 +- tasks/notify.py | 2 +- tasks/pipeline.py | 218 +++++------ tasks/release.py | 11 +- tasks/unit-tests/gitlab_api_tests.py | 93 ++++- tasks/unit-tests/notify_tests.py | 175 ++++----- 18 files changed, 1117 insertions(+), 653 deletions(-) create mode 100644 tasks/libs/ciproviders/gitlab.py delete mode 100644 tasks/libs/ciproviders/gitlab_api.py create mode 100644 tasks/libs/common/remote_api.py diff --git a/.github/workflows/label-analysis.yml b/.github/workflows/label-analysis.yml index bbf262c9381bb..7d97b83595f71 100644 --- a/.github/workflows/label-analysis.yml +++ b/.github/workflows/label-analysis.yml @@ -21,7 +21,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - name: Install Python dependencies - run: pip install -r requirements.txt -r tasks/requirements.txt + run: pip install -r tasks/requirements.txt - name: Auto assign team label run: inv -e github.assign-team-label --pr-id='${{ github.event.pull_request.number }}' fetch-labels: diff --git a/tasks/kernel_matrix_testing/ci.py b/tasks/kernel_matrix_testing/ci.py index 364bf9d3c2845..8e03b74f5a293 100644 --- a/tasks/kernel_matrix_testing/ci.py +++ b/tasks/kernel_matrix_testing/ci.py @@ -6,11 +6,9 @@ import re import tarfile import xml.etree.ElementTree as ET -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, overload +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union, overload -from gitlab.v4.objects import ProjectJob - -from tasks.libs.ciproviders.gitlab_api import get_gitlab_repo +from tasks.libs.ciproviders.gitlab import Gitlab, get_gitlab_token if TYPE_CHECKING: from typing_extensions import Literal @@ -18,27 +16,31 @@ from tasks.kernel_matrix_testing.types import Arch, Component, StackOutput, VMConfig +def get_gitlab() -> Gitlab: + return Gitlab("DataDog/datadog-agent", str(get_gitlab_token())) + + class KMTJob: """Abstract class representing a Kernel Matrix Testing job, with common properties and methods for all job types""" - def __init__(self, job: ProjectJob): - self.gitlab = get_gitlab_repo() - self.job = job + def __init__(self, job_data: Dict[str, Any]): + self.gitlab = get_gitlab() + self.job_data = job_data def __str__(self): return f"" @property def id(self) -> int: - return self.job.id + return self.job_data["id"] @property def pipeline_id(self) -> int: - return self.job.pipeline["id"] + return self.job_data["pipeline"]["id"] @property def name(self) -> str: - return self.job.name + return self.job_data.get("name", "") @property def arch(self) -> Arch: @@ -50,11 +52,11 @@ def 
component(self) -> Component: @property def status(self) -> str: - return self.job.status + return self.job_data['status'] @property def failure_reason(self) -> str: - return self.job.failure_reason + return self.job_data["failure_reason"] @overload def artifact_file(self, file: str, ignore_not_found: Literal[True]) -> Optional[str]: # noqa: U100 @@ -88,14 +90,16 @@ def artifact_file_binary(self, file: str, ignore_not_found: bool = False) -> Opt ignore_not_found: if True, return None if the file is not found, otherwise raise an error """ try: - res = self.gitlab.jobs.get(self.id, lazy=True).artifact(file) - - return res.content + res = self.gitlab.artifact(self.id, file, ignore_not_found=ignore_not_found) + if res is None: + if not ignore_not_found: + raise RuntimeError("Invalid return value from gitlab.artifact") + else: + return None + res.raise_for_status() except Exception as e: - if ignore_not_found: - return None - raise RuntimeError(f"Could not retrieve artifact {file}") from e + return res.content class KMTSetupEnvJob(KMTJob): @@ -103,8 +107,8 @@ class KMTSetupEnvJob(KMTJob): the job name and output artifacts """ - def __init__(self, job: ProjectJob): - super().__init__(job) + def __init__(self, job_data: Dict[str, Any]): + super().__init__(job_data) self.associated_test_jobs: List[KMTTestRunJob] = [] @property @@ -161,8 +165,8 @@ class KMTTestRunJob(KMTJob): the job name and output artifacts """ - def __init__(self, job: ProjectJob): - super().__init__(job) + def __init__(self, job_data: Dict[str, Any]): + super().__init__(job_data) self.setup_job: Optional[KMTSetupEnvJob] = None @property @@ -227,10 +231,9 @@ def get_all_jobs_for_pipeline(pipeline_id: Union[int, str]) -> Tuple[List[KMTSet setup_jobs: List[KMTSetupEnvJob] = [] test_jobs: List[KMTTestRunJob] = [] - gitlab = get_gitlab_repo() - jobs = gitlab.pipelines.get(pipeline_id, lazy=True).jobs.list(per_page=100, all=True) - for job in jobs: - name = job.name + gitlab = get_gitlab() + for job in gitlab.all_jobs(pipeline_id): + name = job.get("name", "") if name.startswith("kmt_setup_env"): setup_jobs.append(KMTSetupEnvJob(job)) elif name.startswith("kmt_run_"): diff --git a/tasks/kmt.py b/tasks/kmt.py index a3abd373ebdd1..d7614ccc73a36 100644 --- a/tasks/kmt.py +++ b/tasks/kmt.py @@ -984,7 +984,7 @@ def explain_ci_failure(_, pipeline: str): failreason = testfail # By default, we assume it's a test failure # Now check the artifacts, we'll guess why the job failed based on the size - for artifact in job.job.artifacts: + for artifact in job.job_data.get("artifacts", []): if artifact.get("filename") == "artifacts.zip": fsize = artifact.get("size", 0) if fsize < 1500: diff --git a/tasks/libs/ciproviders/github_api.py b/tasks/libs/ciproviders/github_api.py index b9a186287dc6d..1d0e12f760b08 100644 --- a/tasks/libs/ciproviders/github_api.py +++ b/tasks/libs/ciproviders/github_api.py @@ -1,6 +1,7 @@ import base64 import os import platform +import re import subprocess from typing import List @@ -14,6 +15,8 @@ __all__ = ["GithubAPI"] +errno_regex = re.compile(r".*\[Errno (\d+)\] (.*)") + class GithubAPI: """ diff --git a/tasks/libs/ciproviders/gitlab.py b/tasks/libs/ciproviders/gitlab.py new file mode 100644 index 0000000000000..6e79edca40939 --- /dev/null +++ b/tasks/libs/ciproviders/gitlab.py @@ -0,0 +1,545 @@ +import json +import os +import platform +import subprocess +from collections import UserList +from urllib.parse import quote + +import yaml +from invoke.exceptions import Exit + +from tasks.libs.common.remote_api import 
APIError, RemoteAPI + +__all__ = ["Gitlab"] + + +class Gitlab(RemoteAPI): + """ + Helper class to perform API calls against the Gitlab API, using a Gitlab PAT. + """ + + BASE_URL = "https://gitlab.ddbuild.io/api/v4" + + def __init__(self, project_name="DataDog/datadog-agent", api_token=""): + super(Gitlab, self).__init__("Gitlab") + self.api_token = api_token + self.project_name = project_name + self.authorization_error_message = ( + "HTTP 401: Your GITLAB_TOKEN may have expired. You can " + "check and refresh it at " + "https://gitlab.ddbuild.io/-/profile/personal_access_tokens" + ) + + def test_project_found(self): + """ + Checks if a project can be found. This is useful for testing access permissions to projects. + """ + result = self.project() + + # name is arbitrary, just need to check if something is in the result + if "name" in result: + return + + print(f"Cannot find GitLab project {self.project_name}") + print("If you cannot see it in the GitLab WebUI, you likely need permission.") + raise Exit(code=1) + + def project(self): + """ + Gets the project info. + """ + path = f"/projects/{quote(self.project_name, safe='')}" + return self.make_request(path, json_output=True) + + def create_pipeline(self, ref, variables=None): + """ + Create a pipeline targeting a given reference of a project. + ref must be a branch or a tag. + """ + if variables is None: + variables = {} + + path = f"/projects/{quote(self.project_name, safe='')}/pipeline" + headers = {"Content-Type": "application/json"} + data = json.dumps({"ref": ref, "variables": [{"key": k, "value": v} for (k, v) in variables.items()]}) + return self.make_request(path, headers=headers, data=data, json_output=True) + + def all_pipelines_for_ref(self, ref, sha=None): + """ + Gets all pipelines for a given reference (+ optionally git sha). + """ + page = 1 + + # Go through all pages + results = self.pipelines_for_ref(ref, sha=sha, page=page) + while results: + yield from results + page += 1 + results = self.pipelines_for_ref(ref, sha=sha, page=page) + + def pipelines_for_ref(self, ref, sha=None, page=1, per_page=100): + """ + Gets one page of pipelines for a given reference (+ optionally git sha). + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipelines?ref={quote(ref, safe='')}&per_page={per_page}&page={page}" + if sha: + path = f"{path}&sha={sha}" + return self.make_request(path, json_output=True) + + def last_pipeline_for_ref(self, ref, per_page=100): + """ + Gets the last pipeline for a given reference. + per_page cannot exceed 100. + """ + pipelines = self.pipelines_for_ref(ref, per_page=per_page) + + if len(pipelines) == 0: + return None + + return sorted(pipelines, key=lambda pipeline: pipeline['created_at'], reverse=True)[0] + + def last_pipelines(self): + """ + Get the last 100 pipelines + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipelines?per_page=100&page=1" + return self.make_request(path, json_output=True) + + def trigger_pipeline(self, data): + """ + Trigger a pipeline on a project using the trigger endpoint. + Requires a trigger token in the data object, in the 'token' field. + """ + path = f"/projects/{quote(self.project_name, safe='')}/trigger/pipeline" + + if 'token' not in data: + raise Exit("Missing 'token' field in data object to trigger child pipelines", 1) + + return self.make_request(path, data=data, json_input=True, json_output=True) + + def pipeline(self, pipeline_id): + """ + Gets info for a given pipeline. 
+ """ + path = f"/projects/{quote(self.project_name, safe='')}/pipelines/{pipeline_id}" + return self.make_request(path, json_output=True) + + def cancel_pipeline(self, pipeline_id): + """ + Cancels a given pipeline. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipelines/{pipeline_id}/cancel" + return self.make_request(path, json_output=True, method="POST") + + def cancel_job(self, job_id): + """ + Cancels a given job + """ + path = f"/projects/{quote(self.project_name, safe='')}/jobs/{job_id}/cancel" + return self.make_request(path, json_output=True, method="POST") + + def commit(self, commit_sha): + """ + Gets info for a given commit sha. + """ + path = f"/projects/{quote(self.project_name, safe='')}/repository/commits/{commit_sha}" + return self.make_request(path, json_output=True) + + def artifact(self, job_id, artifact_name, ignore_not_found=False): + path = f"/projects/{quote(self.project_name, safe='')}/jobs/{job_id}/artifacts/{artifact_name}" + try: + response = self.make_request(path, stream_output=True) + return response + except APIError as e: + if e.status_code == 404 and ignore_not_found: + return None + raise e + + def all_jobs(self, pipeline_id): + """ + Gets all the jobs for a pipeline. + """ + page = 1 + + # Go through all pages + results = self.jobs(pipeline_id, page) + while results: + yield from results + page += 1 + results = self.jobs(pipeline_id, page) + + def jobs(self, pipeline_id, page=1, per_page=100): + """ + Gets one page of the jobs for a pipeline. + per_page cannot exceed 100. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipelines/{pipeline_id}/jobs?per_page={per_page}&page={page}" + return self.make_request(path, json_output=True) + + def job_log(self, job_id): + """ + Gets the log file for a given job. + """ + + path = f"/projects/{quote(self.project_name, safe='')}/jobs/{job_id}/trace" + return self.make_request(path) + + def all_pipeline_schedules(self): + """ + Gets all pipelines schedules for the given project. + """ + page = 1 + + # Go through all pages + results = self.pipeline_schedules(page) + while results: + yield from results + page += 1 + results = self.pipeline_schedules(page) + + def pipeline_schedules(self, page=1, per_page=100): + """ + Gets one page of the pipeline schedules for the given project. + per_page cannot exceed 100 + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules?per_page={per_page}&page={page}" + return self.make_request(path, json_output=True) + + def pipeline_schedule(self, schedule_id): + """ + Gets a single pipeline schedule. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules/{schedule_id}" + return self.make_request(path, json_output=True) + + def create_pipeline_schedule(self, description, ref, cron, cron_timezone=None, active=None): + """ + Create a new pipeline schedule with given attributes. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules" + data = { + "description": description, + "ref": ref, + "cron": cron, + "cron_timezone": cron_timezone, + "active": active, + } + no_none_data = {k: v for k, v in data.items() if v is not None} + return self.make_request(path, data=no_none_data, json_output=True, json_input=True) + + def edit_pipeline_schedule( + self, schedule_id, description=None, ref=None, cron=None, cron_timezone=None, active=None + ): + """ + Edit an existing pipeline schedule with given attributes. 
+ """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules/{schedule_id}" + data = { + "description": description, + "ref": ref, + "cron": cron, + "cron_timezone": cron_timezone, + "active": active, + } + no_none_data = {k: v for k, v in data.items() if v is not None} + return self.make_request(path, json_input=True, json_output=True, data=no_none_data, method="PUT") + + def delete_pipeline_schedule(self, schedule_id): + """ + Delete an existing pipeline schedule. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules/{schedule_id}" + # Gitlab API docs claim that this returns the JSON representation of the deleted schedule, + # but it actually returns an empty string + result = self.make_request(path, json_output=False, method="DELETE") + return f"Pipeline schedule deleted; result: {result if result else '(empty)'}" + + def create_pipeline_schedule_variable(self, schedule_id, key, value): + """ + Create a variable for an existing pipeline schedule. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules/{schedule_id}/variables" + data = { + "key": key, + "value": value, + } + return self.make_request(path, data=data, json_output=True, json_input=True) + + def edit_pipeline_schedule_variable(self, schedule_id, key, value): + """ + Edit an existing variable for a pipeline schedule. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules/{schedule_id}/variables/{key}" + return self.make_request(path, json_input=True, data={"value": value}, json_output=True, method="PUT") + + def delete_pipeline_schedule_variable(self, schedule_id, key): + """ + Delete an existing variable for a pipeline schedule. + """ + path = f"/projects/{quote(self.project_name, safe='')}/pipeline_schedules/{schedule_id}/variables/{key}" + return self.make_request(path, json_output=True, method="DELETE") + + def find_tag(self, tag_name): + """ + Look up a tag by its name. + """ + path = f"/projects/{quote(self.project_name, safe='')}/repository/tags/{tag_name}" + try: + response = self.make_request(path, json_output=True) + return response + except APIError as e: + # If Gitlab API returns a "404 not found" error we return an empty dict + if e.status_code == 404: + print( + f"Couldn't find the {tag_name} tag: Gitlab returned a 404 Not Found instead of a 200 empty response." + ) + return dict() + else: + raise e + + def lint(self, configuration): + """ + Lint a gitlab-ci configuration. + """ + path = f"/projects/{quote(self.project_name, safe='')}/ci/lint?dry_run=true&include_jobs=true" + headers = {"Content-Type": "application/json"} + data = {"content": configuration} + return self.make_request(path, headers=headers, data=data, json_input=True, json_output=True) + + def make_request( + self, path, headers=None, data=None, json_input=False, json_output=False, stream_output=False, method=None + ): + """ + Utility to make a request to the Gitlab API. + See RemoteAPI#request. + + Adds "PRIVATE-TOKEN: {self.api_token}" to the headers to be able to authenticate ourselves to GitLab. + """ + headers = dict(headers or []) + headers["PRIVATE-TOKEN"] = self.api_token + + return self.request( + path=path, + headers=headers, + data=data, + json_input=json_input, + json_output=json_output, + stream_output=stream_output, + raw_output=False, + method=method, + ) + + +def get_gitlab_token(): + if "GITLAB_TOKEN" not in os.environ: + print("GITLAB_TOKEN not found in env. 
Trying keychain...") + if platform.system() == "Darwin": + try: + output = subprocess.check_output( + ['security', 'find-generic-password', '-a', os.environ["USER"], '-s', 'GITLAB_TOKEN', '-w'] + ) + if len(output) > 0: + return output.strip() + except subprocess.CalledProcessError: + print("GITLAB_TOKEN not found in keychain...") + pass + print( + "Please create an 'api' access token at " + "https://gitlab.ddbuild.io/-/profile/personal_access_tokens and " + "add it as GITLAB_TOKEN in your keychain " + "or export it from your .bashrc or equivalent." + ) + raise Exit(code=1) + return os.environ["GITLAB_TOKEN"] + + +def get_gitlab_bot_token(): + if "GITLAB_BOT_TOKEN" not in os.environ: + print("GITLAB_BOT_TOKEN not found in env. Trying keychain...") + if platform.system() == "Darwin": + try: + output = subprocess.check_output( + ['security', 'find-generic-password', '-a', os.environ["USER"], '-s', 'GITLAB_BOT_TOKEN', '-w'] + ) + if output: + return output.strip() + except subprocess.CalledProcessError: + print("GITLAB_BOT_TOKEN not found in keychain...") + pass + print( + "Please make sure that the GITLAB_BOT_TOKEN is set or that " "the GITLAB_BOT_TOKEN keychain entry is set." + ) + raise Exit(code=1) + return os.environ["GITLAB_BOT_TOKEN"] + + +class ReferenceTag(yaml.YAMLObject): + """ + Custom yaml tag to handle references in gitlab-ci configuration + """ + + yaml_tag = u'!reference' + + def __init__(self, references): + self.references = references + + @classmethod + def from_yaml(cls, loader, node): + return UserList(loader.construct_sequence(node)) + + @classmethod + def to_yaml(cls, dumper, data): + return dumper.represent_sequence(cls.yaml_tag, data.data, flow_style=True) + + +def generate_gitlab_full_configuration(input_file, context=None, compare_to=None): + """ + Generate a full gitlab-ci configuration by resolving all includes + """ + # Update loader/dumper to handle !reference tag + yaml.SafeLoader.add_constructor(ReferenceTag.yaml_tag, ReferenceTag.from_yaml) + yaml.SafeDumper.add_representer(UserList, ReferenceTag.to_yaml) + + yaml_contents = [] + read_includes(input_file, yaml_contents) + full_configuration = {} + for yaml_file in yaml_contents: + full_configuration.update(yaml_file) + # Override some variables with a dedicated context + if context: + full_configuration["variables"].update(context) + if compare_to: + for value in full_configuration.values(): + if ( + isinstance(value, dict) + and "changes" in value + and isinstance(value["changes"], dict) + and "compare_to" in value["changes"] + ): + value["changes"]["compare_to"] = compare_to + elif isinstance(value, list): + for v in value: + if ( + isinstance(v, dict) + and "changes" in v + and isinstance(v["changes"], dict) + and "compare_to" in v["changes"] + ): + v["changes"]["compare_to"] = compare_to + return yaml.safe_dump(full_configuration) + + +def read_includes(yaml_file, includes): + """ + Recursive method to read all includes from yaml files and store them in a list + """ + current_file = read_content(yaml_file) + if 'include' not in current_file: + includes.append(current_file) + else: + for include in current_file['include']: + read_includes(include, includes) + del current_file['include'] + includes.append(current_file) + + +def read_content(file_path): + """ + Read the content of a file, either from a local file or from an http endpoint + """ + content = None + if file_path.startswith('http'): + import requests + + response = requests.get(file_path) + response.raise_for_status() + content = response.text + 
else: + with open(file_path) as f: + content = f.read() + return yaml.safe_load(content) + + +def get_preset_contexts(required_tests): + possible_tests = ["all", "main", "release", "mq"] + required_tests = required_tests.casefold().split(",") + if set(required_tests) | set(possible_tests) != set(possible_tests): + raise Exit(f"Invalid test required: {required_tests} must contain only values from {possible_tests}", 1) + main_contexts = [ + ("BUCKET_BRANCH", ["nightly"]), # ["dev", "nightly", "beta", "stable", "oldnightly"] + ("CI_COMMIT_BRANCH", ["main"]), # ["main", "mq-working-branch-main", "7.42.x", "any/name"] + ("CI_COMMIT_TAG", [""]), # ["", "1.2.3-rc.4", "6.6.6"] + ("CI_PIPELINE_SOURCE", ["pipeline"]), # ["trigger", "pipeline", "schedule"] + ("DEPLOY_AGENT", ["true"]), + ("RUN_ALL_BUILDS", ["true"]), + ("RUN_E2E_TESTS", ["auto"]), + ("RUN_KMT_TESTS", ["on"]), + ("RUN_UNIT_TESTS", ["on"]), + ("TESTING_CLEANUP", ["true"]), + ] + release_contexts = [ + ("BUCKET_BRANCH", ["stable"]), + ("CI_COMMIT_BRANCH", ["7.42.x"]), + ("CI_COMMIT_TAG", ["3.2.1", "1.2.3-rc.4"]), + ("CI_PIPELINE_SOURCE", ["schedule"]), + ("DEPLOY_AGENT", ["true"]), + ("RUN_ALL_BUILDS", ["true"]), + ("RUN_E2E_TESTS", ["auto"]), + ("RUN_KMT_TESTS", ["on"]), + ("RUN_UNIT_TESTS", ["on"]), + ("TESTING_CLEANUP", ["true"]), + ] + mq_contexts = [ + ("BUCKET_BRANCH", ["dev"]), + ("CI_COMMIT_BRANCH", ["mq-working-branch-main"]), + ("CI_PIPELINE_SOURCE", ["pipeline"]), + ("DEPLOY_AGENT", ["false"]), + ("RUN_ALL_BUILDS", ["false"]), + ("RUN_E2E_TESTS", ["auto"]), + ("RUN_KMT_TESTS", ["off"]), + ("RUN_UNIT_TESTS", ["off"]), + ("TESTING_CLEANUP", ["false"]), + ] + all_contexts = [] + for test in required_tests: + if test in ["all", "main"]: + generate_contexts(main_contexts, [], all_contexts) + if test in ["all", "release"]: + generate_contexts(release_contexts, [], all_contexts) + if test in ["all", "mq"]: + generate_contexts(mq_contexts, [], all_contexts) + return all_contexts + + +def generate_contexts(contexts, context, all_contexts): + """ + Recursive method to generate all possible contexts from a list of tuples + """ + if len(contexts) == 0: + all_contexts.append(context[:]) + return + for value in contexts[0][1]: + context.append((contexts[0][0], value)) + generate_contexts(contexts[1:], context, all_contexts) + context.pop() + + +def load_context(context): + """ + Load a context either from a yaml file or from a json string + """ + if os.path.exists(context): + with open(context) as f: + y = yaml.safe_load(f) + if "variables" not in y: + raise Exit( + f"Invalid context file: {context}, missing 'variables' key. 
Input file must be similar to tasks/unit-tests/testdata/gitlab_main_context_template.yml", + 1, + ) + return [[(k, v) for k, v in y["variables"].items()]] + else: + try: + j = json.loads(context) + return [[(k, v) for k, v in j.items()]] + except json.JSONDecodeError: + raise Exit(f"Invalid context: {context}, must be a valid json, or a path to a yaml file", 1) diff --git a/tasks/libs/ciproviders/gitlab_api.py b/tasks/libs/ciproviders/gitlab_api.py deleted file mode 100644 index 74136486a6cf3..0000000000000 --- a/tasks/libs/ciproviders/gitlab_api.py +++ /dev/null @@ -1,243 +0,0 @@ -import json -import os -import platform -import subprocess -from collections import UserList - -import gitlab -import yaml -from gitlab.v4.objects import Project -from invoke.exceptions import Exit - -BASE_URL = "https://gitlab.ddbuild.io" - - -def get_gitlab_token(): - if "GITLAB_TOKEN" not in os.environ: - print("GITLAB_TOKEN not found in env. Trying keychain...") - if platform.system() == "Darwin": - try: - output = subprocess.check_output( - ['security', 'find-generic-password', '-a', os.environ["USER"], '-s', 'GITLAB_TOKEN', '-w'] - ) - if len(output) > 0: - return output.strip() - except subprocess.CalledProcessError: - print("GITLAB_TOKEN not found in keychain...") - pass - print( - "Please create an 'api' access token at " - "https://gitlab.ddbuild.io/-/profile/personal_access_tokens and " - "add it as GITLAB_TOKEN in your keychain " - "or export it from your .bashrc or equivalent." - ) - raise Exit(code=1) - return os.environ["GITLAB_TOKEN"] - - -def get_gitlab_bot_token(): - if "GITLAB_BOT_TOKEN" not in os.environ: - print("GITLAB_BOT_TOKEN not found in env. Trying keychain...") - if platform.system() == "Darwin": - try: - output = subprocess.check_output( - ['security', 'find-generic-password', '-a', os.environ["USER"], '-s', 'GITLAB_BOT_TOKEN', '-w'] - ) - if output: - return output.strip() - except subprocess.CalledProcessError: - print("GITLAB_BOT_TOKEN not found in keychain...") - pass - print( - "Please make sure that the GITLAB_BOT_TOKEN is set or that " "the GITLAB_BOT_TOKEN keychain entry is set." - ) - raise Exit(code=1) - return os.environ["GITLAB_BOT_TOKEN"] - - -def get_gitlab_api(token=None) -> gitlab.Gitlab: - """ - Returns the gitlab api object with the api token. - The token is the one of get_gitlab_token() by default. 
- """ - token = token or get_gitlab_token() - - return gitlab.Gitlab(BASE_URL, private_token=token) - - -def get_gitlab_repo(repo='DataDog/datadog-agent', token=None) -> Project: - api = get_gitlab_api(token) - repo = api.projects.get(repo) - - return repo - - -class ReferenceTag(yaml.YAMLObject): - """ - Custom yaml tag to handle references in gitlab-ci configuration - """ - - yaml_tag = u'!reference' - - def __init__(self, references): - self.references = references - - @classmethod - def from_yaml(cls, loader, node): - return UserList(loader.construct_sequence(node)) - - @classmethod - def to_yaml(cls, dumper, data): - return dumper.represent_sequence(cls.yaml_tag, data.data, flow_style=True) - - -def generate_gitlab_full_configuration(input_file, context=None, compare_to=None): - """ - Generate a full gitlab-ci configuration by resolving all includes - """ - # Update loader/dumper to handle !reference tag - yaml.SafeLoader.add_constructor(ReferenceTag.yaml_tag, ReferenceTag.from_yaml) - yaml.SafeDumper.add_representer(UserList, ReferenceTag.to_yaml) - yaml_contents = [] - read_includes(input_file, yaml_contents) - full_configuration = {} - for yaml_file in yaml_contents: - full_configuration.update(yaml_file) - # Override some variables with a dedicated context - if context: - full_configuration["variables"].update(context) - if compare_to: - for value in full_configuration.values(): - if ( - isinstance(value, dict) - and "changes" in value - and isinstance(value["changes"], dict) - and "compare_to" in value["changes"] - ): - value["changes"]["compare_to"] = compare_to - elif isinstance(value, list): - for v in value: - if ( - isinstance(v, dict) - and "changes" in v - and isinstance(v["changes"], dict) - and "compare_to" in v["changes"] - ): - v["changes"]["compare_to"] = compare_to - return yaml.safe_dump(full_configuration) - - -def read_includes(yaml_file, includes): - """ - Recursive method to read all includes from yaml files and store them in a list - """ - current_file = read_content(yaml_file) - if 'include' not in current_file: - includes.append(current_file) - else: - for include in current_file['include']: - read_includes(include, includes) - del current_file['include'] - includes.append(current_file) - - -def read_content(file_path): - """ - Read the content of a file, either from a local file or from an http endpoint - """ - content = None - if file_path.startswith('http'): - import requests - - response = requests.get(file_path) - response.raise_for_status() - content = response.text - else: - with open(file_path) as f: - content = f.read() - return yaml.safe_load(content) - - -def get_preset_contexts(required_tests): - possible_tests = ["all", "main", "release", "mq"] - required_tests = required_tests.casefold().split(",") - if set(required_tests) | set(possible_tests) != set(possible_tests): - raise Exit(f"Invalid test required: {required_tests} must contain only values from {possible_tests}", 1) - main_contexts = [ - ("BUCKET_BRANCH", ["nightly"]), # ["dev", "nightly", "beta", "stable", "oldnightly"] - ("CI_COMMIT_BRANCH", ["main"]), # ["main", "mq-working-branch-main", "7.42.x", "any/name"] - ("CI_COMMIT_TAG", [""]), # ["", "1.2.3-rc.4", "6.6.6"] - ("CI_PIPELINE_SOURCE", ["pipeline"]), # ["trigger", "pipeline", "schedule"] - ("DEPLOY_AGENT", ["true"]), - ("RUN_ALL_BUILDS", ["true"]), - ("RUN_E2E_TESTS", ["auto"]), - ("RUN_KMT_TESTS", ["on"]), - ("RUN_UNIT_TESTS", ["on"]), - ("TESTING_CLEANUP", ["true"]), - ] - release_contexts = [ - ("BUCKET_BRANCH", ["stable"]), - 
("CI_COMMIT_BRANCH", ["7.42.x"]), - ("CI_COMMIT_TAG", ["3.2.1", "1.2.3-rc.4"]), - ("CI_PIPELINE_SOURCE", ["schedule"]), - ("DEPLOY_AGENT", ["true"]), - ("RUN_ALL_BUILDS", ["true"]), - ("RUN_E2E_TESTS", ["auto"]), - ("RUN_KMT_TESTS", ["on"]), - ("RUN_UNIT_TESTS", ["on"]), - ("TESTING_CLEANUP", ["true"]), - ] - mq_contexts = [ - ("BUCKET_BRANCH", ["dev"]), - ("CI_COMMIT_BRANCH", ["mq-working-branch-main"]), - ("CI_PIPELINE_SOURCE", ["pipeline"]), - ("DEPLOY_AGENT", ["false"]), - ("RUN_ALL_BUILDS", ["false"]), - ("RUN_E2E_TESTS", ["auto"]), - ("RUN_KMT_TESTS", ["off"]), - ("RUN_UNIT_TESTS", ["off"]), - ("TESTING_CLEANUP", ["false"]), - ] - all_contexts = [] - for test in required_tests: - if test in ["all", "main"]: - generate_contexts(main_contexts, [], all_contexts) - if test in ["all", "release"]: - generate_contexts(release_contexts, [], all_contexts) - if test in ["all", "mq"]: - generate_contexts(mq_contexts, [], all_contexts) - return all_contexts - - -def generate_contexts(contexts, context, all_contexts): - """ - Recursive method to generate all possible contexts from a list of tuples - """ - if len(contexts) == 0: - all_contexts.append(context[:]) - return - for value in contexts[0][1]: - context.append((contexts[0][0], value)) - generate_contexts(contexts[1:], context, all_contexts) - context.pop() - - -def load_context(context): - """ - Load a context either from a yaml file or from a json string - """ - if os.path.exists(context): - with open(context) as f: - y = yaml.safe_load(f) - if "variables" not in y: - raise Exit( - f"Invalid context file: {context}, missing 'variables' key. Input file must be similar to tasks/unit-tests/testdata/gitlab_main_context_template.yml", - 1, - ) - return [[(k, v) for k, v in y["variables"].items()]] - else: - try: - j = json.loads(context) - return [[(k, v) for k, v in j.items()]] - except json.JSONDecodeError: - raise Exit(f"Invalid context: {context}, must be a valid json, or a path to a yaml file", 1) diff --git a/tasks/libs/common/remote_api.py b/tasks/libs/common/remote_api.py new file mode 100644 index 0000000000000..20f4008abed1f --- /dev/null +++ b/tasks/libs/common/remote_api.py @@ -0,0 +1,123 @@ +import errno +import re +import time + +from invoke.exceptions import Exit + +errno_regex = re.compile(r".*\[Errno (\d+)\] (.*)") + + +class APIError(Exception): + def __init__(self, request, api_name): + super(APIError, self).__init__(f"{api_name} says: {request.content}") + self.status_code = request.status_code + self.request = request + + +class RemoteAPI(object): + """ + Helper class to perform calls against a given remote API. + """ + + BASE_URL = "" + + def __init__(self, api_name, sleep_time=1, retry_count=5): + self.api_name = api_name + self.authorization_error_message = "HTTP 401 Unauthorized" + self.requests_sleep_time = sleep_time + self.requests_500_retry_count = retry_count + + def request( + self, + path, + headers=None, + data=None, + json_input=False, + json_output=False, + stream_output=False, + raw_output=False, + method=None, + ): + """ + Utility to make a request to a remote API. + + headers: A hash of headers to pass to the request. + data: An object containing the body of the request. + json_input: If set to true, data is passed with the json parameter of requests.post instead of the data parameter. + + By default, the request method is GET, or POST if data is not empty. + method: Can be set to GET, POST, PUT or DELETE to force the REST method used. + + By default, we return the text field of the response object. 
The following fields can alter this behavior: + json_output: the json field of the response object is returned. + stream_output: the request asks for a stream response, and the raw response object is returned. + raw_output: the content field of the resposne object is returned. + """ + import requests + + url = self.BASE_URL + path + + # TODO: Use the param argument of requests instead of handling URL params + # manually + try: + # If json_input is true, we specifically want to send data using the json + # parameter of requests.post / requests.put + for retry_count in range(self.requests_500_retry_count): + if method == "PUT": + if json_input: + r = requests.put(url, headers=headers, json=data, stream=stream_output) + else: + r = requests.put(url, headers=headers, data=data, stream=stream_output) + elif method == "DELETE": + r = requests.delete(url, headers=headers, stream=stream_output) + elif data or method == "POST": + if json_input: + r = requests.post(url, headers=headers, json=data, stream=stream_output) + else: + r = requests.post(url, headers=headers, data=data, stream=stream_output) + else: + r = requests.get(url, headers=headers, stream=stream_output) + if r.status_code >= 400: + if r.status_code == 401: + print(self.authorization_error_message) + elif 500 <= r.status_code < 600: + sleep_time = self.requests_sleep_time + retry_count * self.requests_sleep_time + if sleep_time > 0: + print( + f"Request failed with error {r.status_code}, retrying in {sleep_time} seconds (retry {retry_count}/{self.requests_500_retry_count}" + ) + time.sleep(sleep_time) + continue + raise APIError(r, self.api_name) + else: + break + except requests.exceptions.Timeout: + print(f"Connection to {self.api_name} ({url}) timed out.") + raise Exit(code=1) + except requests.exceptions.RequestException as e: + m = errno_regex.match(str(e)) + if not m: + print(f"Unknown error raised connecting to {self.api_name} ({url}): {e}") + raise e + + # Parse errno to give a better explanation + # Requests doesn't have granularity at the level we want: + # http://docs.python-requests.org/en/master/_modules/requests/exceptions/ + errno_code = int(m.group(1)) + message = m.group(2) + + if errno_code == errno.ENOEXEC: + exit_msg = f"Error resolving {url}: {message}" + elif errno_code == errno.ECONNREFUSED: + exit_msg = f"Connection to {self.api_name} ({url}) refused" + else: + exit_msg = f"Error while connecting to {url}: {str(e)}" + raise Exit(message=exit_msg, code=1) + + if json_output: + return r.json() + if raw_output: + return r.content + if stream_output: + return r + return r.text diff --git a/tasks/libs/pipeline/data.py b/tasks/libs/pipeline/data.py index acaf9ccdff05b..4e5b5fa1c9fe5 100644 --- a/tasks/libs/pipeline/data.py +++ b/tasks/libs/pipeline/data.py @@ -1,9 +1,6 @@ import re -from collections import defaultdict -from gitlab.v4.objects import ProjectJob - -from tasks.libs.ciproviders.gitlab_api import get_gitlab_repo +from tasks.libs.ciproviders.gitlab import Gitlab, get_gitlab_token from tasks.libs.types.types import FailedJobReason, FailedJobs, FailedJobType @@ -11,47 +8,47 @@ def get_failed_jobs(project_name: str, pipeline_id: str) -> FailedJobs: """ Retrieves the list of failed jobs for a given pipeline id in a given project. 
""" - repo = get_gitlab_repo(project_name) - pipeline = repo.pipelines.get(pipeline_id) - jobs = pipeline.jobs.list(per_page=100, all=True) - # Get instances of failed jobs grouped by name - failed_jobs = defaultdict(list) + gitlab = Gitlab(project_name=project_name, api_token=get_gitlab_token()) + + # gitlab.all_jobs yields a generator, it needs to be converted to a list to be able to + # go through it twice + jobs = list(gitlab.all_jobs(pipeline_id)) + + # Get instances of failed jobs + failed_jobs = {job["name"]: [] for job in jobs if job["status"] == "failed"} + + # Group jobs per name for job in jobs: - if job.status == "failed": - failed_jobs[job.name].append(job) + if job["name"] in failed_jobs: + failed_jobs[job["name"]].append(job) # There, we now have the following map: # job name -> list of jobs with that name, including at least one failed job processed_failed_jobs = FailedJobs() for job_name, jobs in failed_jobs.items(): # We sort each list per creation date - jobs.sort(key=lambda x: x.created_at) + jobs.sort(key=lambda x: x["created_at"]) # We truncate the job name to increase readability job_name = truncate_job_name(job_name) - job = jobs[-1] # Check the final job in the list: it contains the current status of the job # This excludes jobs that were retried and succeeded - trace = str(repo.jobs.get(job.id, lazy=True).trace(), 'utf-8') - failure_type, failure_reason = get_job_failure_context(trace) - final_status = ProjectJob( - repo.manager, - attrs={ - "name": job_name, - "id": job.id, - "stage": job.stage, - "status": job.status, - "tag_list": job.tag_list, - "allow_failure": job.allow_failure, - "web_url": job.web_url, - "retry_summary": [ijob.status for ijob in jobs], - "failure_type": failure_type, - "failure_reason": failure_reason, - }, - ) + failure_type, failure_reason = get_job_failure_context(gitlab.job_log(jobs[-1]["id"])) + final_status = { + "name": job_name, + "id": jobs[-1]["id"], + "stage": jobs[-1]["stage"], + "status": jobs[-1]["status"], + "tag_list": jobs[-1]["tag_list"], + "allow_failure": jobs[-1]["allow_failure"], + "url": jobs[-1]["web_url"], + "retry_summary": [job["status"] for job in jobs], + "failure_type": failure_type, + "failure_reason": failure_reason, + } # Also exclude jobs allowed to fail - if final_status.status == "failed" and should_report_job(job_name, final_status.allow_failure): + if final_status["status"] == "failed" and should_report_job(job_name, final_status["allow_failure"]): processed_failed_jobs.add_failed_job(final_status) return processed_failed_jobs diff --git a/tasks/libs/pipeline/notifications.py b/tasks/libs/pipeline/notifications.py index c35282f1cea94..960eb5a283234 100644 --- a/tasks/libs/pipeline/notifications.py +++ b/tasks/libs/pipeline/notifications.py @@ -6,12 +6,10 @@ from collections import defaultdict from typing import Dict -import gitlab import yaml -from gitlab.v4.objects import ProjectJob from invoke.context import Context -from tasks.libs.ciproviders.gitlab_api import get_gitlab_repo +from tasks.libs.ciproviders.gitlab import Gitlab, get_gitlab_token from tasks.libs.owners.parsing import read_owners from tasks.libs.types.types import FailedJobReason, FailedJobs, Test @@ -53,16 +51,13 @@ def check_for_missing_owners_slack_and_jira(print_missing_teams=True, owners_fil return error -def get_failed_tests(project_name, job: ProjectJob, owners_file=".github/CODEOWNERS"): - repo = get_gitlab_repo(project_name) +def get_failed_tests(project_name, job, owners_file=".github/CODEOWNERS"): + gitlab = 
Gitlab(project_name=project_name, api_token=get_gitlab_token()) owners = read_owners(owners_file) - try: - test_output = str(repo.jobs.get(job.id, lazy=True).artifact('test_output.json'), 'utf-8') - except gitlab.exceptions.GitlabGetError: - test_output = '' + test_output = gitlab.artifact(job["id"], "test_output.json", ignore_not_found=True) failed_tests = {} # type: dict[tuple[str, str], Test] if test_output: - for line in test_output.splitlines(): + for line in test_output.iter_lines(): json_test = json.loads(line) if 'Test' in json_test: name = json_test['Test'] @@ -91,11 +86,11 @@ def find_job_owners(failed_jobs: FailedJobs, owners_file: str = ".gitlab/JOBOWNE # For e2e test infrastructure errors, notify the agent-e2e-testing team for job in failed_jobs.mandatory_infra_job_failures: - if job.failure_type == FailedJobReason.E2E_INFRA_FAILURE: + if job["failure_type"] == FailedJobReason.E2E_INFRA_FAILURE: owners_to_notify["@datadog/agent-e2e-testing"].add_failed_job(job) for job in failed_jobs.all_non_infra_failures(): - job_owners = owners.of(job.name) + job_owners = owners.of(job["name"]) # job_owners is a list of tuples containing the type of owner (eg. USERNAME, TEAM) and the name of the owner # eg. [('TEAM', '@DataDog/agent-ci-experience')] diff --git a/tasks/libs/pipeline/stats.py b/tasks/libs/pipeline/stats.py index 8bc9e1b0f9113..46a862bfbb94e 100644 --- a/tasks/libs/pipeline/stats.py +++ b/tasks/libs/pipeline/stats.py @@ -31,10 +31,10 @@ def get_failed_jobs_stats(project_name, pipeline_id): global_failure_reason = FailedJobType.INFRA_FAILURE.name for job in failed_jobs.all_mandatory_failures(): - failure_type = job.failure_type - failure_reason = job.failure_reason + failure_type = job["failure_type"] + failure_reason = job["failure_reason"] - key = tuple(sorted(job.tag_list + [f"type:{failure_type.name}", f"reason:{failure_reason.name}"])) + key = tuple(sorted(job["tag_list"] + [f"type:{failure_type.name}", f"reason:{failure_reason.name}"])) job_failure_stats[key] += 1 return global_failure_reason, job_failure_stats diff --git a/tasks/libs/pipeline/tools.py b/tasks/libs/pipeline/tools.py index 513abfa14b85c..d026d61b5f6a6 100644 --- a/tasks/libs/pipeline/tools.py +++ b/tasks/libs/pipeline/tools.py @@ -3,10 +3,6 @@ import platform import sys from time import sleep, time -from typing import List - -from gitlab import GitlabError -from gitlab.v4.objects import Project, ProjectJob, ProjectPipeline from tasks.libs.common.color import color_message from tasks.libs.common.user_interactions import yes_no_question @@ -19,11 +15,11 @@ class FilteredOutException(Exception): pass -def get_running_pipelines_on_same_ref(repo: Project, ref, sha=None) -> List[ProjectPipeline]: - pipelines = repo.pipelines.list(ref=ref, sha=sha, per_page=100, all=True) +def get_running_pipelines_on_same_ref(gitlab, ref, sha=None): + pipelines = gitlab.all_pipelines_for_ref(ref, sha=sha) RUNNING_STATUSES = ["created", "pending", "running"] - running_pipelines = [pipeline for pipeline in pipelines if pipeline.status in RUNNING_STATUSES] + running_pipelines = [pipeline for pipeline in pipelines if pipeline["status"] in RUNNING_STATUSES] return running_pipelines @@ -36,37 +32,37 @@ def parse_datetime(dt): return datetime.datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S.%f%z") -def cancel_pipelines_with_confirmation(repo: Project, pipelines: List[ProjectPipeline]): +def cancel_pipelines_with_confirmation(gitlab, pipelines): for pipeline in pipelines: - commit = repo.commits.get(pipeline.sha) + commit_author, 
commit_short_sha, commit_title = get_commit_for_pipeline(gitlab, pipeline['id']) print( color_message("Pipeline", "blue"), - color_message(pipeline.id, "bold"), - color_message(f"({repo.web_url}/pipelines/{pipeline.id})", "green"), + color_message(pipeline['id'], "bold"), + color_message(f"(https://gitlab.ddbuild.io/{gitlab.project_name}/pipelines/{pipeline['id']})", "green"), ) - pipeline_creation_date = pipeline.created_at + pipeline_creation_date = pipeline['created_at'] print( f"{color_message('Started at', 'blue')} {parse_datetime(pipeline_creation_date).astimezone():%c} ({pipeline_creation_date})" ) print( color_message("Commit:", "blue"), - color_message(commit.title, "green"), - color_message(f"({commit.short_id})", "grey"), + color_message(commit_title, "green"), + color_message(f"({commit_short_sha})", "grey"), color_message("by", "blue"), - color_message(commit.author_name, "bold"), + color_message(commit_author, "bold"), ) if yes_no_question("Do you want to cancel this pipeline?", color="orange", default=True): - pipeline.cancel() - print(f"Pipeline {color_message(pipeline.id, 'bold')} has been cancelled.\n") + gitlab.cancel_pipeline(pipeline['id']) + print(f"Pipeline {color_message(pipeline['id'], 'bold')} has been cancelled.\n") else: - print(f"Pipeline {color_message(pipeline.id, 'bold')} will keep running.\n") + print(f"Pipeline {color_message(pipeline['id'], 'bold')} will keep running.\n") -def gracefully_cancel_pipeline(repo: Project, pipeline: ProjectPipeline, force_cancel_stages): +def gracefully_cancel_pipeline(gitlab, pipeline, force_cancel_stages): """ Gracefully cancel pipeline - Cancel all the jobs that did not start to run yet @@ -74,17 +70,17 @@ def gracefully_cancel_pipeline(repo: Project, pipeline: ProjectPipeline, force_c - Jobs in the stages specified in 'force_cancel_stages' variables will always be canceled even if running """ - jobs = pipeline.jobs.list(per_page=100, all=True) + jobs = gitlab.all_jobs(pipeline["id"]) for job in jobs: - if job.stage in force_cancel_stages or ( - job.status not in ["running", "canceled"] and "cleanup" not in job.name + if job["stage"] in force_cancel_stages or ( + job["status"] not in ["running", "canceled"] and "cleanup" not in job["name"] ): - repo.jobs.get(job.id, lazy=True).cancel() + gitlab.cancel_job(job["id"]) def trigger_agent_pipeline( - repo: Project, + gitlab, ref=DEFAULT_BRANCH, release_version_6="nightly", release_version_7="nightly-a7", @@ -94,7 +90,7 @@ def trigger_agent_pipeline( e2e_tests=False, rc_build=False, rc_k8s_deployments=False, -) -> ProjectPipeline: +): """ Trigger a pipeline on the datadog-agent repositories. 
Multiple options are available: - run a pipeline with all builds (by default, a pipeline only runs a subset of all available builds), @@ -141,40 +137,39 @@ def trigger_agent_pipeline( ref, "\n".join(f" - {k}: {args[k]}" for k in args) ) ) - try: - variables = [{'key': key, 'value': value} for (key, value) in args.items()] + result = gitlab.create_pipeline(ref, args) - return repo.pipelines.create({'ref': ref, 'variables': variables}) - except GitlabError as e: - if "filtered out by workflow rules" in e.error_message: - raise FilteredOutException + if result and "id" in result: + return result["id"] - raise RuntimeError(f"Invalid response from Gitlab API: {e}") + if result and "filtered out by workflow rules" in result.get("message", {}).get("base", [""])[0]: + raise FilteredOutException + raise RuntimeError(f"Invalid response from Gitlab: {result}") -def wait_for_pipeline( - repo: Project, pipeline: ProjectPipeline, pipeline_finish_timeout_sec=PIPELINE_FINISH_TIMEOUT_SEC -): + +def wait_for_pipeline(gitlab, pipeline_id, pipeline_finish_timeout_sec=PIPELINE_FINISH_TIMEOUT_SEC): """ Follow a given pipeline, periodically checking the pipeline status and printing changes to the job statuses. """ - commit = repo.commits.get(pipeline.sha) + commit_author, commit_short_sha, commit_title = get_commit_for_pipeline(gitlab, pipeline_id) print( color_message( "Commit: " - + color_message(commit.title, "green") - + color_message(f" ({commit.short_id})", "grey") + + color_message(commit_title, "green") + + color_message(f" ({commit_short_sha})", "grey") + " by " - + color_message(commit.author_name, "bold"), + + color_message(commit_author, "bold"), "blue", ), flush=True, ) print( color_message( - "Pipeline Link: " + color_message(pipeline.web_url, "green"), + "Pipeline Link: " + + color_message(f"https://gitlab.ddbuild.io/{gitlab.project_name}/pipelines/{pipeline_id}", "green"), "blue", ), flush=True, @@ -182,10 +177,19 @@ def wait_for_pipeline( print(color_message("Waiting for pipeline to finish. Exiting won't cancel it.", "blue"), flush=True) - f = functools.partial(pipeline_status, pipeline) + f = functools.partial(pipeline_status, gitlab, pipeline_id) loop_status(f, pipeline_finish_timeout_sec) + return pipeline_id + + +def get_commit_for_pipeline(gitlab, pipeline_id): + pipeline = gitlab.pipeline(pipeline_id) + sha = pipeline['sha'] + commit = gitlab.commit(sha) + return commit['author_name'], commit['short_id'], commit['title'] + def loop_status(callable, timeout_sec): """ @@ -202,49 +206,50 @@ def loop_status(callable, timeout_sec): sleep(10) -def pipeline_status(pipeline: ProjectPipeline, job_status): +def pipeline_status(gitlab, pipeline_id, job_status): """ Checks the pipeline status and updates job statuses. 
""" - jobs = pipeline.jobs.list(per_page=100, all=True) + jobs = gitlab.all_jobs(pipeline_id) job_status = update_job_status(jobs, job_status) # Check pipeline status - pipestatus = pipeline.status.lower().strip() - ref = pipeline.ref + pipeline = gitlab.pipeline(pipeline_id) + pipestatus = pipeline["status"].lower().strip() + ref = pipeline["ref"] if pipestatus == "success": print( color_message( - f"Pipeline {pipeline.web_url} for {ref} succeeded", + f"Pipeline https://gitlab.ddbuild.io/{gitlab.project_name}/pipelines/{pipeline_id} for {ref} succeeded", "green", ), flush=True, ) - notify("Pipeline success", f"Pipeline {pipeline.id} for {ref} succeeded.") + notify("Pipeline success", f"Pipeline {pipeline_id} for {ref} succeeded.") return True, job_status if pipestatus == "failed": print( color_message( - f"Pipeline {pipeline.web_url} for {ref} failed", + f"Pipeline https://gitlab.ddbuild.io/{gitlab.project_name}/pipelines/{pipeline_id} for {ref} failed", "red", ), flush=True, ) - notify("Pipeline failure", f"Pipeline {pipeline.id} for {ref} failed.") + notify("Pipeline failure", f"Pipeline {pipeline_id} for {ref} failed.") return True, job_status if pipestatus == "canceled": print( color_message( - f"Pipeline {pipeline.web_url} for {ref} was canceled", + f"Pipeline https://gitlab.ddbuild.io/{gitlab.project_name}/pipelines/{pipeline_id} for {ref} was canceled", "grey", ), flush=True, ) - notify("Pipeline canceled", f"Pipeline {pipeline.id} for {ref} was canceled.") + notify("Pipeline canceled", f"Pipeline {pipeline_id} for {ref} was canceled.") return True, job_status if pipestatus not in ["created", "running", "pending"]: @@ -253,36 +258,36 @@ def pipeline_status(pipeline: ProjectPipeline, job_status): return False, job_status -def update_job_status(jobs: List[ProjectJob], job_status): +def update_job_status(jobs, job_status): """ Updates job statuses and notify on changes. """ notify = {} for job in jobs: - if job_status.get(job.name, None) is None: - job_status[job.name] = job - notify[job.id] = job + if job_status.get(job['name'], None) is None: + job_status[job['name']] = job + notify[job['id']] = job else: # There are two reasons why we want to notify: # - status change on job (when we refresh) # - another job with the same name exists (when a job is retried) # Check for id to see if we're in the first case. - old_job = job_status[job.name] - if job.id == old_job.id and job.status != old_job.status: - job_status[job.name] = job - notify[job.id] = job - if job.id != old_job.id and job.created_at > old_job.created_at: - job_status[job.name] = job + old_job = job_status[job['name']] + if job['id'] == old_job['id'] and job['status'] != old_job['status']: + job_status[job['name']] = job + notify[job['id']] = job + if job['id'] != old_job['id'] and job['created_at'] > old_job['created_at']: + job_status[job['name']] = job # Check if old job already in notification list, to append retry message - notify_old_job = notify.get(old_job.id, None) + notify_old_job = notify.get(old_job['id'], None) if notify_old_job is not None: - notify_old_job.retried_old = True # Add message to say the job got retried - notify_old_job.retried_created_at = job.created_at - notify[old_job.id] = notify_old_job + notify_old_job['retried_old'] = True # Add message to say the job got retried + notify_old_job['retried_created_at'] = job['created_at'] + notify[old_job['id']] = notify_old_job # If not (eg. 
previous job was notified in last refresh), add retry message to new job else: - job.retried_new = True - notify[job.id] = job + job['retried_new'] = True + notify[job['id']] = job for job in notify.values(): print_job_status(job) @@ -307,49 +312,53 @@ def print_job(name, stage, color, date, duration, status, link): def print_retry(name, date): print(color_message(f"[{date}] Job {name} was retried", "grey")) - duration = job.duration - date = job.finished_at # Date that is printed in the console log. In most cases, it's when the job finished. + name = job['name'] + stage = job['stage'] + allow_failure = job['allow_failure'] + duration = job['duration'] + date = job['finished_at'] # Date that is printed in the console log. In most cases, it's when the job finished. + status = job['status'] # Gitlab job status job_status = None # Status string printed in the console link = '' # Link to the pipeline. Only filled for failing jobs, to be able to quickly go to the failing job. color = 'grey' # Log output color # A None duration is set by Gitlab when the job gets canceled before it was started. # In that case, set a duration of 0s. - if job.duration is None: + if duration is None: duration = 0 - if job.status == 'success': + if status == 'success': job_status = 'succeeded' color = 'green' - elif job.status == 'failed': - if job.allow_failure: + elif status == 'failed': + if allow_failure: job_status = 'failed (allowed to fail)' color = 'orange' else: job_status = 'failed' color = 'red' - link = f"Link: {job.web_url}" + link = f"Link: {job['web_url']}" # Only notify on real (not retried) failures # Best-effort, as there can be situations where the retried # job didn't get created yet - if getattr(job, 'retried_old', None) is None: - notify("Job failure", f"Job {job.name} failed.") - elif job.status == 'canceled': + if job.get('retried_old', None) is None: + notify("Job failure", f"Job {name} failed.") + elif status == 'canceled': job_status = 'was canceled' color = 'grey' - elif job.status == 'running': + elif status == 'running': job_status = 'started running' - date = job.started_at + date = job['started_at'] color = 'blue' else: return # Some logic to print the retry message in the correct order (before the new job or after the old job) - if getattr(job, 'retried_new', None) is not None: - print_retry(job.name, job.created_at) - print_job(job.name, job.stage, color, date, duration, job_status, link) - if getattr(job, 'retried_old', None) is not None: - print_retry(job.name, job.retried_created_at) + if job.get('retried_new', None) is not None: + print_retry(name, job['created_at']) + print_job(name, stage, color, date, duration, job_status, link) + if job.get('retried_old', None) is not None: + print_retry(name, job['retried_created_at']) def notify(title, info_text, sound=True): diff --git a/tasks/libs/types/types.py b/tasks/libs/types/types.py index 10bc2eaeea10e..3c0d4c103e565 100644 --- a/tasks/libs/types/types.py +++ b/tasks/libs/types/types.py @@ -3,8 +3,6 @@ from collections import defaultdict from enum import Enum -from gitlab.v4.objects import ProjectJob - class Test: PACKAGE_PREFIX = "github.com/DataDog/datadog-agent/" @@ -62,12 +60,12 @@ def __init__(self): self.mandatory_infra_job_failures = [] self.optional_infra_job_failures = [] - def add_failed_job(self, job: ProjectJob): - if job.failure_type == FailedJobType.INFRA_FAILURE and job.allow_failure: + def add_failed_job(self, job): + if job["failure_type"] == FailedJobType.INFRA_FAILURE and job["allow_failure"]: 
self.optional_infra_job_failures.append(job) - elif job.failure_type == FailedJobType.INFRA_FAILURE and not job.allow_failure: + elif job["failure_type"] == FailedJobType.INFRA_FAILURE and not job["allow_failure"]: self.mandatory_infra_job_failures.append(job) - elif job.allow_failure: + elif job["allow_failure"]: self.optional_job_failures.append(job) else: self.mandatory_job_failures.append(job) @@ -113,13 +111,13 @@ def __render_jobs_section(self, header: str, jobs: list, buffer: io.StringIO): jobs_per_stage = defaultdict(list) for job in jobs: - jobs_per_stage[job.stage].append(job) + jobs_per_stage[job["stage"]].append(job) for stage, jobs in jobs_per_stage.items(): jobs_info = [] for job in jobs: - num_retries = len(job.retry_summary) - 1 - job_info = f"<{job.web_url}|{job.name}>" + num_retries = len(job["retry_summary"]) - 1 + job_info = f"<{job['url']}|{job['name']}>" if num_retries > 0: job_info += f" ({num_retries} retries)" @@ -133,7 +131,7 @@ def __render_jobs_section(self, header: str, jobs: list, buffer: io.StringIO): def __render_tests_section(self, buffer): print(self.TEST_SECTION_HEADER, file=buffer) for (test_name, test_package), jobs in self.failed_tests.items(): - job_list = ", ".join(f"<{job.web_url}|{job.name}>" for job in jobs[: self.MAX_JOBS_PER_TEST]) + job_list = ", ".join(f"<{job['url']}|{job['name']}>" for job in jobs[: self.MAX_JOBS_PER_TEST]) if len(jobs) > self.MAX_JOBS_PER_TEST: job_list += f" and {len(jobs) - self.MAX_JOBS_PER_TEST} more" print(f"- `{test_name}` from package `{test_package}` (in {job_list})", file=buffer) diff --git a/tasks/linter.py b/tasks/linter.py index d79cf02a7aea7..c6906ffaceeb1 100644 --- a/tasks/linter.py +++ b/tasks/linter.py @@ -8,9 +8,10 @@ from tasks.build_tags import compute_build_tags_for_flavor from tasks.flavor import AgentFlavor from tasks.go import run_golangci_lint -from tasks.libs.ciproviders.gitlab_api import ( +from tasks.libs.ciproviders.gitlab import ( + Gitlab, generate_gitlab_full_configuration, - get_gitlab_repo, + get_gitlab_token, get_preset_contexts, load_context, ) @@ -380,15 +381,15 @@ def gitlab_ci(_, test="all", custom_context=None): else: all_contexts = get_preset_contexts(test) print(f"We will tests {len(all_contexts)} contexts.") - agent = get_gitlab_repo() for context in all_contexts: print("Test gitlab configuration with context: ", context) config = generate_gitlab_full_configuration(".gitlab-ci.yml", dict(context)) - res = agent.ci_lint.create({"content": config}) - status = color_message("valid", "green") if res.valid else color_message("invalid", "red") + gitlab = Gitlab(api_token=get_gitlab_token()) + res = gitlab.lint(config) + status = color_message("valid", "green") if res["valid"] else color_message("invalid", "red") print(f"Config is {status}") - if len(res.warnings) > 0: - print(color_message(f"Warnings: {res.warnings}", "orange"), file=sys.stderr) - if not res.valid: - print(color_message(f"Errors: {res.errors}", "red"), file=sys.stderr) + if len(res["warnings"]) > 0: + print(color_message(f"Warnings: {res['warnings']}", "orange"), file=sys.stderr) + if not res["valid"]: + print(color_message(f"Errors: {res['errors']}", "red"), file=sys.stderr) raise Exit(code=1) diff --git a/tasks/notify.py b/tasks/notify.py index a5c8da6f26ce8..037f16be047a6 100644 --- a/tasks/notify.py +++ b/tasks/notify.py @@ -291,7 +291,7 @@ def update_statistics(job_executions): # Update statistics and collect consecutive failed jobs alert_jobs = {"consecutive": [], "cumulative": []} failed_jobs = 
get_failed_jobs(PROJECT_NAME, os.getenv("CI_PIPELINE_ID")) - failed_set = {job.name for job in failed_jobs.all_failures()} + failed_set = {job["name"] for job in failed_jobs.all_failures()} current_set = set(job_executions["jobs"].keys()) # Insert data for newly failing jobs new_failed_jobs = failed_set - current_set diff --git a/tasks/pipeline.py b/tasks/pipeline.py index 260d860f66f35..9a9478e7f9e8e 100644 --- a/tasks/pipeline.py +++ b/tasks/pipeline.py @@ -6,13 +6,11 @@ from datetime import datetime, timedelta, timezone import yaml -from gitlab import GitlabError -from gitlab.v4.objects import Project from invoke import task from invoke.exceptions import Exit from tasks.libs.ciproviders.github_api import GithubAPI -from tasks.libs.ciproviders.gitlab_api import get_gitlab_bot_token, get_gitlab_repo +from tasks.libs.ciproviders.gitlab import Gitlab, get_gitlab_bot_token, get_gitlab_token from tasks.libs.common.color import color_message from tasks.libs.common.utils import ( DEFAULT_BRANCH, @@ -56,7 +54,7 @@ def GitlabYamlLoader(): # Tasks to trigger pipelines -def check_deploy_pipeline(repo: Project, git_ref, release_version_6, release_version_7, repo_branch): +def check_deploy_pipeline(gitlab, git_ref, release_version_6, release_version_7, repo_branch): """ Run checks to verify a deploy pipeline is valid: - it targets a valid repo branch @@ -83,9 +81,9 @@ def check_deploy_pipeline(repo: Project, git_ref, release_version_6, release_ver if release_version_6 and match: # release_version_6 is not empty and git_ref matches v7 pattern, construct v6 tag and check. tag_name = "6." + "".join(match.groups()) - try: - repo.tags.get(tag_name) - except GitlabError: + gitlab_tag = gitlab.find_tag(tag_name) + + if ("name" not in gitlab_tag) or gitlab_tag["name"] != tag_name: print(f"Cannot find GitLab v6 tag {tag_name} while trying to build git ref {git_ref}") raise Exit(code=1) @@ -96,9 +94,9 @@ def check_deploy_pipeline(repo: Project, git_ref, release_version_6, release_ver if release_version_7 and match: # release_version_7 is not empty and git_ref matches v6 pattern, construct v7 tag and check. tag_name = "7." + "".join(match.groups()) - try: - repo.tags.get(tag_name) - except GitlabError: + gitlab_tag = gitlab.find_tag(tag_name) + + if ("name" not in gitlab_tag) or gitlab_tag["name"] != tag_name: print(f"Cannot find GitLab v7 tag {tag_name} while trying to build git ref {git_ref}") raise Exit(code=1) @@ -112,7 +110,8 @@ def clean_running_pipelines(ctx, git_ref=DEFAULT_BRANCH, here=False, use_latest_ should be cancelled. 
""" - agent = get_gitlab_repo() + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.test_project_found() if here: git_ref = ctx.run("git rev-parse --abbrev-ref HEAD", hide=True).stdout.strip() @@ -125,14 +124,14 @@ def clean_running_pipelines(ctx, git_ref=DEFAULT_BRANCH, here=False, use_latest_ elif not sha: print(f"Git sha not provided, fetching all running pipelines on {git_ref}") - pipelines = get_running_pipelines_on_same_ref(agent, git_ref, sha) + pipelines = get_running_pipelines_on_same_ref(gitlab, git_ref, sha) print( f"Found {len(pipelines)} running pipeline(s) matching the request.", "They are ordered from the newest one to the oldest one.\n", sep='\n', ) - cancel_pipelines_with_confirmation(agent, pipelines) + cancel_pipelines_with_confirmation(gitlab, pipelines) def workflow_rules(gitlab_file=".gitlab-ci.yml"): @@ -176,33 +175,37 @@ def auto_cancel_previous_pipelines(ctx): if not os.environ.get('GITLAB_TOKEN'): raise Exit("GITLAB_TOKEN variable needed to cancel pipelines on the same ref.", 1) + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.test_project_found() + git_ref = os.getenv("CI_COMMIT_REF_NAME") git_sha = os.getenv("CI_COMMIT_SHA") - repo = get_gitlab_repo() - pipelines = get_running_pipelines_on_same_ref(repo, git_ref) - pipelines_without_current = [p for p in pipelines if p.sha != git_sha] + pipelines = get_running_pipelines_on_same_ref(gitlab, git_ref) + pipelines_without_current = [p for p in pipelines if p["sha"] != git_sha] for pipeline in pipelines_without_current: # We cancel pipeline only if it correspond to a commit that is an ancestor of the current commit - is_ancestor = ctx.run(f'git merge-base --is-ancestor {pipeline.sha} {git_sha}', warn=True, hide="both") + is_ancestor = ctx.run(f'git merge-base --is-ancestor {pipeline["sha"]} {git_sha}', warn=True, hide="both") if is_ancestor.exited == 0: - print(f'Gracefully canceling jobs that are not canceled on pipeline {pipeline.id} ({pipeline.web_url})') - gracefully_cancel_pipeline(repo, pipeline, force_cancel_stages=["package_build"]) + print( + f'Gracefully canceling jobs that are not canceled on pipeline {pipeline["id"]} ({pipeline["web_url"]})' + ) + gracefully_cancel_pipeline(gitlab, pipeline, force_cancel_stages=["package_build"]) elif is_ancestor.exited == 1: - print(f'{pipeline.sha} is not an ancestor of {git_sha}, not cancelling pipeline {pipeline.id}') + print(f'{pipeline["sha"]} is not an ancestor of {git_sha}, not cancelling pipeline {pipeline["id"]}') elif is_ancestor.exited == 128: min_time_before_cancel = 5 print( - f'Could not determine if {pipeline.sha} is an ancestor of {git_sha}, probably because it has been deleted from the history because of force push' + f'Could not determine if {pipeline["sha"]} is an ancestor of {git_sha}, probably because it has been deleted from the history because of force push' ) - if datetime.strptime(pipeline.created_at, "%Y-%m-%dT%H:%M:%S.%fZ") < datetime.now() - timedelta( + if datetime.strptime(pipeline["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") < datetime.now() - timedelta( minutes=min_time_before_cancel ): print( - f'Pipeline started earlier than {min_time_before_cancel} minutes ago, gracefully canceling pipeline {pipeline.id}' + f'Pipeline started earlier than {min_time_before_cancel} minutes ago, gracefully canceling pipeline {pipeline["id"]}' ) - gracefully_cancel_pipeline(repo, pipeline, force_cancel_stages=["package_build"]) + gracefully_cancel_pipeline(gitlab, pipeline, force_cancel_stages=["package_build"]) else: print(is_ancestor.stderr) raise 
Exit(1) @@ -263,7 +266,8 @@ def run( inv pipeline.run --deploy --use-release-entries --major-versions "6,7" --git-ref "7.32.0" --repo-branch "stable" """ - repo = get_gitlab_repo() + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.test_project_found() if (not git_ref and not here) or (git_ref and here): raise Exit("ERROR: Exactly one of --here or --git-ref must be specified.", code=1) @@ -286,7 +290,7 @@ def run( if deploy: # Check the validity of the deploy pipeline - check_deploy_pipeline(repo, git_ref, release_version_6, release_version_7, repo_branch) + check_deploy_pipeline(gitlab, git_ref, release_version_6, release_version_7, repo_branch) # Force all builds and kitchen tests to be run if not all_builds: print( @@ -305,7 +309,7 @@ def run( ) e2e_tests = True - pipelines = get_running_pipelines_on_same_ref(repo, git_ref) + pipelines = get_running_pipelines_on_same_ref(gitlab, git_ref) if pipelines: print( @@ -315,11 +319,11 @@ def run( "They are ordered from the newest one to the oldest one.\n", sep='\n', ) - cancel_pipelines_with_confirmation(repo, pipelines) + cancel_pipelines_with_confirmation(gitlab, pipelines) try: - pipeline = trigger_agent_pipeline( - repo, + pipeline_id = trigger_agent_pipeline( + gitlab, git_ref, release_version_6, release_version_7, @@ -334,7 +338,7 @@ def run( print(color_message(f"ERROR: pipeline does not match any workflow rule. Rules:\n{workflow_rules()}", "red")) return - wait_for_pipeline(repo, pipeline) + wait_for_pipeline(gitlab, pipeline_id) @task @@ -352,7 +356,8 @@ def follow(ctx, id=None, git_ref=None, here=False, project_name="DataDog/datadog inv pipeline.follow --id 1234567 """ - repo = get_gitlab_repo(project_name) + gitlab = Gitlab(project_name=project_name, api_token=get_gitlab_token()) + gitlab.test_project_found() args_given = 0 if id is not None: @@ -368,25 +373,22 @@ def follow(ctx, id=None, git_ref=None, here=False, project_name="DataDog/datadog ) if id is not None: - pipeline = repo.pipelines.get(id) - wait_for_pipeline(repo, pipeline) + wait_for_pipeline(gitlab, id) elif git_ref is not None: - wait_for_pipeline_from_ref(repo, git_ref) + wait_for_pipeline_from_ref(gitlab, git_ref) elif here: git_ref = ctx.run("git rev-parse --abbrev-ref HEAD", hide=True).stdout.strip() - wait_for_pipeline_from_ref(repo, git_ref) + wait_for_pipeline_from_ref(gitlab, git_ref) -def wait_for_pipeline_from_ref(repo: Project, ref): - # Get last updated pipeline - pipelines = repo.pipelines.list(ref=ref, per_page=1, order_by='updated_at') - if len(pipelines) == 0: +def wait_for_pipeline_from_ref(gitlab, ref): + pipeline = gitlab.last_pipeline_for_ref(ref) + if pipeline is not None: + wait_for_pipeline(gitlab, pipeline['id']) + else: print(f"No pipelines found for {ref}") raise Exit(code=1) - pipeline = pipelines[0] - wait_for_pipeline(repo, pipeline) - @task(iterable=['variable']) def trigger_child_pipeline(_, git_ref, project_name, variable=None, follow=True): @@ -400,9 +402,9 @@ def trigger_child_pipeline(_, git_ref, project_name, variable=None, follow=True) Use --follow to make this task wait for the pipeline to finish, and return 1 if it fails. (requires GITLAB_TOKEN). 
Examples: - inv pipeline.trigger-child-pipeline --git-ref "main" --project-name "DataDog/agent-release-management" --variable "RELEASE_VERSION" + inv pipeline.trigger-child-pipeline --git-ref "master" --project-name "DataDog/agent-release-management" --variables "RELEASE_VERSION" - inv pipeline.trigger-child-pipeline --git-ref "main" --project-name "DataDog/agent-release-management" --variable "VAR1" --variable "VAR2" --variable "VAR3" + inv pipeline.trigger-child-pipeline --git-ref "master" --project-name "DataDog/agent-release-management" --variables "VAR1,VAR2,VAR3" """ if not os.environ.get('CI_JOB_TOKEN'): @@ -416,7 +418,7 @@ def trigger_child_pipeline(_, git_ref, project_name, variable=None, follow=True) # set, but trigger_pipeline doesn't use it os.environ["GITLAB_TOKEN"] = os.environ['CI_JOB_TOKEN'] - repo = get_gitlab_repo(project_name) + gitlab = Gitlab(project_name=project_name, api_token=get_gitlab_token()) data = {"token": os.environ['CI_JOB_TOKEN'], "ref": git_ref, "variables": {}} @@ -441,22 +443,23 @@ def trigger_child_pipeline(_, git_ref, project_name, variable=None, follow=True) flush=True, ) - try: - data['variables'] = [{'key': key, 'value': value} for (key, value) in data['variables'].items()] + res = gitlab.trigger_pipeline(data) - pipeline = repo.pipelines.create(data) - except GitlabError as e: - raise Exit(f"Failed to create child pipeline: {e}", code=1) + if 'id' not in res: + raise Exit(f"Failed to create child pipeline: {res}", code=1) - print(f"Created a child pipeline with id={pipeline.id}, url={pipeline.web_url}", flush=True) + pipeline_id = res['id'] + pipeline_url = res['web_url'] + print(f"Created a child pipeline with id={pipeline_id}, url={pipeline_url}", flush=True) if follow: print("Waiting for child pipeline to finish...", flush=True) - wait_for_pipeline(repo, pipeline) + wait_for_pipeline(gitlab, pipeline_id) # Check pipeline status - pipestatus = pipeline.status.lower().strip() + pipeline = gitlab.pipeline(pipeline_id) + pipestatus = pipeline["status"].lower().strip() if pipestatus != "success": raise Exit(f"Error: child pipeline status {pipestatus.title()}", code=1) @@ -579,16 +582,21 @@ def changelog(ctx, new_commit_sha): ) +def _init_pipeline_schedule_task(): + gitlab = Gitlab(api_token=get_gitlab_bot_token()) + gitlab.test_project_found() + return gitlab + + @task def get_schedules(_): """ Pretty-print all pipeline schedules on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - for sched in repo.pipelineschedules.list(per_page=100, all=True): - sched.pprint() + gitlab = _init_pipeline_schedule_task() + for ps in gitlab.all_pipeline_schedules(): + pprint.pprint(ps) @task @@ -597,11 +605,9 @@ def get_schedule(_, schedule_id): Pretty-print a single pipeline schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.get(schedule_id) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.pipeline_schedule(schedule_id) + pprint.pprint(result) @task @@ -612,13 +618,9 @@ def create_schedule(_, description, ref, cron, cron_timezone=None, active=False) Note that unless you explicitly specify the --active flag, the schedule will be created as inactive. 
""" - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.create( - {'description': description, 'ref': ref, 'cron': cron, 'cron_timezone': cron_timezone, 'active': active} - ) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.create_pipeline_schedule(description, ref, cron, cron_timezone, active) + pprint.pprint(result) @task @@ -627,14 +629,9 @@ def edit_schedule(_, schedule_id, description=None, ref=None, cron=None, cron_ti Edit an existing pipeline schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - data = {'description': description, 'ref': ref, 'cron': cron, 'cron_timezone': cron_timezone} - data = {key: value for (key, value) in data.items() if value is not None} - - sched = repo.pipelineschedules.update(schedule_id, data) - - pprint.pprint(sched) + gitlab = _init_pipeline_schedule_task() + result = gitlab.edit_pipeline_schedule(schedule_id, description, ref, cron, cron_timezone) + pprint.pprint(result) @task @@ -643,11 +640,9 @@ def activate_schedule(_, schedule_id): Activate an existing pipeline schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.update(schedule_id, {'active': True}) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.edit_pipeline_schedule(schedule_id, active=True) + pprint.pprint(result) @task @@ -656,11 +651,9 @@ def deactivate_schedule(_, schedule_id): Deactivate an existing pipeline schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.update(schedule_id, {'active': False}) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.edit_pipeline_schedule(schedule_id, active=False) + pprint.pprint(result) @task @@ -669,11 +662,9 @@ def delete_schedule(_, schedule_id): Delete an existing pipeline schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - repo.pipelineschedules.delete(schedule_id) - - print('Deleted schedule', schedule_id) + gitlab = _init_pipeline_schedule_task() + result = gitlab.delete_pipeline_schedule(schedule_id) + pprint.pprint(result) @task @@ -682,12 +673,9 @@ def create_schedule_variable(_, schedule_id, key, value): Create a variable for an existing schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.get(schedule_id) - sched.variables.create({'key': key, 'value': value}) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.create_pipeline_schedule_variable(schedule_id, key, value) + pprint.pprint(result) @task @@ -696,12 +684,9 @@ def edit_schedule_variable(_, schedule_id, key, value): Edit an existing variable for a schedule on the repository. """ - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.get(schedule_id) - sched.variables.update(key, {'value': value}) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.edit_pipeline_schedule_variable(schedule_id, key, value) + pprint.pprint(result) @task @@ -710,12 +695,9 @@ def delete_schedule_variable(_, schedule_id, key): Delete an existing variable for a schedule on the repository. 
""" - repo = get_gitlab_repo(token=get_gitlab_bot_token()) - - sched = repo.pipelineschedules.get(schedule_id) - sched.variables.delete(key) - - sched.pprint() + gitlab = _init_pipeline_schedule_task() + result = gitlab.delete_pipeline_schedule_variable(schedule_id, key) + pprint.pprint(result) @task( @@ -926,28 +908,28 @@ def test_merge_queue(ctx): pr.create_issue_comment("/merge") # Search for the generated pipeline print(f"PR {pr.html_url} is waiting for MQ pipeline generation") - agent = get_gitlab_repo() + gitlab = Gitlab(api_token=get_gitlab_token()) max_attempts = 5 for attempt in range(max_attempts): time.sleep(30) - pipelines = agent.pipelines.list(per_page=100) + pipelines = gitlab.last_pipelines() try: - pipeline = next(p for p in pipelines if p.ref.startswith(f"mq-working-branch-{test_main}")) - print(f"Pipeline found: {pipeline.web_url}") + pipeline = next(p for p in pipelines if p["ref"].startswith(f"mq-working-branch-{test_main}")) + print(f"Pipeline found: {pipeline['web_url']}") break except StopIteration: if attempt == max_attempts - 1: raise RuntimeError("No pipeline found for the merge queue") continue - success = pipeline.status == "running" + success = pipeline["status"] == "running" if success: print("Pipeline correctly created, congrats") else: - print(f"[ERROR] Impossible to generate a pipeline for the merge queue, please check {pipeline.web_url}") + print(f"[ERROR] Impossible to generate a pipeline for the merge queue, please check {pipeline['web_url']}") # Clean up print("Cleaning up") if success: - pipeline.cancel() + gitlab.cancel_pipeline(pipeline["id"]) pr.edit(state="closed") ctx.run(f"git checkout {current_branch}", hide=True) ctx.run(f"git branch -D {test_main}", hide=True) diff --git a/tasks/release.py b/tasks/release.py index 27eb8813640da..9b882ccabdbb0 100644 --- a/tasks/release.py +++ b/tasks/release.py @@ -11,12 +11,11 @@ from datetime import date from time import sleep -from gitlab import GitlabError from invoke import Failure, task from invoke.exceptions import Exit from tasks.libs.ciproviders.github_api import GithubAPI -from tasks.libs.ciproviders.gitlab_api import get_gitlab_repo +from tasks.libs.ciproviders.gitlab import Gitlab, get_gitlab_token from tasks.libs.common.color import color_message from tasks.libs.common.user_interactions import yes_no_question from tasks.libs.common.utils import ( @@ -1333,7 +1332,7 @@ def build_rc(ctx, major_versions="6,7", patch_version=False, k8s_deployments=Fal if sys.version_info[0] < 3: return Exit(message="Must use Python 3 for this task", code=1) - datadog_agent = get_gitlab_repo() + gitlab = Gitlab(project_name=GITHUB_REPO_NAME, api_token=get_gitlab_token()) list_major_versions = parse_major_versions(major_versions) # Get the version of the highest major: needed for tag_version and to know @@ -1382,11 +1381,7 @@ def build_rc(ctx, major_versions="6,7", patch_version=False, k8s_deployments=Fal print(color_message(f"Waiting until the {new_version} tag appears in Gitlab", "bold")) gitlab_tag = None while not gitlab_tag: - try: - gitlab_tag = datadog_agent.tags.get(str(new_version)) - except GitlabError: - continue - + gitlab_tag = gitlab.find_tag(str(new_version)).get("name", None) sleep(5) print(color_message("Creating RC pipeline", "bold")) diff --git a/tasks/unit-tests/gitlab_api_tests.py b/tasks/unit-tests/gitlab_api_tests.py index 24399f816c8bd..ad618b0d380ff 100644 --- a/tasks/unit-tests/gitlab_api_tests.py +++ b/tasks/unit-tests/gitlab_api_tests.py @@ -1,6 +1,97 @@ import unittest +from itertools 
import cycle +from unittest import mock -from tasks.libs.ciproviders.gitlab_api import generate_gitlab_full_configuration, read_includes +from invoke.exceptions import Exit + +from tasks.libs.ciproviders.gitlab import Gitlab, generate_gitlab_full_configuration, get_gitlab_token, read_includes +from tasks.libs.common.remote_api import APIError + + +class MockResponse: + def __init__(self, content, status_code): + self.content = content + self.status_code = status_code + + def json(self): + return self.content + + +#################### FAIL REQUEST ##################### + + +def fail_not_found_request(*_args, **_kwargs): + return MockResponse([], 404) + + +##################### MOCKED GITLAB ##################### + + +def mocked_502_gitlab_requests(*_args, **_kwargs): + return MockResponse( + "\r\n502 Bad Gateway\r\n\r\n
<center><h1>502 Bad Gateway</h1></center>
\r\n\r\n\r\n", + 502, + ) + + +def mocked_gitlab_project_request(*_args, **_kwargs): + return MockResponse("name", 200) + + +class SideEffect: + def __init__(self, *fargs): + self.functions = cycle(fargs) + + def __call__(self, *args, **kwargs): + func = next(self.functions) + return func(*args, **kwargs) + + +class TestStatusCode5XX(unittest.TestCase): + @mock.patch('requests.get', side_effect=SideEffect(mocked_502_gitlab_requests, mocked_gitlab_project_request)) + def test_gitlab_one_fail_one_success(self, _): + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.requests_sleep_time = 0 + gitlab.test_project_found() + + @mock.patch( + 'requests.get', + side_effect=SideEffect( + mocked_502_gitlab_requests, + mocked_502_gitlab_requests, + mocked_502_gitlab_requests, + mocked_502_gitlab_requests, + mocked_gitlab_project_request, + ), + ) + def test_gitlab_last_one_success(self, _): + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.requests_sleep_time = 0 + gitlab.test_project_found() + + @mock.patch('requests.get', side_effect=SideEffect(mocked_502_gitlab_requests)) + def test_gitlab_full_fail(self, _): + failed = False + try: + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.requests_sleep_time = 0 + gitlab.test_project_found() + except Exit: + failed = True + if not failed: + Exit("GitlabAPI was expected to fail") + + @mock.patch('requests.get', side_effect=SideEffect(fail_not_found_request, mocked_gitlab_project_request)) + def test_gitlab_real_fail(self, _): + failed = False + try: + gitlab = Gitlab(api_token=get_gitlab_token()) + gitlab.requests_sleep_time = 0 + gitlab.test_project_found() + except APIError: + failed = True + if not failed: + Exit("GitlabAPI was expected to fail") class TestReadIncludes(unittest.TestCase): diff --git a/tasks/unit-tests/notify_tests.py b/tasks/unit-tests/notify_tests.py index 8d3c5b7173d1a..01b54f89170c3 100644 --- a/tasks/unit-tests/notify_tests.py +++ b/tasks/unit-tests/notify_tests.py @@ -2,10 +2,8 @@ import os import pathlib import unittest -from typing import List from unittest.mock import MagicMock, patch -from gitlab.v4.objects import ProjectJob from invoke import MockContext, Result from invoke.exceptions import UnexpectedExit @@ -13,95 +11,67 @@ from tasks.libs.types.types import FailedJobs, FailedJobType -def get_fake_jobs() -> List[ProjectJob]: - with open("tasks/unit-tests/testdata/jobs.json") as f: - jobs = json.load(f) - - return [ProjectJob(MagicMock(), attrs=job) for job in jobs] - - class TestSendMessage(unittest.TestCase): - @patch('tasks.libs.ciproviders.gitlab_api.get_gitlab_api') - def test_merge(self, api_mock): - repo_mock = api_mock.return_value.projects.get.return_value - repo_mock.jobs.get.return_value.trace.return_value = b"Log trace" - list_mock = repo_mock.pipelines.get.return_value.jobs.list - list_mock.side_effect = [get_fake_jobs(), []] - notify.send_message(MockContext(), notification_type="merge", print_to_stdout=True) - list_mock.assert_called() - @patch("tasks.notify.get_failed_jobs") def test_merge_without_get_failed_call(self, get_failed_jobs_mock): failed = FailedJobs() failed.add_failed_job( - ProjectJob( - MagicMock(), - attrs={ - "name": "job1", - "stage": "stage1", - "retry_summary": [], - "web_url": "http://www.job.com", - "failure_type": FailedJobType.INFRA_FAILURE, - "allow_failure": False, - }, - ) + { + "name": "job1", + "stage": "stage1", + "retry_summary": [], + "url": "http://www.job.com", + "failure_type": FailedJobType.INFRA_FAILURE, + "allow_failure": False, + } ) 
failed.add_failed_job( - ProjectJob( - MagicMock(), - attrs={ - "name": "job2", - "stage": "stage2", - "retry_summary": [], - "web_url": "http://www.job.com", - "failure_type": FailedJobType.INFRA_FAILURE, - "allow_failure": True, - }, - ) + { + "name": "job2", + "stage": "stage2", + "retry_summary": [], + "url": "http://www.job.com", + "failure_type": FailedJobType.INFRA_FAILURE, + "allow_failure": True, + } ) failed.add_failed_job( - ProjectJob( - MagicMock(), - attrs={ - "name": "job3", - "stage": "stage3", - "retry_summary": [], - "web_url": "http://www.job.com", - "failure_type": FailedJobType.JOB_FAILURE, - "allow_failure": False, - }, - ) + { + "name": "job3", + "stage": "stage3", + "retry_summary": [], + "url": "http://www.job.com", + "failure_type": FailedJobType.JOB_FAILURE, + "allow_failure": False, + } ) failed.add_failed_job( - ProjectJob( - MagicMock(), - attrs={ - "name": "job4", - "stage": "stage4", - "retry_summary": [], - "web_url": "http://www.job.com", - "failure_type": FailedJobType.JOB_FAILURE, - "allow_failure": True, - }, - ) + { + "name": "job4", + "stage": "stage4", + "retry_summary": [], + "url": "http://www.job.com", + "failure_type": FailedJobType.JOB_FAILURE, + "allow_failure": True, + } ) get_failed_jobs_mock.return_value = failed notify.send_message(MockContext(), notification_type="merge", print_to_stdout=True) get_failed_jobs_mock.assert_called() - @patch('tasks.libs.ciproviders.gitlab_api.get_gitlab_api') - def test_merge_with_get_failed_call(self, api_mock): - repo_mock = api_mock.return_value.projects.get.return_value - trace_mock = repo_mock.jobs.get.return_value.trace - list_mock = repo_mock.pipelines.get.return_value.jobs.list - - trace_mock.return_value = b"no basic auth credentials" - list_mock.return_value = get_fake_jobs() - + @patch("requests.get") + def test_merge_with_get_failed_call(self, get_mock): + with open("tasks/unit-tests/testdata/jobs.json") as f: + jobs = json.load(f) + job_list = {"json.return_value": jobs} + no_jobs = {"json.return_value": ""} + get_mock.side_effect = [ + MagicMock(status_code=200, **job_list), + MagicMock(status_code=200, **no_jobs), + MagicMock(status_code=200, text="no basic auth credentials"), + ] notify.send_message(MockContext(), notification_type="merge", print_to_stdout=True) - - trace_mock.assert_called() - list_mock.assert_called() + get_mock.assert_called() def test_post_to_channel1(self): self.assertTrue(notify._should_send_message_to_channel('main', default_branch='main')) @@ -132,40 +102,39 @@ def test_post_to_author5(self): class TestSendStats(unittest.TestCase): - @patch('tasks.libs.ciproviders.gitlab_api.get_gitlab_api') + @patch("requests.get") @patch("tasks.notify.create_count", new=MagicMock()) - def test_nominal(self, api_mock): - repo_mock = api_mock.return_value.projects.get.return_value - trace_mock = repo_mock.jobs.get.return_value.trace - list_mock = repo_mock.pipelines.get.return_value.jobs.list - - trace_mock.return_value = b"E2E INTERNAL ERROR" - list_mock.return_value = get_fake_jobs() - + def test_nominal(self, get_mock): + with open("tasks/unit-tests/testdata/jobs.json") as f: + jobs = json.load(f) + job_list = {"json.return_value": jobs} + no_jobs = {"json.return_value": ""} + get_mock.side_effect = [ + MagicMock(status_code=200, **job_list), + MagicMock(status_code=200, **no_jobs), + MagicMock(status_code=200, text="E2E INTERNAL ERROR"), + ] notify.send_stats(MockContext(), print_to_stdout=True) - - trace_mock.assert_called() - list_mock.assert_called() + get_mock.assert_called() 
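Note: the rewritten tests above drop the python-gitlab client mocks and instead patch `requests.get` directly, using `unittest.mock.patch` with a `side_effect` sequence so each successive HTTP call made by the task under test receives the next canned response. A minimal, self-contained sketch of that pattern (the `fetch_status` helper and the URL are illustrative only, not part of this patch):

import unittest
from unittest import mock

import requests


def fetch_status(url):
    # Hypothetical helper standing in for the code under test: returns the JSON body of a GET.
    return requests.get(url).json()


class TestSideEffectPattern(unittest.TestCase):
    @mock.patch("requests.get")
    def test_successive_responses(self, get_mock):
        # Each call to requests.get consumes the next mock in the side_effect list.
        get_mock.side_effect = [
            mock.MagicMock(status_code=200, **{"json.return_value": {"status": "running"}}),
            mock.MagicMock(status_code=200, **{"json.return_value": {"status": "success"}}),
        ]
        self.assertEqual(fetch_status("https://example.invalid/api")["status"], "running")
        self.assertEqual(fetch_status("https://example.invalid/api")["status"], "success")
        get_mock.assert_called()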
class TestCheckConsistentFailures(unittest.TestCase): - @patch('tasks.libs.ciproviders.gitlab_api.get_gitlab_api') - def test_nominal(self, api_mock): + @patch("requests.get") + def test_nominal(self, get_mock): os.environ["CI_PIPELINE_ID"] = "456" - - repo_mock = api_mock.return_value.projects.get.return_value - trace_mock = repo_mock.jobs.get.return_value.trace - list_mock = repo_mock.pipelines.get.return_value.jobs.list - - trace_mock.return_value = b"net/http: TLS handshake timeout" - list_mock.return_value = get_fake_jobs() - + with open("tasks/unit-tests/testdata/jobs.json") as f: + jobs = json.load(f) + job_list = {"json.return_value": jobs} + no_jobs = {"json.return_value": ""} + get_mock.side_effect = [ + MagicMock(status_code=200, **job_list), + MagicMock(status_code=200, **no_jobs), + MagicMock(status_code=200, text="net/http: TLS handshake timeout"), + ] notify.check_consistent_failures( MockContext(run=Result("test")), "tasks/unit-tests/testdata/job_executions.json" ) - - trace_mock.assert_called() - list_mock.assert_called() + get_mock.assert_called() class TestRetrieveJobExecutionsCreated(unittest.TestCase): @@ -204,9 +173,7 @@ class TestUpdateStatistics(unittest.TestCase): @patch('tasks.notify.get_failed_jobs') def test_nominal(self, mock_get_failed): failed_jobs = mock_get_failed.return_value - failed_jobs.all_failures.return_value = [ - ProjectJob(MagicMock(), attrs=a) for a in [{"name": "nifnif"}, {"name": "nafnaf"}] - ] + failed_jobs.all_failures.return_value = [{"name": "nifnif"}, {"name": "nafnaf"}] j = { "jobs": { "nafnaf": {"consecutive_failures": 2, "cumulative_failures": [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]}, @@ -228,9 +195,7 @@ def test_nominal(self, mock_get_failed): @patch('tasks.notify.get_failed_jobs') def test_multiple_failures(self, mock_get_failed): failed_jobs = mock_get_failed.return_value - failed_jobs.all_failures.return_value = [ - ProjectJob(MagicMock(), attrs=a) for a in [{"name": "poulidor"}, {"name": "virenque"}, {"name": "bardet"}] - ] + failed_jobs.all_failures.return_value = [{"name": "poulidor"}, {"name": "virenque"}, {"name": "bardet"}] j = { "jobs": { "poulidor": {"consecutive_failures": 8, "cumulative_failures": [0, 0, 1, 1, 1, 1, 1, 1, 1, 1]}, From 57f095ff9b1dc470e840a5b0471013cf7c60655e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hugo=20Beauz=C3=A9e-Luyssen?= Date: Fri, 12 Apr 2024 17:44:49 +0200 Subject: [PATCH 2/9] CI: kitchen_deploy: reduce contention when deploying debian packages (#24610) The kitchen_deploy jobs are publishing to a dedicated repo for each pipeline, meaning there's no need to protect against corruption coming from another pipeline --- .gitlab/kitchen_deploy/kitchen_deploy.yml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.gitlab/kitchen_deploy/kitchen_deploy.yml b/.gitlab/kitchen_deploy/kitchen_deploy.yml index 246701370d1a5..bd34481a63dde 100644 --- a/.gitlab/kitchen_deploy/kitchen_deploy.yml +++ b/.gitlab/kitchen_deploy/kitchen_deploy.yml @@ -33,21 +33,10 @@ - filename=$(ls datadog-signing-keys*.deb); mv $filename datadog-signing-keys_${DD_PIPELINE_ID}.deb - popd -# Avoid simultaneous writes on the repo metadata file that made kitchen tests fail before -.deploy_deb_resource_group-a6: &deploy_deb_resource_group-a6 - resource_group: deploy_deb_a6 - -.deploy_deb_resource_group-a7: &deploy_deb_resource_group-a7 - resource_group: deploy_deb_a7 - -.deploy_deb_resource_group-i7: &deploy_deb_resource_group-i7 - resource_group: deploy_deb_i7 - .deploy_deb_testing-a6: stage: kitchen_deploy image: 
486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-builders/gitlab_agent_deploy:$DATADOG_AGENT_BUILDERS tags: ["arch:amd64"] - <<: *deploy_deb_resource_group-a6 variables: DD_PIPELINE_ID: $CI_PIPELINE_ID-a6 before_script: @@ -58,7 +47,6 @@ stage: kitchen_deploy image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-builders/gitlab_agent_deploy:$DATADOG_AGENT_BUILDERS tags: ["arch:amd64"] - <<: *deploy_deb_resource_group-i7 variables: DD_PIPELINE_ID: $CI_PIPELINE_ID-i7 before_script: @@ -104,7 +92,6 @@ deploy_deb_testing-a6_arm64: stage: kitchen_deploy image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-builders/gitlab_agent_deploy:$DATADOG_AGENT_BUILDERS tags: ["arch:amd64"] - <<: *deploy_deb_resource_group-a7 variables: DD_PIPELINE_ID: $CI_PIPELINE_ID-a7 before_script: From 1359713c94a521e71a95b3193cf5fc2b6c4fe363 Mon Sep 17 00:00:00 2001 From: Paul Cacheux Date: Fri, 12 Apr 2024 17:54:29 +0200 Subject: [PATCH 3/9] fix `inv -e security-agent.sync-secl-win-pkg` on macOS (#24646) --- tasks/security_agent.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tasks/security_agent.py b/tasks/security_agent.py index a938b55323fa8..0af5fc538de7e 100644 --- a/tasks/security_agent.py +++ b/tasks/security_agent.py @@ -945,5 +945,8 @@ def sync_secl_win_pkg(ctx): fto = ffrom ctx.run(f"cp pkg/security/secl/model/{ffrom} pkg/security/seclwin/model/{fto}") - ctx.run(f"sed -i '/^\\/\\/go:build/d' pkg/security/seclwin/model/{fto}") + if sys.platform == "darwin": + ctx.run(f"sed -i '' '/^\\/\\/go:build/d' pkg/security/seclwin/model/{fto}") + else: + ctx.run(f"sed -i '/^\\/\\/go:build/d' pkg/security/seclwin/model/{fto}") ctx.run(f"gofmt -s -w pkg/security/seclwin/model/{fto}") From 911bd7d0c8574dcc5fc206c70d1bdf08afabe918 Mon Sep 17 00:00:00 2001 From: Nicolas Schweitzer Date: Fri, 12 Apr 2024 17:57:39 +0200 Subject: [PATCH 4/9] feat(ci): Remove usage of unused image (#24639) * feat(ci): Remove usage of unused image * Remove reference on unused variables --- .gitlab-ci.yml | 6 +----- .gitlab/check_merge/do_not_merge.yml | 2 -- .gitlab/source_test/linux.yml | 4 ++-- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index baac10e636852..748a75f3fa7f5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -170,10 +170,6 @@ variables: DATADOG_AGENT_ARMBUILDIMAGES: v31802788-2dee8fe9 DATADOG_AGENT_SYSPROBE_BUILDIMAGES_SUFFIX: "" DATADOG_AGENT_SYSPROBE_BUILDIMAGES: v31802788-2dee8fe9 - DATADOG_AGENT_KERNEL_MATRIX_TESTING_BUILDIMAGES_SUFFIX: "" - DATADOG_AGENT_KERNEL_MATRIX_TESTING_BUILDIMAGES: v31802788-2dee8fe9 - DATADOG_AGENT_NIKOS_BUILDIMAGES_SUFFIX: "" - DATADOG_AGENT_NIKOS_BUILDIMAGES: v31802788-2dee8fe9 DATADOG_AGENT_BTF_GEN_BUILDIMAGES_SUFFIX: "" DATADOG_AGENT_BTF_GEN_BUILDIMAGES: v31802788-2dee8fe9 DATADOG_AGENT_BUILDERS: v28719426-b6a4fd9 @@ -367,7 +363,7 @@ variables: - .gitlab/container_build/fakeintake.yml - .gitlab/dev_container_deploy/fakeintake.yml compare_to: main # TODO: use a variable, when this is supported https://gitlab.com/gitlab-org/gitlab/-/issues/369916 - + # # Workflow rules # Rules used to define whether a pipeline should run, and with which variables diff --git a/.gitlab/check_merge/do_not_merge.yml b/.gitlab/check_merge/do_not_merge.yml index 877698f1d2881..b0405b10dd467 100644 --- a/.gitlab/check_merge/do_not_merge.yml +++ b/.gitlab/check_merge/do_not_merge.yml @@ -16,8 +16,6 @@ do-not-merge: [ ! -z "$DATADOG_AGENT_WINBUILDIMAGES_SUFFIX" ] || [ ! 
-z "$DATADOG_AGENT_ARMBUILDIMAGES_SUFFIX" ] || [ ! -z "$DATADOG_AGENT_SYSPROBE_BUILDIMAGES_SUFFIX" ] || - [ ! -z "$DATADOG_AGENT_KERNEL_MATRIX_TESTING_BUILDIMAGES_SUFFIX" ] || - [ ! -z "$DATADOG_AGENT_NIKOS_BUILDIMAGES_SUFFIX" ] || [ ! -z "$DATADOG_AGENT_BTF_GEN_BUILDIMAGES_SUFFIX" ] || [ ! -z "$TEST_INFRA_DEFINITIONS_BUILDIMAGES_SUFFIX" ]; then echo "Pull request uses non-empty BUILDIMAGES_SUFFIX variable" diff --git a/.gitlab/source_test/linux.yml b/.gitlab/source_test/linux.yml index 5304729be7918..561eb1a201077 100644 --- a/.gitlab/source_test/linux.yml +++ b/.gitlab/source_test/linux.yml @@ -157,7 +157,7 @@ tests_rpm-x64-py2: - !reference [.except_disable_unit_tests] - !reference [.except_mergequeue] - when: on_success - image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-buildimages/rpm_x64_testing$DATADOG_AGENT_BUILDIMAGES_SUFFIX:$DATADOG_AGENT_BUILDIMAGES + image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-buildimages/rpm_x64$DATADOG_AGENT_BUILDIMAGES_SUFFIX:$DATADOG_AGENT_BUILDIMAGES tags: ["arch:amd64"] variables: PYTHON_RUNTIMES: '2' @@ -168,7 +168,7 @@ tests_rpm-x64-py3: extends: - .rtloader_tests - .linux_tests_with_upload - image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-buildimages/rpm_x64_testing$DATADOG_AGENT_BUILDIMAGES_SUFFIX:$DATADOG_AGENT_BUILDIMAGES + image: 486234852809.dkr.ecr.us-east-1.amazonaws.com/ci/datadog-agent-buildimages/rpm_x64$DATADOG_AGENT_BUILDIMAGES_SUFFIX:$DATADOG_AGENT_BUILDIMAGES tags: ["arch:amd64"] variables: PYTHON_RUNTIMES: '3' From c0e41ecacc17ae029f928f1051962f9cbd42bf5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9lian=20Raimbault?= <161456554+CelianR@users.noreply.github.com> Date: Fri, 12 Apr 2024 18:27:39 +0200 Subject: [PATCH 5/9] [fix] Benchmark gitlab import error (#24647) * [fix-benchmark-gitlab-import] Test * [fix-benchmark-gitlab-import] Test * [fix-benchmark-gitlab-import] Cleaned code --- .gitlab/benchmarks/benchmarks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab/benchmarks/benchmarks.yml b/.gitlab/benchmarks/benchmarks.yml index 7152e9c0e2d86..b38fe9b56d894 100644 --- a/.gitlab/benchmarks/benchmarks.yml +++ b/.gitlab/benchmarks/benchmarks.yml @@ -10,6 +10,7 @@ benchmark: tags: ["team:apm-k8s-tweaked-metal-datadog-agent", "specific:true"] script: - export ARTIFACTS_DIR="$(pwd)/artifacts" && mkdir -p $ARTIFACTS_DIR + - pip install -r requirements.txt - ./test/benchmarks/apm_scripts/capture-hardware-software-info.sh - ./test/benchmarks/apm_scripts/run-benchmarks.sh - ./test/benchmarks/apm_scripts/analyze-results.sh From d4a7be27b1ca8e9fbad533d590682f793fbd429d Mon Sep 17 00:00:00 2001 From: Gustavo Caso Date: Fri, 12 Apr 2024 18:33:53 +0200 Subject: [PATCH 6/9] fix jmx and check command to have a valid settings component (#24635) --- cmd/agent/subcommands/jmx/command.go | 6 +++++- pkg/cli/subcommands/check/command.go | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cmd/agent/subcommands/jmx/command.go b/cmd/agent/subcommands/jmx/command.go index 239e1ea7392d1..1f29b02025a64 100644 --- a/cmd/agent/subcommands/jmx/command.go +++ b/cmd/agent/subcommands/jmx/command.go @@ -43,6 +43,7 @@ import ( "github.com/DataDog/datadog-agent/comp/core/log/logimpl" "github.com/DataDog/datadog-agent/comp/core/secrets" "github.com/DataDog/datadog-agent/comp/core/settings" + "github.com/DataDog/datadog-agent/comp/core/settings/settingsimpl" "github.com/DataDog/datadog-agent/comp/core/status" "github.com/DataDog/datadog-agent/comp/core/tagger" 
"github.com/DataDog/datadog-agent/comp/core/workloadmeta" @@ -141,6 +142,10 @@ func Commands(globalParams *command.GlobalParams) []*cobra.Command { workloadmeta.Module(), apiimpl.Module(), authtokenimpl.Module(), + // The jmx command do not have settings that change are runtime + // still, we need to pass it to ensure the API server is proprely initialized + settingsimpl.Module(), + fx.Supply(settings.Settings{}), // TODO(components): this is a temporary hack as the StartServer() method of the API package was previously called with nil arguments // This highlights the fact that the API Server created by JMX (through ExecJmx... function) should be different from the ones created // in others commands such as run. @@ -153,7 +158,6 @@ func Commands(globalParams *command.GlobalParams) []*cobra.Command { fx.Provide(func() inventoryagent.Component { return nil }), fx.Provide(func() inventoryhost.Component { return nil }), fx.Provide(func() demultiplexer.Component { return nil }), - fx.Provide(func() settings.Component { return nil }), fx.Provide(func() inventorychecks.Component { return nil }), fx.Provide(func() packagesigning.Component { return nil }), fx.Provide(func() optional.Option[rcservice.Component] { return optional.NewNoneOption[rcservice.Component]() }), diff --git a/pkg/cli/subcommands/check/command.go b/pkg/cli/subcommands/check/command.go index 79870a8ad1bae..55dfdfb18b07e 100644 --- a/pkg/cli/subcommands/check/command.go +++ b/pkg/cli/subcommands/check/command.go @@ -44,6 +44,7 @@ import ( "github.com/DataDog/datadog-agent/comp/core/log/logimpl" "github.com/DataDog/datadog-agent/comp/core/secrets" "github.com/DataDog/datadog-agent/comp/core/settings" + "github.com/DataDog/datadog-agent/comp/core/settings/settingsimpl" "github.com/DataDog/datadog-agent/comp/core/status" "github.com/DataDog/datadog-agent/comp/core/status/statusimpl" "github.com/DataDog/datadog-agent/comp/core/sysprobeconfig/sysprobeconfigimpl" @@ -208,7 +209,10 @@ func MakeCommand(globalParamsGetter func() GlobalParams) *cobra.Command { }, ), statusimpl.Module(), - + // The check command do not have settings that change are runtime + // still, we need to pass it to ensure the API server is proprely initialized + settingsimpl.Module(), + fx.Supply(settings.Settings{}), // TODO(components): this is a temporary hack as the StartServer() method of the API package was previously called with nil arguments // This highlights the fact that the API Server created by JMX (through ExecJmx... function) should be different from the ones created // in others commands such as run. 
@@ -217,7 +221,6 @@ func MakeCommand(globalParamsGetter func() GlobalParams) *cobra.Command { fx.Provide(func() replay.Component { return nil }), fx.Provide(func() pidmap.Component { return nil }), fx.Provide(func() serverdebug.Component { return nil }), - fx.Provide(func() settings.Component { return nil }), fx.Provide(func() host.Component { return nil }), fx.Provide(func() inventoryagent.Component { return nil }), fx.Provide(func() inventoryhost.Component { return nil }), From 44fb1ec7e8c4b0009ddc549b78af4c9217e8ddc6 Mon Sep 17 00:00:00 2001 From: Paul Cacheux Date: Fri, 12 Apr 2024 19:10:47 +0200 Subject: [PATCH 7/9] [CWS] make MacroEvaluator use the cached fields (#24652) --- pkg/security/secl/compiler/eval/macro.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pkg/security/secl/compiler/eval/macro.go b/pkg/security/secl/compiler/eval/macro.go index b01e5e4e218d2..2135c8f178fba 100644 --- a/pkg/security/secl/compiler/eval/macro.go +++ b/pkg/security/secl/compiler/eval/macro.go @@ -169,11 +169,5 @@ func (m *Macro) GetFields() []Field { // GetFields - Returns all the Field that the MacroEvaluator handles func (m *MacroEvaluator) GetFields() []Field { - fields := make([]Field, len(m.fieldValues)) - i := 0 - for key := range m.fieldValues { - fields[i] = key - i++ - } - return fields + return m.fields } From c4d753e27ea247e0cbb665b80cc9bf1ff29f9a25 Mon Sep 17 00:00:00 2001 From: Baptiste Foy Date: Fri, 12 Apr 2024 19:35:40 +0200 Subject: [PATCH 8/9] upgrade(installer): Add APM injector package installation support (#24372) * fix(errors): Clearer errors * upgrade(updater): Add injector support * fix(updater): Use privileged command to write * fix(updater): Update catalog and support ld.so.preload not existing * upgrade(updater): Add injector docker support * chore(updater): Make writing to ld.so.preload safer and remove experiment * remove catalog changes for less conflicts * fix(installer): Cleanup APM injector on setup failure * fix(updater): Remove APM injector on purge * test(installer): Add E2E test for injector installation * fix(installer): Add agent config support for apm injector & fix test * fix(installer): Add error message to helper commands and tentatively fix e2e * fix(installer): Fix e2e tests * refactor(installer): Refactor injector installation * refactor(installer): Manipulate files in go instead of string manipulation * feat(updater): Add lock to package installation commands * fix(tests): Version is not resolved anymore in docker's daemon.json * fix(installer): Allow installation of the injector before the agent * address part of the review * chore(installer): Add more tests * fix(tests): Skip some tests to be able to merge --- pkg/updater/install.go | 34 ++- pkg/updater/service/apm_inject.go | 356 ++++++++++++++++++++++ pkg/updater/service/apm_inject_test.go | 155 ++++++++++ pkg/updater/service/apm_inject_windows.go | 19 ++ pkg/updater/service/datadog_agent.go | 20 ++ pkg/updater/service/docker.go | 196 ++++++++++++ pkg/updater/service/docker_test.go | 137 +++++++++ pkg/updater/service/helper/main.go | 30 +- pkg/updater/service/systemd.go | 58 +++- pkg/updater/service/systemd_test.go | 13 +- pkg/updater/updater.go | 7 +- test/new-e2e/tests/updater/docker.go | 91 ++++++ test/new-e2e/tests/updater/linux_test.go | 141 ++++++++- 13 files changed, 1233 insertions(+), 24 deletions(-) create mode 100644 pkg/updater/service/apm_inject.go create mode 100644 pkg/updater/service/apm_inject_test.go create mode 100644 
pkg/updater/service/apm_inject_windows.go create mode 100644 pkg/updater/service/docker.go create mode 100644 pkg/updater/service/docker_test.go create mode 100644 test/new-e2e/tests/updater/docker.go diff --git a/pkg/updater/install.go b/pkg/updater/install.go index b18b000a4a354..092013a087f67 100644 --- a/pkg/updater/install.go +++ b/pkg/updater/install.go @@ -13,6 +13,7 @@ import ( "os" "path/filepath" "strings" + "sync" oci "github.com/google/go-containerregistry/pkg/v1" "github.com/google/go-containerregistry/pkg/v1/types" @@ -27,11 +28,15 @@ const ( datadogPackageConfigLayerMediaType types.MediaType = "application/vnd.datadog.package.config.layer.v1.tar+zstd" datadogPackageMaxSize = 3 << 30 // 3GiB defaultConfigsDir = "/etc" + + packageDatadogAgent = "datadog-agent" + packageAPMInjector = "datadog-apm-inject" ) type installer struct { repositories *repository.Repositories configsDir string + installLock sync.Mutex } func newInstaller(repositories *repository.Repositories) *installer { @@ -56,10 +61,17 @@ func (i *installer) installStable(pkg string, version string, image oci.Image) e if err != nil { return fmt.Errorf("could not create repository: %w", err) } - if pkg == "datadog-agent" { + + i.installLock.Lock() + defer i.installLock.Unlock() + switch pkg { + case packageDatadogAgent: return service.SetupAgentUnits() + case packageAPMInjector: + return service.SetupAPMInjector() + default: + return nil } - return nil } func (i *installer) installExperiment(pkg string, version string, image oci.Image) error { @@ -100,19 +112,25 @@ func (i *installer) uninstallExperiment(pkg string) error { } func (i *installer) startExperiment(pkg string) error { - // TODO(arthur): currently we only support the datadog-agent package - if pkg != "datadog-agent" { + i.installLock.Lock() + defer i.installLock.Unlock() + switch pkg { + case packageDatadogAgent: + return service.StartAgentExperiment() + default: return nil } - return service.StartAgentExperiment() } func (i *installer) stopExperiment(pkg string) error { - // TODO(arthur): currently we only support the datadog-agent package - if pkg != "datadog-agent" { + i.installLock.Lock() + defer i.installLock.Unlock() + switch pkg { + case packageDatadogAgent: + return service.StopAgentExperiment() + default: return nil } - return service.StopAgentExperiment() } func extractPackageLayers(image oci.Image, configDir string, packageDir string) error { diff --git a/pkg/updater/service/apm_inject.go b/pkg/updater/service/apm_inject.go new file mode 100644 index 0000000000000..4982b61c2a826 --- /dev/null +++ b/pkg/updater/service/apm_inject.go @@ -0,0 +1,356 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +//go:build !windows + +// Package service provides a way to interact with os services +package service + +import ( + "bytes" + "fmt" + "os" + "path" + "strings" + + "github.com/DataDog/datadog-agent/pkg/util/log" +) + +var ( + injectorConfigPrefix = []byte("# BEGIN LD PRELOAD CONFIG") + injectorConfigSuffix = []byte("# END LD PRELOAD CONFIG") +) + +const ( + injectorConfigTemplate = ` +apm_config: + receiver_socket: %s +use_dogstatsd: true +dogstatsd_socket: %s +` + datadogConfigPath = "/etc/datadog-agent/datadog.yaml" + ldSoPreloadPath = "/etc/ld.so.preload" +) + +// SetupAPMInjector sets up the injector at bootstrap +func SetupAPMInjector() error { + // Enforce dd-installer is in the dd-agent group + if err := setInstallerAgentGroup(); err != nil { + return err + } + + installer := &apmInjectorInstaller{ + installPath: "/opt/datadog-packages/datadog-apm-inject/stable", + } + return installer.Setup() +} + +// RemoveAPMInjector removes the APM injector +func RemoveAPMInjector() error { + installer := &apmInjectorInstaller{ + installPath: "/opt/datadog-packages/datadog-apm-inject/stable", + } + return installer.Remove() +} + +type apmInjectorInstaller struct { + installPath string +} + +// Setup sets up the APM injector +func (a *apmInjectorInstaller) Setup() error { + var err error + defer func() { + if err != nil { + removeErr := a.Remove() + if removeErr != nil { + log.Warnf("Failed to remove APM injector: %v", removeErr) + } + } + }() + if err := a.setAgentConfig(); err != nil { + return err + } + if err := a.setRunPermissions(); err != nil { + return err + } + if err := a.setLDPreloadConfig(); err != nil { + return err + } + if err := a.setDockerConfig(); err != nil { + return err + } + return nil +} + +func (a *apmInjectorInstaller) Remove() error { + if err := a.deleteAgentConfig(); err != nil { + return err + } + if err := a.deleteLDPreloadConfig(); err != nil { + return err + } + if err := a.deleteDockerConfig(); err != nil { + return err + } + return nil +} + +func (a *apmInjectorInstaller) setRunPermissions() error { + return os.Chmod(path.Join(a.installPath, "inject", "run"), 0777) +} + +// setLDPreloadConfig adds preload options on /etc/ld.so.preload, overriding existing ones +func (a *apmInjectorInstaller) setLDPreloadConfig() error { + var ldSoPreload []byte + stat, err := os.Stat(ldSoPreloadPath) + if err == nil { + ldSoPreload, err = os.ReadFile(ldSoPreloadPath) + if err != nil { + return err + } + } else if !os.IsNotExist(err) { + return err + } + + newLdSoPreload, err := a.setLDPreloadConfigContent(ldSoPreload) + if err != nil { + return err + } + if bytes.Equal(ldSoPreload, newLdSoPreload) { + // No changes needed + return nil + } + + perms := os.FileMode(0644) + if stat != nil { + perms = stat.Mode() + } + err = os.WriteFile("/tmp/ld.so.preload.tmp", newLdSoPreload, perms) + if err != nil { + return err + } + + return executeCommand(string(replaceLDPreloadCommand)) +} + +// setLDPreloadConfigContent sets the content of the LD preload configuration +func (a *apmInjectorInstaller) setLDPreloadConfigContent(ldSoPreload []byte) ([]byte, error) { + launcherPreloadPath := path.Join(a.installPath, "inject", "launcher.preload.so") + + if strings.Contains(string(ldSoPreload), launcherPreloadPath) { + // If the line of interest is already in /etc/ld.so.preload, return fast + return ldSoPreload, nil + } + + // Append the launcher preload path to the file + if len(ldSoPreload) > 0 && ldSoPreload[len(ldSoPreload)-1] != '\n' { + ldSoPreload = append(ldSoPreload, '\n') + } + 
ldSoPreload = append(ldSoPreload, []byte(launcherPreloadPath+"\n")...) + return ldSoPreload, nil +} + +// deleteLDPreloadConfig removes the preload options from /etc/ld.so.preload +func (a *apmInjectorInstaller) deleteLDPreloadConfig() error { + var ldSoPreload []byte + stat, err := os.Stat(ldSoPreloadPath) + if err == nil { + ldSoPreload, err = os.ReadFile(ldSoPreloadPath) + if err != nil { + return err + } + } else if !os.IsNotExist(err) { + return err + } else { + return nil + } + + newLdSoPreload, err := a.deleteLDPreloadConfigContent(ldSoPreload) + if err != nil { + return err + } + if bytes.Equal(ldSoPreload, newLdSoPreload) { + // No changes needed + return nil + } + + perms := os.FileMode(0644) + if stat != nil { + perms = stat.Mode() + } + err = os.WriteFile("/tmp/ld.so.preload.tmp", newLdSoPreload, perms) + if err != nil { + return err + } + + return executeCommand(string(replaceLDPreloadCommand)) +} + +// deleteLDPreloadConfigContent deletes the content of the LD preload configuration +func (a *apmInjectorInstaller) deleteLDPreloadConfigContent(ldSoPreload []byte) ([]byte, error) { + launcherPreloadPath := path.Join(a.installPath, "inject", "launcher.preload.so") + + if !strings.Contains(string(ldSoPreload), launcherPreloadPath) { + // If the line of interest isn't there, return fast + return ldSoPreload, nil + } + + // Possible configurations of the preload path, order matters + replacementsToTest := [][]byte{ + []byte(launcherPreloadPath + "\n"), + []byte("\n" + launcherPreloadPath), + []byte(launcherPreloadPath + " "), + []byte(" " + launcherPreloadPath), + } + for _, replacement := range replacementsToTest { + ldSoPreloadNew := bytes.Replace(ldSoPreload, replacement, []byte{}, 1) + if !bytes.Equal(ldSoPreloadNew, ldSoPreload) { + return ldSoPreloadNew, nil + } + } + if bytes.Equal(ldSoPreload, []byte(launcherPreloadPath)) { + // If the line is the only one in the file without newlines, return an empty file + return []byte{}, nil + } + + return nil, fmt.Errorf("failed to remove %s from %s", launcherPreloadPath, ldSoPreloadPath) +} + +// setAgentConfig adds the agent configuration for the APM injector if it is not there already +// We assume that the agent file has been created by the installer's postinst script +// +// Note: This is not safe, as it assumes there were no changes to the agent configuration made without +// restart by the user. This means that the agent can crash on restart. This is a limitation of the current +// installer system and this will be replaced by a proper experiment when available. 
This is a temporary +// solution to allow the APM injector to be installed, and if the agent crashes, we try to detect it and +// restore the previous configuration +func (a *apmInjectorInstaller) setAgentConfig() (err error) { + err = backupAgentConfig() + if err != nil { + return err + } + defer func() { + if err != nil { + restoreErr := restoreAgentConfig() + if restoreErr != nil { + log.Warnf("Failed to restore agent config: %v", restoreErr) + } + } + }() + + content, err := os.ReadFile(datadogConfigPath) + if err != nil { + return err + } + + newContent := a.setAgentConfigContent(content) + if bytes.Equal(content, newContent) { + // No changes needed + return nil + } + + err = os.WriteFile(datadogConfigPath, newContent, 0644) + if err != nil { + return err + } + + err = restartTraceAgent() + return +} + +func (a *apmInjectorInstaller) setAgentConfigContent(content []byte) []byte { + runPath := path.Join(a.installPath, "inject", "run") + apmSocketPath := path.Join(runPath, "apm.socket") + dsdSocketPath := path.Join(runPath, "dsd.socket") + + if !bytes.Contains(content, injectorConfigPrefix) { + content = append(content, []byte("\n")...) + content = append(content, injectorConfigPrefix...) + content = append(content, []byte( + fmt.Sprintf(injectorConfigTemplate, apmSocketPath, dsdSocketPath), + )...) + content = append(content, injectorConfigSuffix...) + content = append(content, []byte("\n")...) + } + return content +} + +// deleteAgentConfig removes the agent configuration for the APM injector +func (a *apmInjectorInstaller) deleteAgentConfig() (err error) { + err = backupAgentConfig() + if err != nil { + return err + } + defer func() { + if err != nil { + restoreErr := restoreAgentConfig() + if restoreErr != nil { + log.Warnf("Failed to restore agent config: %v", restoreErr) + } + } + }() + + content, err := os.ReadFile(datadogConfigPath) + if err != nil { + return err + } + + newContent := a.deleteAgentConfigContent(content) + if bytes.Equal(content, newContent) { + // No changes needed + return nil + } + + err = os.WriteFile(datadogConfigPath, content, 0644) + if err != nil { + return err + } + + return restartTraceAgent() +} + +// deleteAgentConfigContent deletes the agent configuration for the APM injector +func (a *apmInjectorInstaller) deleteAgentConfigContent(content []byte) []byte { + start := bytes.Index(content, injectorConfigPrefix) + end := bytes.Index(content, injectorConfigSuffix) + len(injectorConfigSuffix) + if start == -1 || end == -1 || start >= end { + // Config not found + return content + } + + return append(content[:start], content[end:]...) 
+} + +// backupAgentConfig backs up the agent configuration +func backupAgentConfig() error { + return executeCommandStruct(privilegeCommand{ + Command: string(backupCommand), + Path: datadogConfigPath, + }) +} + +// restoreAgentConfig restores the agent configuration & restarts the agent +func restoreAgentConfig() error { + err := executeCommandStruct(privilegeCommand{ + Command: string(restoreCommand), + Path: datadogConfigPath, + }) + if err != nil { + return err + } + return restartTraceAgent() +} + +// restartTraceAgent restarts the trace agent, both stable and experimental +func restartTraceAgent() error { + if err := restartUnit("datadog-agent-trace.service"); err != nil { + return err + } + if err := restartUnit("datadog-agent-trace-exp.service"); err != nil { + return err + } + return nil +} diff --git a/pkg/updater/service/apm_inject_test.go b/pkg/updater/service/apm_inject_test.go new file mode 100644 index 0000000000000..813f800a0ee74 --- /dev/null +++ b/pkg/updater/service/apm_inject_test.go @@ -0,0 +1,155 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +//go:build !windows + +// Package service provides a way to interact with os services +package service + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSetLDPreloadConfig(t *testing.T) { + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + + for input, expected := range map[string]string{ + // File doesn't exist + "": "/tmp/stable/inject/launcher.preload.so\n", + // File contains unrelated entries + "/abc/def/preload.so\n": "/abc/def/preload.so\n/tmp/stable/inject/launcher.preload.so\n", + // File contains unrelated entries with no newline + "/abc/def/preload.so": "/abc/def/preload.so\n/tmp/stable/inject/launcher.preload.so\n", + } { + output, err := a.setLDPreloadConfigContent([]byte(input)) + assert.Nil(t, err) + assert.Equal(t, expected, string(output)) + } +} + +func TestRemoveLDPreloadConfig(t *testing.T) { + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + + for input, expected := range map[string]string{ + // File doesn't exist + "": "", + // File only contains the entry to remove + "/tmp/stable/inject/launcher.preload.so\n": "", + // File only contains the entry to remove without newline + "/tmp/stable/inject/launcher.preload.so": "", + // File contains unrelated entries + "/abc/def/preload.so\n/tmp/stable/inject/launcher.preload.so\n": "/abc/def/preload.so\n", + // File contains unrelated entries at the end + "/tmp/stable/inject/launcher.preload.so\n/def/abc/preload.so": "/def/abc/preload.so", + // File contains multiple unrelated entries + "/abc/def/preload.so\n/tmp/stable/inject/launcher.preload.so\n/def/abc/preload.so": "/abc/def/preload.so\n/def/abc/preload.so", + // File contains unrelated entries with no newline (reformatted by customer?) + "/abc/def/preload.so /tmp/stable/inject/launcher.preload.so": "/abc/def/preload.so", + // File contains unrelated entries with no newline (reformatted by customer?) + "/abc/def/preload.so /tmp/stable/inject/launcher.preload.so /def/abc/preload.so": "/abc/def/preload.so /def/abc/preload.so", + // File contains unrelated entries with no newline (reformatted by customer?) 
+ "/tmp/stable/inject/launcher.preload.so /def/abc/preload.so": "/def/abc/preload.so", + // File doesn't contain the entry to remove (removed by customer?) + "/abc/def/preload.so /def/abc/preload.so": "/abc/def/preload.so /def/abc/preload.so", + } { + output, err := a.deleteLDPreloadConfigContent([]byte(input)) + assert.Nil(t, err) + assert.Equal(t, expected, string(output)) + } + + // File is badly formatted (non-breaking space instead of space) + input := "/tmp/stable/inject/launcher.preload.so\u00a0/def/abc/preload.so" + output, err := a.deleteLDPreloadConfigContent([]byte(input)) + assert.NotNil(t, err) + assert.Equal(t, "", string(output)) +} + +func TestSetAgentConfig(t *testing.T) { + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + + for input, expected := range map[string]string{ + // File doesn't exist + "": ` +# BEGIN LD PRELOAD CONFIG +apm_config: + receiver_socket: /tmp/stable/inject/run/apm.socket +use_dogstatsd: true +dogstatsd_socket: /tmp/stable/inject/run/dsd.socket +# END LD PRELOAD CONFIG +`, + // File contains unrelated entries + `api_key: 000000000 +site: datad0g.com`: `api_key: 000000000 +site: datad0g.com +# BEGIN LD PRELOAD CONFIG +apm_config: + receiver_socket: /tmp/stable/inject/run/apm.socket +use_dogstatsd: true +dogstatsd_socket: /tmp/stable/inject/run/dsd.socket +# END LD PRELOAD CONFIG +`, + // File already contains the agent config + `# BEGIN LD PRELOAD CONFIG +apm_config: + receiver_socket: /tmp/stable/inject/run/apm.socket +use_dogstatsd: true +dogstatsd_socket: /tmp/stable/inject/run/dsd.socket +# END LD PRELOAD CONFIG`: `# BEGIN LD PRELOAD CONFIG +apm_config: + receiver_socket: /tmp/stable/inject/run/apm.socket +use_dogstatsd: true +dogstatsd_socket: /tmp/stable/inject/run/dsd.socket +# END LD PRELOAD CONFIG`, + } { + output := a.setAgentConfigContent([]byte(input)) + assert.Equal(t, expected, string(output)) + } +} + +func TestRemoveAgentConfig(t *testing.T) { + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + + for input, expected := range map[string]string{ + // File doesn't exist + "": "", + // File only contains the agent config + `# BEGIN LD PRELOAD CONFIG + apm_config: + receiver_socket: /tmp/stable/inject/run/apm.socket + use_dogstatsd: true + dogstatsd_socket: /tmp/stable/inject/run/dsd.socket + # END LD PRELOAD CONFIG`: "", + // File contains unrelated entries + `api_key: 000000000 +site: datad0g.com +# BEGIN LD PRELOAD CONFIG +apm_config: + receiver_socket: /tmp/stable/inject/run/apm.socket +use_dogstatsd: true +dogstatsd_socket: /tmp/stable/inject/run/dsd.socket +# END LD PRELOAD CONFIG +`: `api_key: 000000000 +site: datad0g.com + +`, + // File **only** contains unrelated entries somehow + `api_key: 000000000 +site: datad0g.com`: `api_key: 000000000 +site: datad0g.com`, + } { + output := a.deleteAgentConfigContent([]byte(input)) + assert.Equal(t, expected, string(output)) + } +} diff --git a/pkg/updater/service/apm_inject_windows.go b/pkg/updater/service/apm_inject_windows.go new file mode 100644 index 0000000000000..8bbb49c5c7095 --- /dev/null +++ b/pkg/updater/service/apm_inject_windows.go @@ -0,0 +1,19 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +//go:build windows + +// Package service provides a way to interact with os services +package service + +// SetupAPMInjector noop +func SetupAPMInjector() error { + return nil +} + +// RemoveAPMInjector noop +func RemoveAPMInjector() error { + return nil +} diff --git a/pkg/updater/service/datadog_agent.go b/pkg/updater/service/datadog_agent.go index e183f9f5f5229..8767e7a20d864 100644 --- a/pkg/updater/service/datadog_agent.go +++ b/pkg/updater/service/datadog_agent.go @@ -9,6 +9,9 @@ package service import ( + "os/exec" + "strings" + "github.com/DataDog/datadog-agent/pkg/util/installinfo" "github.com/DataDog/datadog-agent/pkg/util/log" ) @@ -52,6 +55,10 @@ func SetupAgentUnits() (err error) { } }() + if err = setInstallerAgentGroup(); err != nil { + return + } + for _, unit := range stableUnits { if err = loadUnit(unit); err != nil { return @@ -132,3 +139,16 @@ func StartAgentExperiment() error { func StopAgentExperiment() error { return startUnit(agentUnit) } + +// setInstallerAgentGroup adds the dd-installer to the dd-agent group if it's not already in it +func setInstallerAgentGroup() error { + // Get groups of dd-installer + out, err := exec.Command("id", "-Gn", "dd-installer").Output() + if err != nil { + return err + } + if strings.Contains(string(out), "dd-agent") { + return nil + } + return executeCommand(string(addInstallerToAgentGroup)) +} diff --git a/pkg/updater/service/docker.go b/pkg/updater/service/docker.go new file mode 100644 index 0000000000000..c4cdb3fc0de20 --- /dev/null +++ b/pkg/updater/service/docker.go @@ -0,0 +1,196 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
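+//
+// The functions in this file rewrite /etc/docker/daemon.json so that
+// "default-runtime" points at the injector's "dd-shim" runtime
+// (auto_inject_runc under installPath). Any pre-existing default runtime is
+// kept under "default-runtime-backup" so deleteDockerConfig can restore it
+// when the injector is removed.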
+ +//go:build !windows + +// Package service provides a way to interact with os services +package service + +import ( + "bytes" + "encoding/json" + "os" + "os/exec" + "path" + + "github.com/DataDog/datadog-agent/pkg/util/log" +) + +type dockerDaemonConfig map[string]interface{} + +const ( + tmpDockerDaemonPath = "/tmp/daemon.json.tmp" + dockerDaemonPath = "/etc/docker/daemon.json" +) + +// setDockerConfig sets up the docker daemon to use the APM injector +// even if docker isn't installed, to prepare for if it is installed +// later +func (a *apmInjectorInstaller) setDockerConfig() error { + // Create docker dir if it doesn't exist + err := executeCommand(createDockerDirCommand) + if err != nil { + return err + } + + var file []byte + stat, err := os.Stat(dockerDaemonPath) + if err == nil { + // Read the existing configuration + file, err = os.ReadFile(dockerDaemonPath) + if err != nil { + return err + } + } else if !os.IsNotExist(err) { + return err + } + + dockerConfigJSON, err := a.setDockerConfigContent(file) + if err != nil { + return err + } + + // Write the new configuration to a temporary file + perms := os.FileMode(0644) + if stat != nil { + perms = stat.Mode() + } + err = os.WriteFile(tmpDockerDaemonPath, dockerConfigJSON, perms) + if err != nil { + return err + } + + // Move the temporary file to the final location + err = executeCommand(string(replaceDockerCommand)) + if err != nil { + return err + } + + return restartDocker() +} + +// setDockerConfigContent sets the content of the docker daemon configuration +func (a *apmInjectorInstaller) setDockerConfigContent(previousContent []byte) ([]byte, error) { + dockerConfig := dockerDaemonConfig{} + + if len(previousContent) > 0 { + err := json.Unmarshal(previousContent, &dockerConfig) + if err != nil { + return nil, err + } + } + + if _, ok := dockerConfig["default-runtime"]; ok { + dockerConfig["default-runtime-backup"] = dockerConfig["default-runtime"] + } + dockerConfig["default-runtime"] = "dd-shim" + runtimes, ok := dockerConfig["runtimes"].(map[string]interface{}) + if !ok { + runtimes = map[string]interface{}{} + } + runtimes["dd-shim"] = map[string]interface{}{ + "path": path.Join(a.installPath, "inject", "auto_inject_runc"), + } + dockerConfig["runtimes"] = runtimes + + dockerConfigJSON, err := json.MarshalIndent(dockerConfig, "", " ") + if err != nil { + return nil, err + } + + return dockerConfigJSON, nil +} + +// deleteDockerConfig restores the docker daemon configuration +func (a *apmInjectorInstaller) deleteDockerConfig() error { + var file []byte + stat, err := os.Stat(dockerDaemonPath) + if err == nil { + // Read the existing configuration + file, err = os.ReadFile(dockerDaemonPath) + if err != nil { + return err + } + } else if os.IsNotExist(err) { + // If the file doesn't exist, there's nothing to do + return nil + } + + dockerConfigJSON, err := a.deleteDockerConfigContent(file) + if err != nil { + return err + } + + // Write the new configuration to a temporary file + perms := os.FileMode(0644) + if stat != nil { + perms = stat.Mode() + } + err = os.WriteFile(tmpDockerDaemonPath, dockerConfigJSON, perms) + if err != nil { + return err + } + + // Move the temporary file to the final location + err = executeCommand(string(replaceDockerCommand)) + if err != nil { + return err + } + return restartDocker() +} + +// deleteDockerConfigContent restores the content of the docker daemon configuration +func (a *apmInjectorInstaller) deleteDockerConfigContent(previousContent []byte) ([]byte, error) { + dockerConfig := 
dockerDaemonConfig{} + + if len(previousContent) > 0 { + err := json.Unmarshal(previousContent, &dockerConfig) + if err != nil { + return nil, err + } + } + + if _, ok := dockerConfig["default-runtime-backup"]; ok { + dockerConfig["default-runtime"] = dockerConfig["default-runtime-backup"] + delete(dockerConfig, "default-runtime-backup") + } else { + dockerConfig["default-runtime"] = "runc" + } + runtimes, ok := dockerConfig["runtimes"].(map[string]interface{}) + if !ok { + runtimes = map[string]interface{}{} + } + delete(runtimes, "dd-shim") + dockerConfig["runtimes"] = runtimes + + dockerConfigJSON, err := json.MarshalIndent(dockerConfig, "", " ") + if err != nil { + return nil, err + } + + return dockerConfigJSON, nil +} + +// restartDocker reloads the docker daemon if it exists +func restartDocker() error { + if !isDockerInstalled() { + log.Info("updater: docker is not installed, skipping reload") + return nil + } + return executeCommand(restartDockerCommand) +} + +// isDockerInstalled checks if docker is installed on the system +func isDockerInstalled() bool { + cmd := exec.Command("which", "docker") + var outb bytes.Buffer + cmd.Stdout = &outb + err := cmd.Run() + if err != nil { + log.Warn("updater: failed to check if docker is installed, assuming it isn't: ", err) + return false + } + return len(outb.String()) != 0 +} diff --git a/pkg/updater/service/docker_test.go b/pkg/updater/service/docker_test.go new file mode 100644 index 0000000000000..912a4d680a606 --- /dev/null +++ b/pkg/updater/service/docker_test.go @@ -0,0 +1,137 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. 
+ +//go:build !windows + +// Package service provides a way to interact with os services +package service + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestSetDockerConfig(t *testing.T) { + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + + for input, expected := range map[string]string{ + // File doesn't exist + "": `{ + "default-runtime": "dd-shim", + "runtimes": { + "dd-shim": { + "path": "/tmp/stable/inject/auto_inject_runc" + } + } +}`, + // File contains unrelated entries + `{ + "cgroup-parent": "abc", + "raw-logs": false +}`: `{ + "cgroup-parent": "abc", + "default-runtime": "dd-shim", + "raw-logs": false, + "runtimes": { + "dd-shim": { + "path": "/tmp/stable/inject/auto_inject_runc" + } + } +}`, + // File has already overridden the default runtime + `{ + "default-runtime": "containerd", + "runtimes": { + "containerd": { + "path": "/usr/bin/containerd" + } + } +}`: `{ + "default-runtime": "dd-shim", + "default-runtime-backup": "containerd", + "runtimes": { + "containerd": { + "path": "/usr/bin/containerd" + }, + "dd-shim": { + "path": "/tmp/stable/inject/auto_inject_runc" + } + } +}`, + } { + output, err := a.setDockerConfigContent([]byte(input)) + assert.Nil(t, err) + assert.Equal(t, expected, string(output)) + } +} + +func TestRemoveDockerConfig(t *testing.T) { + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + + for input, expected := range map[string]string{ + // Empty file, shouldn't happen but still tested + "": `{ + "default-runtime": "runc", + "runtimes": {} +}`, + // File only contains the injected content + `{ + "default-runtime": "dd-shim", + "runtimes": { + "dd-shim": { + "path": "/tmp/stable/inject/auto_inject_runc" + } + } + }`: `{ + "default-runtime": "runc", + "runtimes": {} +}`, + // File contained unrelated entries + `{ + "cgroup-parent": "abc", + "default-runtime": "dd-shim", + "raw-logs": false, + "runtimes": { + "dd-shim": { + "path": "/tmp/stable/inject/auto_inject_runc" + } + } +}`: `{ + "cgroup-parent": "abc", + "default-runtime": "runc", + "raw-logs": false, + "runtimes": {} +}`, + // File had already overridden the default runtime + `{ + "default-runtime": "dd-shim", + "default-runtime-backup": "containerd", + "runtimes": { + "containerd": { + "path": "/usr/bin/containerd" + }, + "dd-shim": { + "path": "/tmp/stable/inject/auto_inject_runc" + } + } +}`: `{ + "default-runtime": "containerd", + "runtimes": { + "containerd": { + "path": "/usr/bin/containerd" + } + } +}`, + } { + output, err := a.deleteDockerConfigContent([]byte(input)) + assert.Nil(t, err) + assert.Equal(t, expected, string(output)) + } +} diff --git a/pkg/updater/service/helper/main.go b/pkg/updater/service/helper/main.go index 37f9ac13a06c6..a20a1de7a99c1 100644 --- a/pkg/updater/service/helper/main.go +++ b/pkg/updater/service/helper/main.go @@ -8,6 +8,7 @@ package main import ( + "bytes" "encoding/json" "fmt" "log" @@ -25,6 +26,8 @@ var ( installPath string systemdPath = "/lib/systemd/system" // todo load it at build time from omnibus pkgDir = "/opt/datadog-packages" + agentDir = "/etc/datadog-agent" + dockerDir = "/etc/docker" testSkipUID = "" ) @@ -36,6 +39,7 @@ type privilegeCommand struct { Command string `json:"command,omitempty"` Unit string `json:"unit,omitempty"` Path string `json:"path,omitempty"` + Content string `json:"content,omitempty"` } func isValidUnitChar(c rune) bool { @@ -66,6 +70,16 @@ func buildCommand(inputCommand privilegeCommand) (*exec.Cmd, error) { return exec.Command("ln", "-sf", 
"/opt/datadog-packages/datadog-agent/stable/bin/agent/agent", "/usr/bin/datadog-agent"), nil case "rm-agent-symlink": return exec.Command("rm", "-f", "/usr/bin/datadog-agent"), nil + case "create-docker-dir": + return exec.Command("mkdir", "-p", "/etc/docker"), nil + case "replace-docker": + return exec.Command("mv", "/tmp/daemon.json.tmp", "/etc/docker/daemon.json"), nil + case "restart-docker": + return exec.Command("systemctl", "restart", "docker"), nil + case "replace-ld-preload": + return exec.Command("mv", "/tmp/ld.so.preload.tmp", "/etc/ld.so.preload"), nil + case "add-installer-to-agent-group": + return exec.Command("usermod", "-aG", "dd-agent", "dd-installer"), nil default: return nil, fmt.Errorf("invalid command") } @@ -99,7 +113,7 @@ func buildPathCommand(inputCommand privilegeCommand) (*exec.Cmd, error) { if absPath != path || err != nil { return nil, fmt.Errorf("invalid path") } - if !strings.HasPrefix(path, pkgDir) { + if !strings.HasPrefix(path, pkgDir) && !strings.HasPrefix(path, agentDir) { return nil, fmt.Errorf("invalid path") } switch inputCommand.Command { @@ -107,6 +121,10 @@ func buildPathCommand(inputCommand privilegeCommand) (*exec.Cmd, error) { return exec.Command("chown", "-R", "dd-agent:dd-agent", path), nil case "rm": return exec.Command("rm", "-rf", path), nil + case "backup-file": + return exec.Command("cp", "-f", path, path+".bak"), nil + case "restore-file": + return exec.Command("mv", path+".bak", path), nil default: return nil, fmt.Errorf("invalid command") } @@ -121,7 +139,7 @@ func executeCommand() error { var pc privilegeCommand err := json.Unmarshal([]byte(inputCommand), &pc) if err != nil { - return fmt.Errorf("decoding command") + return fmt.Errorf("decoding command %s", inputCommand) } currentUser := syscall.Getuid() @@ -150,8 +168,14 @@ func executeCommand() error { }() } + commandErr := new(bytes.Buffer) + command.Stderr = commandErr log.Printf("Running command: %s", command.String()) - return command.Run() + err = command.Run() + if err != nil { + return fmt.Errorf("running command (%s): %s", err.Error(), commandErr.String()) + } + return nil } func main() { diff --git a/pkg/updater/service/systemd.go b/pkg/updater/service/systemd.go index 21f70d94b0fef..2f384b010ad2e 100644 --- a/pkg/updater/service/systemd.go +++ b/pkg/updater/service/systemd.go @@ -10,25 +10,58 @@ package service import ( "encoding/json" + "os" + "path" + + "github.com/DataDog/datadog-agent/pkg/util/log" ) type unitCommand string +var ( + systemdPath = "/lib/systemd/system" // todo load it at build time from omnibus +) + const ( - startCommand unitCommand = "start" - stopCommand unitCommand = "stop" - enableCommand unitCommand = "enable" - disableCommand unitCommand = "disable" - loadCommand unitCommand = "load-unit" - removeCommand unitCommand = "remove-unit" - systemdReloadCommand = `{"command":"systemd-reload"}` - adminExecutor = "datadog-updater-admin.service" + startCommand unitCommand = "start" + stopCommand unitCommand = "stop" + enableCommand unitCommand = "enable" + disableCommand unitCommand = "disable" + loadCommand unitCommand = "load-unit" + removeCommand unitCommand = "remove-unit" + addInstallerToAgentGroup unitCommand = "add-installer-to-agent-group" + backupCommand unitCommand = `backup-file` + restoreCommand unitCommand = `restore-file` + replaceDockerCommand = `{"command":"replace-docker"}` + restartDockerCommand = `{"command":"restart-docker"}` + createDockerDirCommand = `{"command":"create-docker-dir"}` + replaceLDPreloadCommand = 
`{"command":"replace-ld-preload"}` + systemdReloadCommand = `{"command":"systemd-reload"}` + adminExecutor = "datadog-updater-admin.service" ) type privilegeCommand struct { Command string `json:"command,omitempty"` Unit string `json:"unit,omitempty"` Path string `json:"path,omitempty"` + Content string `json:"content,omitempty"` +} + +// restartUnit restarts a systemd unit +func restartUnit(unit string) error { + // check that the unit exists first + if _, err := os.Stat(path.Join(systemdPath, unit)); os.IsNotExist(err) { + log.Infof("Unit %s does not exist, skipping restart", unit) + return nil + } + + if err := stopUnit(unit); err != nil { + return err + } + if err := startUnit(unit); err != nil { + return err + } + return nil } func stopUnit(unit string) error { @@ -68,3 +101,12 @@ func wrapUnitCommand(command unitCommand, unit string) string { } return string(rawJSON) } + +func executeCommandStruct(command privilegeCommand) error { + rawJSON, err := json.Marshal(command) + if err != nil { + return err + } + privilegeCommandJSON := string(rawJSON) + return executeCommand(privilegeCommandJSON) +} diff --git a/pkg/updater/service/systemd_test.go b/pkg/updater/service/systemd_test.go index 85f48151561e4..51212f6caa015 100644 --- a/pkg/updater/service/systemd_test.go +++ b/pkg/updater/service/systemd_test.go @@ -26,8 +26,8 @@ func TestInvalidCommands(t *testing.T) { // assert wrong commands for input, expected := range map[string]string{ // fail assert_command characters assertion - ";": "error: decoding command\n", - "&": "error: decoding command\n", + ";": "error: decoding command ;\n", + "&": "error: decoding command &\n", `{"command":"start", "unit":"does-not-exist"}`: "error: invalid unit\n", `{"command":"start", "unit":"datadog-//"}`: "error: invalid unit\n", `{"command":"does-not-exist", "unit":"datadog-"}`: "error: invalid command\n", @@ -55,4 +55,13 @@ func TestAssertWorkingCommands(t *testing.T) { assert.Equal(t, successErr, removeUnit("datadog-agent").Error()) assert.Equal(t, successErr, createAgentSymlink().Error()) assert.Equal(t, successErr, rmAgentSymlink().Error()) + assert.Equal(t, successErr, backupAgentConfig().Error()) + assert.Equal(t, successErr, restoreAgentConfig().Error()) + + a := &apmInjectorInstaller{ + installPath: "/tmp/stable", + } + assert.Equal(t, successErr, a.setLDPreloadConfig().Error()) + assert.Equal(t, successErr, a.setAgentConfig().Error()) + assert.Equal(t, successErr, a.setDockerConfig().Error()) } diff --git a/pkg/updater/updater.go b/pkg/updater/updater.go index 76ca92b255d31..766952a8ab57b 100644 --- a/pkg/updater/updater.go +++ b/pkg/updater/updater.go @@ -109,6 +109,9 @@ func Purge() { func purge(locksPath, repositoryPath string) { service.RemoveAgentUnits() + if err := service.RemoveAPMInjector(); err != nil { + log.Warnf("updater: could not remove APM injector: %v", err) + } cleanDir(locksPath, os.RemoveAll) cleanDir(repositoryPath, service.RemoveAll) } @@ -220,7 +223,7 @@ func (u *updaterImpl) BootstrapDefault(ctx context.Context, pkg string) (err err stablePackage, ok := u.catalog.getDefaultPackage(u.bootstrapVersions, pkg, runtime.GOARCH, runtime.GOOS) if !ok { - return fmt.Errorf("could not get default package %s for %s, %s", pkg, runtime.GOARCH, runtime.GOOS) + return fmt.Errorf("could not get default package '%s' for arch '%s' and platform '%s'", pkg, runtime.GOARCH, runtime.GOOS) } return u.boostrapPackage(ctx, stablePackage.URL, stablePackage.Name, stablePackage.Version) } @@ -236,7 +239,7 @@ func (u *updaterImpl) 
BootstrapVersion(ctx context.Context, pkg string, version stablePackage, ok := u.catalog.getPackage(pkg, version, runtime.GOARCH, runtime.GOOS) if !ok { - return fmt.Errorf("could not get package %s version %s for %s, %s", pkg, version, runtime.GOARCH, runtime.GOOS) + return fmt.Errorf("could not get package '%s' version '%s' for arch '%s' and platform '%s'", pkg, version, runtime.GOARCH, runtime.GOOS) } return u.boostrapPackage(ctx, stablePackage.URL, stablePackage.Name, stablePackage.Version) } diff --git a/test/new-e2e/tests/updater/docker.go b/test/new-e2e/tests/updater/docker.go new file mode 100644 index 0000000000000..3762f53f589d4 --- /dev/null +++ b/test/new-e2e/tests/updater/docker.go @@ -0,0 +1,91 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2016-present Datadog, Inc. + +// Package updater contains tests for the updater package +package updater + +import ( + "testing" + "time" + + "github.com/DataDog/datadog-agent/test/new-e2e/pkg/components" + "github.com/DataDog/test-infra-definitions/components/os" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// installDocker installs docker on the host +func installDocker(distro os.Descriptor, t *testing.T, host *components.RemoteHost) { + switch distro { + case os.UbuntuDefault: + _, err := host.WriteFile("/tmp/install-docker.sh", []byte(` +sudo apt-get update +sudo apt-get install ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + `)) + require.Nil(t, err) + host.MustExecute(`sudo chmod +x /tmp/install-docker.sh`) + host.MustExecute(`sudo /tmp/install-docker.sh`) + err = host.Remove("/tmp/install-docker.sh") + require.Nil(t, err) + case os.DebianDefault: + _, err := host.WriteFile("/tmp/install-docker.sh", []byte(` +sudo apt-get update +sudo apt-get install ca-certificates curl +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc + +# Add the repository to Apt sources: +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \ + $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + `)) + require.Nil(t, err) + host.MustExecute(`sudo chmod +x /tmp/install-docker.sh`) + host.MustExecute(`sudo /tmp/install-docker.sh`) + err = host.Remove("/tmp/install-docker.sh") + require.Nil(t, err) + case os.CentOSDefault, os.RedHatDefault: + _, err := host.WriteFile("/tmp/install-docker.sh", []byte(` +sudo yum install -y yum-utils +sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo +sudo yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +sudo systemctl start docker + `)) + require.Nil(t, err) + host.MustExecute(`sudo chmod +x /tmp/install-docker.sh`) + host.MustExecute(`sudo /tmp/install-docker.sh`) + err = host.Remove("/tmp/install-docker.sh") + require.Nil(t, err) + default: + t.Fatalf("unsupported distro: %s", distro.String()) + } +} + +// launchJavaDockerContainer launches a small Java HTTP server in a docker container +// and make a call to it +func launchJavaDockerContainer(t *testing.T, host *components.RemoteHost) { + host.MustExecute(`sudo docker run -d -p8887:8888 baptistefoy702/message-server:latest`) + // for i := 0; i < 10; i++ { + assert.Eventually(t, + func() bool { + _, err := host.Execute(`curl -m 1 localhost:8887/messages`) + return err == nil + }, 10*time.Second, 100*time.Millisecond, + ) + // } +} diff --git a/test/new-e2e/tests/updater/linux_test.go b/test/new-e2e/tests/updater/linux_test.go index 4d49c168ef955..59280eaf1f0f7 100644 --- a/test/new-e2e/tests/updater/linux_test.go +++ b/test/new-e2e/tests/updater/linux_test.go @@ -12,6 +12,7 @@ import ( "regexp" "strings" "testing" + "time" "github.com/DataDog/test-infra-definitions/components/os" "github.com/DataDog/test-infra-definitions/scenarios/aws/ec2" @@ -37,12 +38,14 @@ const ( type vmUpdaterSuite struct { e2e.BaseSuite[environments.Host] packageManager string + distro os.Descriptor + arch os.Architecture } func runTest(t *testing.T, pkgManager string, arch os.Architecture, distro os.Descriptor) { reg := regexp.MustCompile(`[^a-zA-Z0-9_\-.]`) testName := reg.ReplaceAllString(distro.String()+"-"+string(arch), "_") - e2e.Run(t, &vmUpdaterSuite{packageManager: pkgManager}, e2e.WithProvisioner(awshost.ProvisionerNoFakeIntake( + e2e.Run(t, &vmUpdaterSuite{packageManager: pkgManager, distro: distro, arch: arch}, e2e.WithProvisioner(awshost.ProvisionerNoFakeIntake( awshost.WithUpdater(), awshost.WithEC2InstanceOptions(ec2.WithOSArch(distro, arch)), )), @@ -202,6 +205,142 @@ func (v *vmUpdaterSuite) TestPurgeAndInstallAgent() { } } +func (v *vmUpdaterSuite) TestPurgeAndInstallAPMInjector() { + // Temporarily disable CentOS & Redhat, as there is a bug in the APM injector + if v.distro == os.CentOSDefault || v.distro == os.RedHatDefault { + v.T().Skip("APM injector not available for CentOS or RedHat yet") + } + if v.distro == os.DebianDefault || v.distro == os.UbuntuDefault && v.arch == os.AMD64Arch { + // TODO (baptiste): Fix test + v.T().Skip("Test has been temporarily disabled") + } + + host := v.Env().RemoteHost + + /////////////////// + // Setup machine // + /////////////////// + + host.MustExecute(fmt.Sprintf("sudo %v/bin/installer/installer purge", bootUpdaterDir)) + // Install docker + installDocker(v.distro, v.T(), host) + defer func() { + // Best effort to stop any running container 
at the end of the test + host.Execute(`sudo docker ps -aq | xargs sudo docker stop | xargs sudo docker rm`) + }() + + ///////////////////////// + // Check initial state // + ///////////////////////// + + // packages dir exists; but there are no packages installed + host.MustExecute(`test -d /opt/datadog-packages`) + _, err := host.Execute(`test -d /opt/datadog-packages/datadog-apm-inject`) + require.NotNil(v.T(), err) + _, err = host.Execute(`test -d /opt/datadog-packages/datadog-agent`) + require.NotNil(v.T(), err) + _, err = host.Execute(`test -d /opt/datadog-packages/datadog-apm-library-java`) + require.NotNil(v.T(), err) + + // /etc/ld.so.preload does not contain the injector + _, err = host.Execute(`grep "/opt/datadog-packages/datadog-apm-inject" /etc/ld.so.preload`) + require.NotNil(v.T(), err) + + // docker daemon does not contain the injector + _, err = host.Execute(`grep "/opt/datadog-packages/datadog-apm-inject" /etc/docker/daemon.json`) + require.NotNil(v.T(), err) + + //////////////////////// + // Bootstrap packages // + //////////////////////// + + host.MustExecute(fmt.Sprintf(`sudo %v/bin/installer/installer bootstrap --url "oci://docker.io/datadog/agent-package-dev:7.54.0-devel.git.247.f92fbc1.pipeline.31778392-1"`, bootUpdaterDir)) + host.MustExecute(fmt.Sprintf(`sudo %v/bin/installer/installer bootstrap --url "oci://docker.io/datadog/apm-library-java-package-dev:1.32.0-SNAPSHOT-8708864e8e-pipeline.30373268.beta.8708864e-1"`, bootUpdaterDir)) + host.MustExecute(fmt.Sprintf(`sudo %v/bin/installer/installer bootstrap --url "oci://docker.io/datadog/apm-inject-package-dev:0.12.3-dev.bddec85.glci481808135.g8acdc698-1"`, bootUpdaterDir)) + + //////////////////////////////// + // Check post-bootstrap state // + //////////////////////////////// + + // assert packages dir exist + host.MustExecute(`test -L /opt/datadog-packages/datadog-agent/stable`) + host.MustExecute(`test -L /opt/datadog-packages/datadog-apm-library-java/stable`) + host.MustExecute(`test -L /opt/datadog-packages/datadog-apm-inject/stable`) + + // assert /etc/ld.so.preload contains the injector + res, err := host.Execute(`grep "/opt/datadog-packages/datadog-apm-inject" /etc/ld.so.preload`) + require.Nil(v.T(), err) + require.Equal(v.T(), "/opt/datadog-packages/datadog-apm-inject/stable/inject/launcher.preload.so\n", res) + + // assert docker daemon contains the injector (removing blank spaces for easier comparison) + res, err = host.Execute(`grep "/opt/datadog-packages/datadog-apm-inject" /etc/docker/daemon.json | sed -re 's/^[[:blank:]]+|[[:blank:]]+$//g' -e 's/[[:blank:]]+/ /g'`) + require.Nil(v.T(), err) + require.Equal(v.T(), "\"path\": \"/opt/datadog-packages/datadog-apm-inject/stable/inject/auto_inject_runc\"\n", res) + + // assert agent config has been changed + raw, err := host.ReadFile("/etc/datadog-agent/datadog.yaml") + require.Nil(v.T(), err) + require.True(v.T(), strings.Contains(string(raw), "# BEGIN LD PRELOAD CONFIG"), "missing LD_PRELOAD config, config:\n%s", string(raw)) + + // assert agent is running + host.MustExecute("sudo systemctl status datadog-agent.service") + + _, err = host.Execute("sudo systemctl status datadog-agent-trace.service") + require.Nil(v.T(), err) + + // assert required files exist + requiredFiles := []string{ + "auto_inject_runc", + "launcher.preload.so", + "ld.so.preload", + "musl-launcher.preload.so", + "process", + } + for _, file := range requiredFiles { + host.MustExecute(fmt.Sprintf("test -f /opt/datadog-packages/datadog-apm-inject/stable/inject/%s", file)) + } + + // 
assert file ownerships + injectorDir := "/opt/datadog-packages/datadog-apm-inject" + require.Equal(v.T(), "dd-installer\n", host.MustExecute(`stat -c "%U" `+injectorDir)) + require.Equal(v.T(), "dd-installer\n", host.MustExecute(`stat -c "%G" `+injectorDir)) + require.Equal(v.T(), "drwxr-xr-x\n", host.MustExecute(`stat -c "%A" `+injectorDir)) + require.Equal(v.T(), "1\n", host.MustExecute(`sudo ls -l /opt/datadog-packages/datadog-apm-inject | awk '$9 != "stable" && $3 == "dd-installer" && $4 == "dd-installer"' | wc -l`)) + + ///////////////////////////////////// + // Check injection with a real app // + ///////////////////////////////////// + + launchJavaDockerContainer(v.T(), host) + + // check "Dropping Payload due to non-retryable error" in trace agent logs + // as we don't have an API key the payloads can't be flushed successfully, + // but this log indicates that the trace agent managed to receive the payload + require.Eventually(v.T(), func() bool { + _, err := host.Execute(`cat /var/log/datadog/trace-agent.log | grep "Dropping Payload due to non-retryable error"`) + return err == nil + }, 30*time.Second, 100*time.Millisecond) + + /////////////////////// + // Check purge state // + /////////////////////// + + host.MustExecute(fmt.Sprintf("sudo %v/bin/installer/installer purge", bootUpdaterDir)) + + _, err = host.Execute(`test -d /opt/datadog-packages/datadog-apm-inject`) + require.NotNil(v.T(), err) + _, err = host.Execute(`test -d /opt/datadog-packages/datadog-agent`) + require.NotNil(v.T(), err) + _, err = host.Execute(`test -d /opt/datadog-packages/datadog-apm-library-java`) + require.NotNil(v.T(), err) + _, err = host.Execute(`grep "/opt/datadog-packages/datadog-apm-inject" /etc/ld.so.preload`) + require.NotNil(v.T(), err) + _, err = host.Execute(`grep "/opt/datadog-packages/datadog-apm-inject" /etc/docker/daemon.json`) + require.NotNil(v.T(), err) + _, err = host.Execute(`test -f /etc/docker/daemon.json.bak`) + require.NotNil(v.T(), err) +} + func assertInstallMethod(v *vmUpdaterSuite, t *testing.T, host *components.RemoteHost) { rawYaml, err := host.ReadFile(filepath.Join(confDir, "install_info")) assert.Nil(t, err) From 1f33b617699c27397ea2b6f668c686f85d83f9c6 Mon Sep 17 00:00:00 2001 From: Dylan Yang Date: Fri, 12 Apr 2024 16:03:46 -0400 Subject: [PATCH 9/9] [SVLS-4142] Create a Lambda span on timeouts (#21481) * create a Lambda span on timeouts * don't create a cold start span when the runtime restarts during timeouts * fix linting * fix test * lint: rename name variables * lint again * small fixes * refactor timeout span logic * add mutexes * fix span completed check * revert refactor * remove cold start span changes * use mutex over rwmutex * test routes * add comment + update tests * test endExecutionSpan * add serverless.go test * add test /hello for route * only set span incomplete when /startInvocation has been hit * time out -> timeout Co-authored-by: Duncan Harvey <35278470+duncanpharvey@users.noreply.github.com> --------- Co-authored-by: Duncan Harvey <35278470+duncanpharvey@users.noreply.github.com> --- cmd/serverless/main.go | 2 +- pkg/serverless/daemon/daemon.go | 30 +++++ pkg/serverless/daemon/routes.go | 4 + pkg/serverless/daemon/routes_test.go | 54 ++++++++ .../invocationlifecycle/invocation_details.go | 1 + .../invocationlifecycle/lifecycle.go | 53 ++++---- .../invocationlifecycle/lifecycle_test.go | 117 ++++++++++++++++++ pkg/serverless/invocationlifecycle/trace.go | 43 +++++-- .../invocationlifecycle/trace_test.go | 48 +++++++ pkg/serverless/serverless.go | 
22 ++++ pkg/serverless/serverless_test.go | 40 ++++++ 11 files changed, 379 insertions(+), 35 deletions(-) diff --git a/cmd/serverless/main.go b/cmd/serverless/main.go index b974114acbe6e..dd13654bc35be 100644 --- a/cmd/serverless/main.go +++ b/cmd/serverless/main.go @@ -287,7 +287,7 @@ func runAgent() { ExtraTags: serverlessDaemon.ExtraTags, Demux: serverlessDaemon.MetricAgent.Demux, ProcessTrace: ta.Process, - DetectLambdaLibrary: func() bool { return serverlessDaemon.LambdaLibraryDetected }, + DetectLambdaLibrary: serverlessDaemon.IsLambdaLibraryDetected, InferredSpansEnabled: inferredspan.IsInferredSpansEnabled(), } diff --git a/pkg/serverless/daemon/daemon.go b/pkg/serverless/daemon/daemon.go index 21386b9653449..58bc1ac85190e 100644 --- a/pkg/serverless/daemon/daemon.go +++ b/pkg/serverless/daemon/daemon.go @@ -66,6 +66,15 @@ type Daemon struct { // LambdaLibraryDetected represents whether the Datadog Lambda Library was detected in the environment LambdaLibraryDetected bool + // LambdaLibraryStateLock keeps track of whether the Datadog Lambda Library was detected in the environment + LambdaLibraryStateLock sync.Mutex + + // executionSpanIncomplete indicates whether the Lambda span has been completed by the Extension + executionSpanIncomplete bool + + // ExecutionSpanStateLock keeps track of whether the serverless Invocation routes have been hit to complete the execution span + ExecutionSpanStateLock sync.Mutex + // runtimeStateMutex is used to ensure that modifying the state of the runtime is thread-safe runtimeStateMutex sync.Mutex @@ -435,3 +444,24 @@ func (d *Daemon) setTraceTags(tagMap map[string]string) bool { } return false } + +// IsLambdaLibraryDetected returns if the Lambda Library is in use +func (d *Daemon) IsLambdaLibraryDetected() bool { + d.LambdaLibraryStateLock.Lock() + defer d.LambdaLibraryStateLock.Unlock() + return d.LambdaLibraryDetected +} + +// IsExecutionSpanIncomplete checks if the Lambda execution span was finished +func (d *Daemon) IsExecutionSpanIncomplete() bool { + d.ExecutionSpanStateLock.Lock() + defer d.ExecutionSpanStateLock.Unlock() + return d.executionSpanIncomplete +} + +// SetExecutionSpanIncomplete keeps track of whether the Extension completed the Lambda execution span +func (d *Daemon) SetExecutionSpanIncomplete(spanIncomplete bool) { + d.ExecutionSpanStateLock.Lock() + defer d.ExecutionSpanStateLock.Unlock() + d.executionSpanIncomplete = spanIncomplete +} diff --git a/pkg/serverless/daemon/routes.go b/pkg/serverless/daemon/routes.go index 1b2379d8e1822..93e113782dbb8 100644 --- a/pkg/serverless/daemon/routes.go +++ b/pkg/serverless/daemon/routes.go @@ -26,6 +26,8 @@ type Hello struct { //nolint:revive // TODO(SERV) Fix revive linter func (h *Hello) ServeHTTP(w http.ResponseWriter, r *http.Request) { log.Debug("Hit on the serverless.Hello route.") + h.daemon.LambdaLibraryStateLock.Lock() + defer h.daemon.LambdaLibraryStateLock.Unlock() h.daemon.LambdaLibraryDetected = true } @@ -53,6 +55,7 @@ type StartInvocation struct { func (s *StartInvocation) ServeHTTP(w http.ResponseWriter, r *http.Request) { log.Debug("Hit on the serverless.StartInvocation route.") + s.daemon.SetExecutionSpanIncomplete(true) startTime := time.Now() reqBody, err := io.ReadAll(r.Body) if err != nil { @@ -86,6 +89,7 @@ type EndInvocation struct { func (e *EndInvocation) ServeHTTP(w http.ResponseWriter, r *http.Request) { log.Debug("Hit on the serverless.EndInvocation route.") + e.daemon.SetExecutionSpanIncomplete(false) endTime := time.Now() ecs := 
e.daemon.ExecutionContext.GetCurrentState() coldStartTags := e.daemon.ExecutionContext.GetColdStartTagsForRequestID(ecs.LastRequestID) diff --git a/pkg/serverless/daemon/routes_test.go b/pkg/serverless/daemon/routes_test.go index eab3e09e6be02..0cdae0c594057 100644 --- a/pkg/serverless/daemon/routes_test.go +++ b/pkg/serverless/daemon/routes_test.go @@ -161,6 +161,30 @@ func TestTraceContext(t *testing.T) { } } +func TestHello(t *testing.T) { + assert := assert.New(t) + + port := testutil.FreeTCPPort(t) + d := StartDaemon(fmt.Sprintf("127.0.0.1:%d", port)) + time.Sleep(100 * time.Millisecond) + defer d.Stop() + d.InvocationProcessor = &invocationlifecycle.LifecycleProcessor{ + ExtraTags: d.ExtraTags, + Demux: nil, + ProcessTrace: nil, + DetectLambdaLibrary: d.IsLambdaLibraryDetected, + } + client := &http.Client{} + body := bytes.NewBuffer([]byte(`{}`)) + request, err := http.NewRequest(http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/lambda/hello", port), body) + assert.Nil(err) + assert.False(d.IsLambdaLibraryDetected()) + response, err := client.Do(request) + assert.Nil(err) + response.Body.Close() + assert.True(d.IsLambdaLibraryDetected()) +} + func TestStartEndInvocationSpanParenting(t *testing.T) { port := testutil.FreeTCPPort(t) d := StartDaemon(fmt.Sprintf("127.0.0.1:%d", port)) @@ -332,6 +356,36 @@ func TestStartEndInvocationSpanParenting(t *testing.T) { } } +func TestStartEndInvocationIsExecutionSpanIncomplete(t *testing.T) { + assert := assert.New(t) + port := testutil.FreeTCPPort(t) + d := StartDaemon(fmt.Sprintf("127.0.0.1:%d", port)) + time.Sleep(100 * time.Millisecond) + defer d.Stop() + + m := &mockLifecycleProcessor{} + d.InvocationProcessor = m + + client := &http.Client{} + body := bytes.NewBuffer([]byte(`{"key": "value"}`)) + startReq, err := http.NewRequest(http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/lambda/start-invocation", port), body) + assert.Nil(err) + startResp, err := client.Do(startReq) + assert.Nil(err) + startResp.Body.Close() + assert.True(m.OnInvokeStartCalled) + assert.True(d.IsExecutionSpanIncomplete()) + + body = bytes.NewBuffer([]byte(`{}`)) + endReq, err := http.NewRequest(http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/lambda/end-invocation", port), body) + assert.Nil(err) + endResp, err := client.Do(endReq) + assert.Nil(err) + endResp.Body.Close() + assert.True(m.OnInvokeEndCalled) + assert.False(d.IsExecutionSpanIncomplete()) +} + // Helper function for reading test file func getEventFromFile(filename string) string { event, err := os.ReadFile("../trace/testdata/event_samples/" + filename) diff --git a/pkg/serverless/invocationlifecycle/invocation_details.go b/pkg/serverless/invocationlifecycle/invocation_details.go index bd0e285f8d377..0ad7d0a98b8ea 100644 --- a/pkg/serverless/invocationlifecycle/invocation_details.go +++ b/pkg/serverless/invocationlifecycle/invocation_details.go @@ -27,6 +27,7 @@ type InvocationStartDetails struct { type InvocationEndDetails struct { EndTime time.Time IsError bool + IsTimeout bool RequestID string ResponseRawPayload []byte ColdStart bool diff --git a/pkg/serverless/invocationlifecycle/lifecycle.go b/pkg/serverless/invocationlifecycle/lifecycle.go index d8c470b187db5..90e931767cef1 100644 --- a/pkg/serverless/invocationlifecycle/lifecycle.go +++ b/pkg/serverless/invocationlifecycle/lifecycle.go @@ -281,32 +281,14 @@ func (lp *LifecycleProcessor) OnInvokeEnd(endDetails *InvocationEndDetails) { spans = append(spans, span) if lp.InferredSpansEnabled { - log.Debug("[lifecycle] Attempting to complete the 
inferred span") - log.Debugf("[lifecycle] Inferred span context: %+v", lp.GetInferredSpan().Span) - if lp.GetInferredSpan().Span.Start != 0 { - span0, span1 := lp.requestHandler.inferredSpans[0], lp.requestHandler.inferredSpans[1] - if span1 != nil { - log.Debug("[lifecycle] Completing a secondary inferred span") - lp.setParentIDForMultipleInferredSpans() - span1.AddTagToInferredSpan("http.status_code", statusCode) - span1.AddTagToInferredSpan("peer.service", lp.GetServiceName()) - span := lp.completeInferredSpan(span1, lp.getInferredSpanStart(), endDetails.IsError) - spans = append(spans, span) - log.Debug("[lifecycle] The secondary inferred span attributes are %v", lp.requestHandler.inferredSpans[1]) - } - span0.AddTagToInferredSpan("http.status_code", statusCode) - span0.AddTagToInferredSpan("peer.service", lp.GetServiceName()) - span := lp.completeInferredSpan(span0, endDetails.EndTime, endDetails.IsError) - spans = append(spans, span) - log.Debugf("[lifecycle] The inferred span attributes are: %v", lp.GetInferredSpan()) - } else { - log.Debug("[lifecyle] Failed to complete inferred span due to a missing start time. Please check that the event payload was received with the appropriate data") - } + inferredSpans := lp.endInferredSpan(statusCode, endDetails.EndTime, endDetails.IsError) + spans = append(spans, inferredSpans...) } lp.processTrace(spans) } - if endDetails.IsError { + // We don't submit an error metric on timeouts since it should have already been submitted when the Extension receives a SHUTDOWN event + if endDetails.IsError && !endDetails.IsTimeout { serverlessMetrics.SendErrorsEnhancedMetric( lp.ExtraTags.Tags, endDetails.EndTime, lp.Demux, ) @@ -385,3 +367,30 @@ func (lp *LifecycleProcessor) setParentIDForMultipleInferredSpans() { lp.requestHandler.inferredSpans[1].Span.ParentID = lp.requestHandler.inferredSpans[0].Span.ParentID lp.requestHandler.inferredSpans[0].Span.ParentID = lp.requestHandler.inferredSpans[1].Span.SpanID } + +// endInferredSpan attempts to complete any inferred spans and send them to intake +func (lp *LifecycleProcessor) endInferredSpan(statusCode string, endTime time.Time, isError bool) []*pb.Span { + spans := make([]*pb.Span, 0, 2) + log.Debug("[lifecycle] Attempting to complete the inferred span") + log.Debugf("[lifecycle] Inferred span context: %+v", lp.GetInferredSpan().Span) + if lp.GetInferredSpan().Span.Start != 0 { + span0, span1 := lp.requestHandler.inferredSpans[0], lp.requestHandler.inferredSpans[1] + if span1 != nil { + log.Debug("[lifecycle] Completing a secondary inferred span") + lp.setParentIDForMultipleInferredSpans() + span1.AddTagToInferredSpan("http.status_code", statusCode) + span1.AddTagToInferredSpan("peer.service", lp.GetServiceName()) + span := lp.completeInferredSpan(span1, lp.getInferredSpanStart(), isError) + spans = append(spans, span) + log.Debug("[lifecycle] The secondary inferred span attributes are %v", lp.requestHandler.inferredSpans[1]) + } + span0.AddTagToInferredSpan("http.status_code", statusCode) + span0.AddTagToInferredSpan("peer.service", lp.GetServiceName()) + span := lp.completeInferredSpan(span0, endTime, isError) + spans = append(spans, span) + log.Debugf("[lifecycle] The inferred span attributes are: %v", lp.GetInferredSpan()) + } else { + log.Debug("[lifecyle] Failed to complete inferred span due to a missing start time. 
Please check that the event payload was received with the appropriate data") + } + return spans +} diff --git a/pkg/serverless/invocationlifecycle/lifecycle_test.go b/pkg/serverless/invocationlifecycle/lifecycle_test.go index e33d574035dd7..b7ee5aaa3057d 100644 --- a/pkg/serverless/invocationlifecycle/lifecycle_test.go +++ b/pkg/serverless/invocationlifecycle/lifecycle_test.go @@ -379,6 +379,123 @@ func TestCompleteInferredSpanWithOutStartTime(t *testing.T) { completedInferredSpan := tracePayload.TracerPayload.Chunks[0].Spans[0] assert.Equal(t, startInvocationTime.UnixNano(), completedInferredSpan.Start) } + +func TestTimeoutExecutionSpan(t *testing.T) { + t.Setenv(functionNameEnvVar, "my-function") + t.Setenv("DD_SERVICE", "mock-lambda-service") + + extraTags := &logs.Tags{ + Tags: []string{"functionname:test-function"}, + } + demux := createDemultiplexer(t) + defer demux.Stop(false) + mockDetectLambdaLibrary := func() bool { return false } + + var tracePayload *api.Payload + mockProcessTrace := func(payload *api.Payload) { + tracePayload = payload + } + + testProcessor := LifecycleProcessor{ + ExtraTags: extraTags, + ProcessTrace: mockProcessTrace, + DetectLambdaLibrary: mockDetectLambdaLibrary, + Demux: demux, + InferredSpansEnabled: true, + } + startTime := time.Now() + duration := 1 * time.Second + endTime := startTime.Add(duration) + startDetails := InvocationStartDetails{ + StartTime: time.Now(), + InvokeEventRawPayload: []byte(`{}`), + InvokedFunctionARN: "arn:aws:lambda:us-east-1:123456789012:function:my-function", + } + testProcessor.OnInvokeStart(&startDetails) + + timeoutCtx := &InvocationEndDetails{ + RequestID: "test-request-id", + Runtime: "java11", + ColdStart: false, + ProactiveInit: false, + EndTime: endTime, + IsError: true, + IsTimeout: true, + ResponseRawPayload: nil, + } + testProcessor.OnInvokeEnd(timeoutCtx) + + spans := tracePayload.TracerPayload.Chunks[0].Spans + assert.Equal(t, 1, len(spans)) + // No trace context passed + assert.NotZero(t, testProcessor.GetExecutionInfo().TraceID) + assert.Equal(t, uint64(0), testProcessor.GetExecutionInfo().SpanID) + assert.Equal(t, int32(-128), tracePayload.TracerPayload.Chunks[0].Priority) + // New trace ID and span ID has been created + assert.NotEqual(t, uint64(0), spans[0].TraceID) + assert.NotEqual(t, uint64(0), spans[0].SpanID) + assert.Equal(t, spans[0].TraceID, testProcessor.GetExecutionInfo().TraceID) + assert.Equal(t, spans[0].Error, int32(1)) + assert.Equal(t, spans[0].GetMeta()["request_id"], "test-request-id") + assert.Equal(t, spans[0].GetMeta()["language"], "java") +} + +func TestTimeoutExecutionSpanWithTraceContext(t *testing.T) { + t.Setenv(functionNameEnvVar, "my-function") + t.Setenv("DD_SERVICE", "mock-lambda-service") + + extraTags := &logs.Tags{ + Tags: []string{"functionname:test-function"}, + } + demux := createDemultiplexer(t) + defer demux.Stop(false) + mockDetectLambdaLibrary := func() bool { return false } + + var tracePayload *api.Payload + mockProcessTrace := func(payload *api.Payload) { + tracePayload = payload + } + + testProcessor := LifecycleProcessor{ + ExtraTags: extraTags, + ProcessTrace: mockProcessTrace, + DetectLambdaLibrary: mockDetectLambdaLibrary, + Demux: demux, + InferredSpansEnabled: true, + } + eventPayload := `a5a{"resource":"/users/create","path":"/users/create","httpMethod":"GET","headers":{"Accept":"*/*","Accept-Encoding":"gzip","x-datadog-parent-id":"1480558859903409531","x-datadog-sampling-priority":"1","x-datadog-trace-id":"5736943178450432258"}}0` + startTime := time.Now() 
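+	// The eventPayload above is wrapped in junk bytes ("a5a" ... "0") resembling raw
+	// chunked-transfer framing; the payload parser is expected to strip everything
+	// outside the outermost JSON braces so the embedded x-datadog-* headers can be
+	// read as the incoming trace context. The duration/endTime below model a
+	// one-second invocation that ends in a timeout.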
+ duration := 1 * time.Second + endTime := startTime.Add(duration) + startDetails := InvocationStartDetails{ + StartTime: startTime, + InvokeEventRawPayload: []byte(eventPayload), + InvokedFunctionARN: "arn:aws:lambda:us-east-1:123456789012:function:my-function", + } + testProcessor.OnInvokeStart(&startDetails) + timeoutCtx := &InvocationEndDetails{ + RequestID: "test-request-id", + Runtime: "java11", + ColdStart: false, + ProactiveInit: false, + EndTime: endTime, + IsError: true, + IsTimeout: true, + ResponseRawPayload: nil, + } + testProcessor.OnInvokeEnd(timeoutCtx) + + spans := tracePayload.TracerPayload.Chunks[0].Spans + assert.Equal(t, 1, len(spans)) + // Trace context received + assert.Equal(t, spans[0].GetTraceID(), testProcessor.GetExecutionInfo().TraceID) + assert.Equal(t, spans[0].GetParentID(), testProcessor.GetExecutionInfo().parentID) + assert.Equal(t, tracePayload.TracerPayload.Chunks[0].Priority, int32(testProcessor.GetExecutionInfo().SamplingPriority)) + assert.Equal(t, spans[0].Error, int32(1)) + assert.Equal(t, spans[0].GetMeta()["request_id"], "test-request-id") + assert.Equal(t, spans[0].GetMeta()["language"], "java") +} + func TestTriggerTypesLifecycleEventForAPIGatewayRest(t *testing.T) { startDetails := &InvocationStartDetails{ InvokeEventRawPayload: getEventFromFile("api-gateway.json"), diff --git a/pkg/serverless/invocationlifecycle/trace.go b/pkg/serverless/invocationlifecycle/trace.go index 6ed2344b1014f..cfd545ed144f4 100644 --- a/pkg/serverless/invocationlifecycle/trace.go +++ b/pkg/serverless/invocationlifecycle/trace.go @@ -18,6 +18,7 @@ import ( "github.com/DataDog/datadog-agent/pkg/config" pb "github.com/DataDog/datadog-agent/pkg/proto/pbgo/trace" + "github.com/DataDog/datadog-agent/pkg/serverless/random" "github.com/DataDog/datadog-agent/pkg/serverless/trace/inferredspan" "github.com/DataDog/datadog-agent/pkg/trace/api" "github.com/DataDog/datadog-agent/pkg/trace/info" @@ -76,18 +77,29 @@ func (lp *LifecycleProcessor) startExecutionSpan(event interface{}, rawPayload [ // It should be called at the end of the invocation. 
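+// On a timeout the tracer may never have reported trace/span IDs for the
+// invocation; in that case endExecutionSpan falls back to generating fresh
+// random IDs (see the IsTimeout branches below) so a root execution span can
+// still be emitted.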
func (lp *LifecycleProcessor) endExecutionSpan(endDetails *InvocationEndDetails) *pb.Span { executionContext := lp.GetExecutionInfo() - duration := endDetails.EndTime.UnixNano() - executionContext.startTime.UnixNano() + start := executionContext.startTime.UnixNano() + + traceID := executionContext.TraceID + spanID := executionContext.SpanID + // If we fail to receive the trace and span IDs from the tracer during a timeout we create it ourselves + if endDetails.IsTimeout && traceID == 0 { + traceID = random.Random.Uint64() + lp.requestHandler.executionInfo.TraceID = traceID + } + if endDetails.IsTimeout && spanID == 0 { + spanID = random.Random.Uint64() + } executionSpan := &pb.Span{ Service: "aws.lambda", // will be replaced by the span processor Name: "aws.lambda", Resource: os.Getenv(functionNameEnvVar), Type: "serverless", - TraceID: executionContext.TraceID, - SpanID: executionContext.SpanID, + TraceID: traceID, + SpanID: spanID, ParentID: executionContext.parentID, - Start: executionContext.startTime.UnixNano(), - Duration: duration, + Start: start, + Duration: endDetails.EndTime.UnixNano() - start, Meta: lp.requestHandler.triggerTags, Metrics: lp.requestHandler.triggerMetrics, } @@ -110,17 +122,19 @@ func (lp *LifecycleProcessor) endExecutionSpan(endDetails *InvocationEndDetails) } else { capturePayloadAsTags(requestPayloadJSON, executionSpan, "function.request", 0, capturePayloadMaxDepth) } - responsePayloadJSON := make(map[string]interface{}) - if err := json.Unmarshal(endDetails.ResponseRawPayload, &responsePayloadJSON); err != nil { - log.Debugf("[lifecycle] Failed to parse response payload: %v", err) - executionSpan.Meta["function.response"] = string(endDetails.ResponseRawPayload) - } else { - capturePayloadAsTags(responsePayloadJSON, executionSpan, "function.response", 0, capturePayloadMaxDepth) + if endDetails.ResponseRawPayload != nil { + responsePayloadJSON := make(map[string]interface{}) + if err := json.Unmarshal(endDetails.ResponseRawPayload, &responsePayloadJSON); err != nil { + log.Debugf("[lifecycle] Failed to parse response payload: %v", err) + executionSpan.Meta["function.response"] = string(endDetails.ResponseRawPayload) + } else { + capturePayloadAsTags(responsePayloadJSON, executionSpan, "function.response", 0, capturePayloadMaxDepth) + } } } - if endDetails.IsError { executionSpan.Error = 1 + if len(endDetails.ErrorMsg) > 0 { executionSpan.Meta["error.msg"] = endDetails.ErrorMsg } @@ -130,6 +144,11 @@ func (lp *LifecycleProcessor) endExecutionSpan(endDetails *InvocationEndDetails) if len(endDetails.ErrorStack) > 0 { executionSpan.Meta["error.stack"] = endDetails.ErrorStack } + + if endDetails.IsTimeout { + executionSpan.Meta["error.type"] = "Impending Timeout" + executionSpan.Meta["error.msg"] = "Datadog detected an Impending Timeout" + } } return executionSpan diff --git a/pkg/serverless/invocationlifecycle/trace_test.go b/pkg/serverless/invocationlifecycle/trace_test.go index 0b925f9a25be6..6b45d32755165 100644 --- a/pkg/serverless/invocationlifecycle/trace_test.go +++ b/pkg/serverless/invocationlifecycle/trace_test.go @@ -649,6 +649,54 @@ func TestEndExecutionSpanWithError(t *testing.T) { assert.Equal(t, executionSpan.Error, int32(1)) } +func TestEndExecutionSpanWithTimeout(t *testing.T) { + t.Setenv(functionNameEnvVar, "TestFunction") + currentExecutionInfo := &ExecutionStartInfo{} + lp := &LifecycleProcessor{ + requestHandler: &RequestHandler{ + executionInfo: currentExecutionInfo, + triggerTags: make(map[string]string), + }, + } + + startTime := time.Now() 
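+	// startExecutionSpan is called with an empty "[]" payload and empty headers, so
+	// no incoming trace context is extracted (TraceID/SpanID stay zero); the
+	// assertions after endExecutionSpan verify that fresh non-zero IDs are generated
+	// for the timed-out invocation.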
+ startDetails := &InvocationStartDetails{ + StartTime: startTime, + InvokeEventHeaders: http.Header{}, + } + lp.startExecutionSpan(nil, []byte("[]"), startDetails) + + assert.Zero(t, currentExecutionInfo.TraceID) + assert.Zero(t, currentExecutionInfo.SpanID) + + duration := 1 * time.Second + endTime := startTime.Add(duration) + + endDetails := &InvocationEndDetails{ + EndTime: endTime, + IsError: true, + IsTimeout: true, + RequestID: "test-request-id", + ResponseRawPayload: nil, + ColdStart: true, + ProactiveInit: false, + Runtime: "dotnet6", + } + executionSpan := lp.endExecutionSpan(endDetails) + assert.Equal(t, "aws.lambda", executionSpan.Name) + assert.Equal(t, "aws.lambda", executionSpan.Service) + assert.Equal(t, "TestFunction", executionSpan.Resource) + assert.Equal(t, "serverless", executionSpan.Type) + assert.Equal(t, "dotnet", executionSpan.Meta["language"]) + assert.Equal(t, lp.requestHandler.executionInfo.TraceID, executionSpan.TraceID) + assert.NotZero(t, executionSpan.TraceID) + assert.NotZero(t, executionSpan.SpanID) + assert.Equal(t, startTime.UnixNano(), executionSpan.Start) + assert.Equal(t, duration.Nanoseconds(), executionSpan.Duration) + assert.Equal(t, "Impending Timeout", executionSpan.Meta["error.type"]) + assert.Equal(t, "Datadog detected an Impending Timeout", executionSpan.Meta["error.msg"]) +} + func TestParseLambdaPayload(t *testing.T) { assert.Equal(t, []byte(""), ParseLambdaPayload([]byte(""))) assert.Equal(t, []byte("{}"), ParseLambdaPayload([]byte("{}"))) diff --git a/pkg/serverless/serverless.go b/pkg/serverless/serverless.go index 091494b15afce..24c04e22a08ad 100644 --- a/pkg/serverless/serverless.go +++ b/pkg/serverless/serverless.go @@ -18,6 +18,7 @@ import ( "github.com/DataDog/datadog-agent/pkg/serverless/daemon" "github.com/DataDog/datadog-agent/pkg/serverless/flush" + "github.com/DataDog/datadog-agent/pkg/serverless/invocationlifecycle" "github.com/DataDog/datadog-agent/pkg/serverless/metrics" "github.com/DataDog/datadog-agent/pkg/serverless/registration" "github.com/DataDog/datadog-agent/pkg/serverless/tags" @@ -139,6 +140,10 @@ func WaitForNextInvocation(stopCh chan struct{}, daemon *daemon.Daemon, id regis metricTags = tags.AddInitTypeTag(metricTags) metrics.SendTimeoutEnhancedMetric(metricTags, daemon.MetricAgent.Demux) metrics.SendErrorsEnhancedMetric(metricTags, time.Now(), daemon.MetricAgent.Demux) + + if daemon.IsExecutionSpanIncomplete() { + finishTimeoutExecutionSpan(daemon, coldStartTags.IsColdStart, coldStartTags.IsProactiveInit) + } } err := daemon.ExecutionContext.SaveCurrentExecutionContext() if err != nil { @@ -214,3 +219,20 @@ func removeQualifierFromArn(functionArn string) string { } return functionArn } + +func finishTimeoutExecutionSpan(daemon *daemon.Daemon, isColdStart bool, isProactiveInit bool) { + ecs := daemon.ExecutionContext.GetCurrentState() + timeoutDetails := &invocationlifecycle.InvocationEndDetails{ + RequestID: ecs.LastRequestID, + Runtime: ecs.Runtime, + ColdStart: isColdStart, + ProactiveInit: isProactiveInit, + EndTime: time.Now(), + IsError: true, + IsTimeout: true, + ResponseRawPayload: nil, + } + log.Debug("Could not complete the execution span due to a timeout. 
Attempting to finish the span without details from the tracer.") + daemon.InvocationProcessor.OnInvokeEnd(timeoutDetails) + daemon.SetExecutionSpanIncomplete(false) +} diff --git a/pkg/serverless/serverless_test.go b/pkg/serverless/serverless_test.go index ccd144ea939bd..14bd868ab6548 100644 --- a/pkg/serverless/serverless_test.go +++ b/pkg/serverless/serverless_test.go @@ -15,6 +15,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/DataDog/datadog-agent/pkg/serverless/daemon" + "github.com/DataDog/datadog-agent/pkg/serverless/invocationlifecycle" + "github.com/DataDog/datadog-agent/pkg/serverless/trace" + "github.com/DataDog/datadog-agent/pkg/trace/testutil" ) func TestMain(m *testing.M) { @@ -69,3 +72,40 @@ func TestRemoveQualifierFromArnWithoutAlias(t *testing.T) { functionArn := removeQualifierFromArn(invokedFunctionArn) assert.Equal(t, functionArn, invokedFunctionArn) } + +type mockLifecycleProcessor struct { + isError bool + isTimeout bool + isColdStart bool + isProactiveInit bool +} + +func (m *mockLifecycleProcessor) GetExecutionInfo() *invocationlifecycle.ExecutionStartInfo { + return &invocationlifecycle.ExecutionStartInfo{} +} +func (m *mockLifecycleProcessor) OnInvokeStart(*invocationlifecycle.InvocationStartDetails) {} +func (m *mockLifecycleProcessor) OnInvokeEnd(endDetails *invocationlifecycle.InvocationEndDetails) { + m.isError = endDetails.IsError + m.isTimeout = endDetails.IsTimeout + m.isColdStart = endDetails.ColdStart + m.isProactiveInit = endDetails.ProactiveInit +} + +func TestFinishTimeoutExecutionSpan(t *testing.T) { + port := testutil.FreeTCPPort(t) + d := daemon.StartDaemon(fmt.Sprintf("127.0.0.1:%d", port)) + d.TraceAgent = &trace.ServerlessTraceAgent{} + mock := &mockLifecycleProcessor{} + d.InvocationProcessor = mock + defer d.Stop() + + assert.False(t, d.IsExecutionSpanIncomplete()) + d.SetExecutionSpanIncomplete(true) + assert.True(t, d.IsExecutionSpanIncomplete()) + finishTimeoutExecutionSpan(d, true, true) + assert.False(t, d.IsExecutionSpanIncomplete()) + assert.True(t, mock.isError) + assert.True(t, mock.isTimeout) + assert.True(t, mock.isColdStart) + assert.True(t, mock.isProactiveInit) +}
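
Note on the privileged-helper protocol used by the updater changes in this patch: the new backup-file/restore-file, docker, and ld.so.preload operations reuse the same JSON command channel as the existing systemd commands (executeCommandStruct in systemd.go marshals a privilegeCommand, helper/main.go validates and executes it). The sketch below shows that round trip; it is an illustration rather than code from the patch, and it assumes the agent configuration lives at /etc/datadog-agent/datadog.yaml (the datadogConfigPath constant is defined outside the hunks shown above).

package main

import (
	"encoding/json"
	"fmt"
)

// privilegeCommand mirrors the struct shared by systemd.go and helper/main.go above.
type privilegeCommand struct {
	Command string `json:"command,omitempty"`
	Unit    string `json:"unit,omitempty"`
	Path    string `json:"path,omitempty"`
}

func main() {
	// Roughly what backupAgentConfig() hands to the privileged helper.
	raw, err := json.Marshal(privilegeCommand{
		Command: "backup-file",
		Path:    "/etc/datadog-agent/datadog.yaml", // assumed value of datadogConfigPath
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(raw))
	// Output: {"command":"backup-file","path":"/etc/datadog-agent/datadog.yaml"}
	//
	// helper/main.go then checks that the path is absolute and lives under
	// /opt/datadog-packages or /etc/datadog-agent before running:
	//   cp -f /etc/datadog-agent/datadog.yaml /etc/datadog-agent/datadog.yaml.bak
}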