From 9ade1cf7aa25482afeed7c5353a719cd44d372b2 Mon Sep 17 00:00:00 2001 From: Matt Cotter Date: Thu, 21 Nov 2024 15:21:39 -0600 Subject: [PATCH] feat: add new diagnostic check to validate the agent status (#126) ### Description OB-32327 add new diagnostic check to validate the agent status, update integration tests to separate config initialization from running the diagnose check ### Checklist - [x] Created tests which fail without the change (if possible) - [ ] Extended the README / documentation, if necessary --- integration/README.md | 121 ++++----- integration/scripts/test_configure.py | 146 +++------- integration/scripts/test_diagnose.py | 69 +++++ integration/scripts/test_ec2_connection.py | 134 +++++---- integration/scripts/test_install.py | 146 +++++----- integration/scripts/test_start.py | 228 ++++++++-------- integration/scripts/test_version.py | 146 +++++----- integration/scripts/utils.py | 257 +++++++++++++----- integration/tests/integration.tftest.hcl | 32 ++- .../commands/diagnose/agentstatuscheck.go | 59 ++++ .../commands/diagnose/agentstatuscheck.tmpl | 7 + internal/commands/diagnose/authcheck.go | 9 +- internal/commands/diagnose/authcheck.tmpl | 11 +- internal/commands/diagnose/configcheck.go | 12 +- internal/commands/diagnose/configcheck.tmpl | 9 +- .../commands/diagnose/configcheck_test.go | 3 +- internal/commands/diagnose/diagnose.go | 22 +- internal/commands/diagnose/otelconfigcheck.go | 13 +- .../commands/diagnose/otelconfigcheck.tmpl | 6 +- internal/commands/start/start.go | 9 +- internal/commands/version/version.go | 13 +- internal/connections/confighandler.go | 2 +- internal/root/root.go | 8 +- main_windows.go | 2 +- packaging/linux/config/observe-agent.yaml | 4 +- 25 files changed, 863 insertions(+), 605 deletions(-) create mode 100755 integration/scripts/test_diagnose.py create mode 100644 internal/commands/diagnose/agentstatuscheck.go create mode 100644 internal/commands/diagnose/agentstatuscheck.tmpl diff --git a/integration/README.md b/integration/README.md index b01a54c9c..fcf6a68e0 100644 --- a/integration/README.md +++ b/integration/README.md @@ -1,35 +1,35 @@ -## Integration Tests - +# Integration Tests The root of this module location is intended to run integration tests using the terraform test framework. The tests are located at `integration/tests` -The tests are run using the `terraform test -verbose` command from this folder `observe-agent/integration` +The tests are run using the `terraform test -verbose` command from this folder `observe-agent/integration` -When the above command is run, the tests in the `integration/tests` directory are ran using the variables provided. The tests are ran in the order of the run blocks provided in `.tftest.hcl` +When the above command is run, the tests in the `integration/tests` directory are ran using the variables provided. The tests are ran in the order of the run blocks provided in `.tftest.hcl` Generally a test will do the following for any given EC2 Machine: + - Create a machine using the variables provided below in `us-west-1` -- Run a test using `observeinc/collection/aws//modules/testing/exec` module to accept python scripts located at `integration/tests/scripts` +- Run a test using `observeinc/collection/aws//modules/testing/exec` module to accept python scripts located at `integration/tests/scripts` -### Pre-requisites +## Pre-requisites -Ensure you have the following: -- Built version of the agent ( in `observe-repos/observe-agent/dist`) using `go-releaser` +Ensure you have the following: + +- Built version of the agent ( in `observe-agent/dist`) using `go-releaser` - Blunderdome Admin Access in AWS (used to assume the `gh-observe_agent-repo` role for testing ) -- Observe Collection URL & Datastream Token to test with -- Generated Private & public key pair ( name to `test_key.pub` & `test_key.pem`) +- Observe Collection URL & Datastream Token to test with +- Generated Private & public key pair ( name to `test_key.pub` & `test_key.pem`) - Terraform provider overide and terraform variables (see below section on how to do this) - **Building the agent**: -``` +```sh observe-agent git:(nikhil/update-RM) ✗ goreleaser release --snapshot --clean --verbose ``` If agent distributable is not built, you may get the following message: -``` +```txt │ Error: Test assertion failed │ │ on tests/integration.tftest.hcl line 65, in run "test_install": @@ -40,78 +40,69 @@ If agent distributable is not built, you may get the following message: │ Error in Installation Test ``` - -**SSH Key Pairs**: +**SSH Key Pairs**: Generate in PEM format for the OpenSSH Key that will be used by Terraform EC2 Modules: -``` +```sh ssh-keygen -m PEM ``` For more info on generating SSH keys, see [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-key-pairs.html#how-to-generate-your-own-key-and-import-it-to-aws) -Ensure you have `./test_key.pub` and `./test_key.pem` in the `integration` directory. This can also be changed and specifed in the `integration/tests.auto.tfvars` file. - -Ensure the extensions are correct! +Ensure you have `./test_key.pub` and `./test_key.pem` in the `integration` directory. This can also be changed and specifed in the `integration/tests.auto.tfvars` file. +Ensure the extensions are correct! **AWS UI Access (optional)**: -For AWS UI access for viewing machines: -1. Login to Britive Blunderdome +For AWS UI access for viewing machines: -2. Navigate to AWS Console +1. Login to Britive Blunderdome -

- AWS Console -

+2. Navigate to AWS Console -3. Click Switch role and input details for `nikhil-ps-account` which is the member account for integration testing within Blunderdome. Details are the following: -

- Member Account -

+ ![AWS Console](screenshots/aws-console.png) -4. You can now access EC2 machines and download key pairs (same as what Github Actions workflow uses) +3. Click Switch role and input details for `nikhil-ps-account` which is the member account for integration testing within Blunderdome. Details are the following: -5. Ensure you're in `us-west-1` + ![Member Account](screenshots/member-account.png) +4. You can now access EC2 machines and download key pairs (same as what Github Actions workflow uses) +5. Ensure you're in `us-west-1` -### Terraform Variables +## Terraform Variables -The tests are run using the following variables. These can be set in the `integration/tests.auto.tfvars` file for local testing. +The tests are run using the following variables. These can be set in the `integration/tests.auto.tfvars` file for local testing. -``` -name_format = "tf-observe-agent-test-%s" -AWS_MACHINE= "AMAZON_LINUX_2023" #Choose the AWS Machine to run the tests on +```terraform +name_format = "tf-observe-agent-test-%s" +AWS_MACHINE = "AMAZON_LINUX_2023" #Choose the AWS Machine to run the tests on PUBLIC_KEY_PATH = "./test_key.pub" #Path to Public Key for EC2 PRIVATE_KEY_PATH = "./test_key.pem" #Path to Private Key for EC2 -OBSERVE_URL = "https://" #Observe URL to use for testing -OBSERVE_TOKEN =" -``` +OBSERVE_URL = "https://" #Observe URL to use for testing +OBSERVE_TOKEN = " +``` -The PUBLIC & PRIVATE key pair can be generated by following the instructions in the "SSH Key Pairs" section above +The PUBLIC & PRIVATE key pair can be generated by following the instructions in the "SSH Key Pairs" section above Example of what the `integrations` folder contains after creating `.*tfvars` file and SSH Key Pair: +![SSH Example](screenshots/ssh-example.png)] -

- SSH Example -

- - -### Terraform Provider +## Terraform Provider Note: You must also set the provider correctly. We use the following settings: + - Region: `us-west-1` - Profile: `blunderdome` -- IAM Role Assumed: `gh-observe_agent-repo` +- IAM Role Assumed: `gh-observe_agent-repo` - The above role has permissions to create and destroy EC2 instances. See `modules/setup_aws_backend/role.tf` for more details. The provider can be directly set in the `integration/tests/integration.tftest.hcl` as below: -``` +```terraform provider "aws" { region = "us-west-1" # Specify the AWS region profile = "blunderdome" @@ -126,19 +117,14 @@ or through a `provider_override.tf` placed in `modules/create_ec2` directory. Example of this: -

- SSH Example -

- - - +![Provider Example](screenshots/provider-example.png) > [!NOTE] -> For Terraform to access and assume the role properly, you MUST be logged into Blunderdome Admin in console and have the correct permissions! +> For Terraform to access and assume the role properly, you MUST be logged into Blunderdome Admin in console and have the correct permissions! Example of this: -``` +```sh observe git:(master) ✗ s/aws-creds checkout blunderdome Checked out 'AWS Blunderdome Organization/460044344528 (observe-blunderdome)/BritiveBlunderdome-FullAWSAdmin' into awscli profile 'blunderdome' @@ -146,14 +132,13 @@ Checked out 'AWS Blunderdome Organization/460044344528 (observe-blunderdome)/Bri observe git:(master) ✗ export AWS_PROFILE=blunderdome ``` - -### Local Testing (without terraform test) +## Local Testing (without terraform test) Any of the python scripts in the `/scripts` directory can be tested by running them directly, granted an EC2 Machine exists. As the scripts rely on the outputs of `create_ec2` and `setup_observe_variables` modules to be passed in as environment variables, these environment variables can be manually set if the set up modules are not ran. -The `/scripts/.py` expects the following environment variables to be set: -``` +```sh HOST="54.177.249.99" #HOST IP Address USER="ubuntu" #HOST user to login as KEY_FILENAME="./test_key.pem" #Private path to key @@ -162,19 +147,20 @@ MACHINE_CONFIG="ami_description:Ubuntu Server 22.04 LTS (HVM)- EBS General Purpo OBSERVE_URL="" #Observe URL to use for testing OBSERVE_TOKEN="" #Observe Token to use for testing PASSWORD="WindowsPassword to be used for testing" # Set to None for testing - ``` Run the scripts from the folder as below: -``` + +```sh ➜ integration git:(nikhil/integration-testing-windows) ✗ pwd /Users/nikhil.dua/Documents/observe-repos/observe-agent/integration -➜ integration git:(nikhil/integration-testing-windows) ✗ python3 scripts/test_installation.py +➜ integration git:(nikhil/integration-testing-windows) ✗ python3 scripts/test_install.py ``` -Note: If testing Windows machines, the RDP password is redacted by default in the python scripts. +Note: If testing Windows machines, the RDP password is redacted by default in the python scripts. This can be turned off when disabling mask by setting below environment variable to `False` before running these scripts -``` + +```sh export MASK=False python3 scripts/test_ec2_connection.py ------------------------------ @@ -185,9 +171,6 @@ Env vars set to: Testing SSH connection to host 54.177.26.178 with timeout 120s ``` -### Architecture +## Architecture The architecture diagram can be found ![here](screenshots/Observe-Agent.png) - - - \ No newline at end of file diff --git a/integration/scripts/test_configure.py b/integration/scripts/test_configure.py index 3d1be31f0..7fff2ec36 100755 --- a/integration/scripts/test_configure.py +++ b/integration/scripts/test_configure.py @@ -1,128 +1,66 @@ #!/usr/bin/env python3 -import os -import sys -import re -import time import utils as u -@u.print_test_decorator -def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: - - """ - Test to validate connection of observe-agent to Observe - Args: - remote_host (Host): instance to ssh into - env_vars (dict): environment variables passed into for testing +@u.print_test_decorator +def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: + init_command = r'Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent init-config --token {} --observe_url {}'.format( + env_vars["observe_token"], env_vars["observe_url"] + ) - Raises: - ValueError: Something failed with initial config or observe-agent -> observe connection - """ - - init_command='Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent init-config --token {} --observe_url {}'.format(env_vars["observe_token"], env_vars["observe_url"]) - diagnose_command='Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent diagnose' - - #Set up correct config with observe url and token + # Set up correct config with observe url and token result = remote_host.run_command(init_command) + if result.exited != 0 or result.stderr: + u.print_remote_result(result) + raise ValueError("❌ Error in init-config") - #Check diagnose command - result = remote_host.run_command(diagnose_command) - observe_val = False - for line in result.stdout.splitlines(): - if "Request to test URL responded with response code 200" in line: - print (" ✅ observe-agent -> observe validation passed! ") - observe_val = True - break - if not observe_val: - print(result) - raise ValueError(f"❌ Failed: observe-agent -> observe validation") - - pass @u.print_test_decorator -def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: - docker_prefix='sudo docker run \ - --mount type=bind,source=/proc,target=/hostfs/proc,readonly \ - --mount type=bind,source=/snap,target=/hostfs/snap,readonly \ - --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \ - --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \ - --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \ - --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \ - --pid host \ - $(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep SNAPSHOT)' - - init_command='init-config --token {} --observe_url {}'.format(env_vars["observe_token"], env_vars["observe_url"]) - diagnose_command='diagnose' - - #Set up correct config with observe url and token - result = remote_host.run_command(docker_prefix + ' ' + init_command) - - #Check diagnose command - result = remote_host.run_command(docker_prefix + ' ' + diagnose_command) - observe_val = False - for line in result.stdout.splitlines(): - if "Request to test URL responded with response code 200" in line: - print (" ✅ observe-agent -> observe validation passed! ") - observe_val = True - break - if not observe_val: - print(result) - raise ValueError(f"❌ Failed: observe-agent -> observe validation") +def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: + docker_prefix = u.get_docker_prefix(remote_host, False) + init_command = "{} init-config --token {} --observe_url {}".format( + docker_prefix, env_vars["observe_token"], env_vars["observe_url"] + ) + # Set up correct config with observe url and token + result = remote_host.run_command(init_command) + if result.exited != 0 or result.stderr: + u.print_remote_result(result) + raise ValueError("❌ Error in init-config") - pass @u.print_test_decorator -def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: - - """ - Test to validate connection of observe-agent to Observe +def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: + init_command = "sudo observe-agent init-config --token {} --observe_url {}".format( + env_vars["observe_token"], env_vars["observe_url"] + ) - Args: - remote_host (Host): instance to ssh into - env_vars (dict): environment variables passed into for testing - - Raises: - ValueError: Something failed with initial config or observe-agent -> observe connection - """ - - init_command='sudo observe-agent init-config --token {} --observe_url {}'.format(env_vars["observe_token"], env_vars["observe_url"]) - diagnose_command='observe-agent diagnose' - - #Set up correct config with observe url and token + # Set up correct config with observe url and token result = remote_host.run_command(init_command) + if result.exited != 0 or result.stderr: + u.print_remote_result(result) + raise ValueError("❌ Error in init-config") - #Check diagnose command - result = remote_host.run_command(diagnose_command) - observe_val = False - for line in result.stdout.splitlines(): - if "Request to test URL responded with response code 200" in line: - print (" ✅ observe-agent -> observe validation passed! ") - observe_val = True - break - if not observe_val: - print(result) - raise ValueError(f"❌ Failed: observe-agent -> observe validation") - -if __name__ == '__main__': +if __name__ == "__main__": env_vars = u.get_env_vars(need_observe=True) - remote_host = u.Host(host_ip=env_vars["host"], - username=env_vars["user"], - key_file_path=env_vars["key_filename"], - password=env_vars["password"]) - - #Test SSH Connection before starting test of interest - remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) - - if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]: + remote_host = u.Host( + host_ip=env_vars["host"], + username=env_vars["user"], + key_file_path=env_vars["key_filename"], + password=env_vars["password"], + ) + + # Test SSH Connection before starting test of interest + remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) + + if ( + "redhat" in env_vars["machine_config"]["distribution"] + or "debian" in env_vars["machine_config"]["distribution"] + ): run_test_linux(remote_host, env_vars) elif "windows" in env_vars["machine_config"]["distribution"]: run_test_windows(remote_host, env_vars) elif "docker" in env_vars["machine_config"]["distribution"]: run_test_docker(remote_host, env_vars) - - pass - - diff --git a/integration/scripts/test_diagnose.py b/integration/scripts/test_diagnose.py new file mode 100755 index 000000000..3abe96724 --- /dev/null +++ b/integration/scripts/test_diagnose.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +from fabric import Result + +import re +import utils as u + + +def _check_diagnose_result(result: Result) -> bool: + passed = re.search(r"All \d+ checks passed", result.stdout) is not None + if passed: + print(" ✅ observe-agent -> observe validation passed! ") + else: + u.print_remote_result(result) + raise ValueError( + f"❌ Failed: observe-agent -> observe validation (regex on diagnose output did not match)" + ) + + +@u.print_test_decorator +def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: + diagnose_command = r'Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent diagnose' + + # Check diagnose command + result = remote_host.run_command(diagnose_command) + _check_diagnose_result(result) + + +@u.print_test_decorator +def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: + container_id = u.get_docker_container(remote_host) + exec_prefix = f"sudo docker exec {container_id} ./observe-agent" + diagnose_command = exec_prefix + " diagnose" + + # Check diagnose command + result = remote_host.run_command(diagnose_command) + _check_diagnose_result(result) + + +@u.print_test_decorator +def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: + diagnose_command = "observe-agent diagnose" + + # Check diagnose command + result = remote_host.run_command(diagnose_command) + _check_diagnose_result(result) + + +if __name__ == "__main__": + + env_vars = u.get_env_vars() + remote_host = u.Host( + host_ip=env_vars["host"], + username=env_vars["user"], + key_file_path=env_vars["key_filename"], + password=env_vars["password"], + ) + + # Test SSH Connection before starting test of interest + remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) + + if ( + "redhat" in env_vars["machine_config"]["distribution"] + or "debian" in env_vars["machine_config"]["distribution"] + ): + run_test_linux(remote_host, env_vars) + elif "windows" in env_vars["machine_config"]["distribution"]: + run_test_windows(remote_host, env_vars) + elif "docker" in env_vars["machine_config"]["distribution"]: + run_test_docker(remote_host, env_vars) diff --git a/integration/scripts/test_ec2_connection.py b/integration/scripts/test_ec2_connection.py index 7670b3364..e03592506 100755 --- a/integration/scripts/test_ec2_connection.py +++ b/integration/scripts/test_ec2_connection.py @@ -4,77 +4,88 @@ import os import sys import re -import time +import time import utils as u - -@u.print_test_decorator -def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: +@u.print_test_decorator +def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: """ - This test validates that the UserdataExecution.log finished successfully + This test validates that the UserdataExecution.log finished successfully and ec2 instance is in stable state prior to running other Args: - remote_host (Host): instance to ssh into + remote_host (Host): instance to ssh into env_vars (dict): environment variables passed into for testing Raises: RuntimeError: Failed to verify UserdataExecution.log or agent.logfile """ - - + tmp_file = "/tmp/UserdataExecution.log" - cloud_init_file_timeout = 240 # 4 minutes - - if "2022" in env_vars["machine_name"]: #Windows 2022 - Test windows cloud-init file finished successfully + cloud_init_file_timeout = 240 # 4 minutes + + if ( + "2022" in env_vars["machine_name"] + ): # Windows 2022 - Test windows cloud-init file finished successfully print("Windows 2022 detected") - cloud_init_file = r'/C:/ProgramData/Amazon/EC2Launch/log/agent.log' - - for _ in range(cloud_init_file_timeout): - remote_host.get_file(cloud_init_file, tmp_file) # This command will automatically test connection - with open(tmp_file) as file: #No encoding for windows 2022 needed + cloud_init_file = r"/C:/ProgramData/Amazon/EC2Launch/log/agent.log" + + for _ in range(cloud_init_file_timeout): + remote_host.get_file( + cloud_init_file, tmp_file + ) # This command will automatically test connection + with open(tmp_file) as file: # No encoding for windows 2022 needed content = file.read().lower() - if "script execution finished successfully" in content: + if "script execution finished successfully" in content: print(" ✅ Verified agent.log had completed successfully!") - return + return else: print(" Looking for the agent.log file to finish completing...") - time.sleep(1) - raise RuntimeError("❌ The agent.log file did not finish successfully in time") - else: # Windows 2016/2019 - Test windows cloud-init file finished successfully + time.sleep(1) + raise RuntimeError("❌ The agent.log file did not finish successfully in time") + else: # Windows 2016/2019 - Test windows cloud-init file finished successfully print("Windows 2016 or 2019 detected") - cloud_init_file = r'/C:/ProgramData/Amazon/EC2-Windows/Launch/Log/UserdataExecution.log' - - for _ in range(cloud_init_file_timeout): - remote_host.get_file(cloud_init_file, tmp_file) # This command will automatically test connection + cloud_init_file = ( + r"/C:/ProgramData/Amazon/EC2-Windows/Launch/Log/UserdataExecution.log" + ) + + for _ in range(cloud_init_file_timeout): + remote_host.get_file( + cloud_init_file, tmp_file + ) # This command will automatically test connection with open(tmp_file, encoding="utf-16") as file: content = file.read().lower() - if "user data script completed" in content: + if "user data script completed" in content: print(" ✅ Verified UserdataExecution had completed successfully!") - return + return else: - print(" Looking for the UserdataExecution.log file to finish completing...") - time.sleep(1) - raise RuntimeError("❌ The UserdataExecution file did not finish successfully in time") + print( + " Looking for the UserdataExecution.log file to finish completing..." + ) + time.sleep(1) + raise RuntimeError( + "❌ The UserdataExecution file did not finish successfully in time" + ) + @u.print_test_decorator -def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: - #Since our test is being done on a linux EC2, we can just check it initializes and runs similar to linux test +def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: + # Since our test is being done on a linux EC2, we can just check it initializes and runs similar to linux test run_test_linux(remote_host, env_vars) - pass - + pass + @u.print_test_decorator -def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: +def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: """ - This test validates that the cloud-init file finished successfully + This test validates that the cloud-init file finished successfully and ec2 instance is in stable state prior to running other Args: - remote_host (Host): instance to ssh into + remote_host (Host): instance to ssh into env_vars (dict): environment variables passed into for testing Raises: @@ -83,34 +94,41 @@ def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: cloud_init_file = "/var/log/cloud-init-output.log" tmp_file = "/tmp/cloud-init-output.log" - cloud_init_file_timeout = 240 # 4 minutes + cloud_init_file_timeout = 240 # 4 minutes - #Test cloud-init file finished successfully - for _ in range(cloud_init_file_timeout): - remote_host.get_file(cloud_init_file, tmp_file) # This command will automatically test connection + # Test cloud-init file finished successfully + for _ in range(cloud_init_file_timeout): + remote_host.get_file( + cloud_init_file, tmp_file + ) # This command will automatically test connection with open(tmp_file, "r") as file: content = file.read().lower() - if "finished at" in content: + if "finished at" in content: print(" ✅ Verified cloud-init file had completed successfully!") - return + return else: - print(" Looking for the cloud-init file to finish completing...") - time.sleep(1) - raise RuntimeError("❌ The cloud-init file did not finish successfully in time") + print(" Looking for the cloud-init file to finish completing...") + time.sleep(1) + raise RuntimeError("❌ The cloud-init file did not finish successfully in time") + + +if __name__ == "__main__": -if __name__ == '__main__': - env_vars = u.get_env_vars() - remote_host = u.Host(host_ip=env_vars["host"], - username=env_vars["user"], - key_file_path=env_vars["key_filename"], - password=env_vars["password"]) - - #Test SSH Connection before starting test of interest - remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) - - - if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]: + remote_host = u.Host( + host_ip=env_vars["host"], + username=env_vars["user"], + key_file_path=env_vars["key_filename"], + password=env_vars["password"], + ) + + # Test SSH Connection before starting test of interest + remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) + + if ( + "redhat" in env_vars["machine_config"]["distribution"] + or "debian" in env_vars["machine_config"]["distribution"] + ): run_test_linux(remote_host, env_vars) elif "windows" in env_vars["machine_config"]["distribution"]: run_test_windows(remote_host, env_vars) diff --git a/integration/scripts/test_install.py b/integration/scripts/test_install.py index 9a622c6a6..503f6a5a8 100755 --- a/integration/scripts/test_install.py +++ b/integration/scripts/test_install.py @@ -4,8 +4,8 @@ import os import sys import re -import time -import inspect +import time +import inspect import utils as u @@ -23,18 +23,20 @@ def _get_installation_package(env_vars: dict) -> tuple: """ current_dir = os.getcwd() - dist_directory = os.path.abspath(os.path.join(current_dir, '..', 'dist')) + dist_directory = os.path.abspath(os.path.join(current_dir, "..", "dist")) print(f"Path to 'dist' directory: {dist_directory}") # List files in the directory - files = os.listdir(dist_directory) + files = os.listdir(dist_directory) # Search criteria package_type = env_vars["machine_config"]["package_type"] architecture = env_vars["machine_config"]["architecture"] distribution = env_vars["machine_config"]["distribution"] - print(f"Looking for installation package '{package_type}' and architecture '{architecture}'") + print( + f"Looking for installation package '{package_type}' and architecture '{architecture}'" + ) # Iterate through files and find matches for filename in files: @@ -45,60 +47,76 @@ def _get_installation_package(env_vars: dict) -> tuple: full_path = os.path.join(dist_directory, filename) print(f"Found matching file {filename} at: {full_path}") return filename, full_path - u.die(f"❌ No matching file found for {distribution},{architecture},{package_type} in {dist_directory}: {', '.join(files)}") + u.die( + f"❌ No matching file found for {distribution},{architecture},{package_type} in {dist_directory}: {', '.join(files)}" + ) @u.print_test_decorator -def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: - +def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: """ - Test to install local observe-agent on a windows ec2 instance and validate command ran successfully + Test to install local observe-agent on a windows ec2 instance and validate command ran successfully Args: - remote_host (Host): instance to ssh into + remote_host (Host): instance to ssh into env_vars (dict): environment variables passed into for testing Raises: RuntimeError: Installation error in powershell script """ - # Get built dist. installation package path for machine - filename, full_path = _get_installation_package(env_vars) - - # Set windows home dir paths for consistency - home_dir = r"/C:/Users/{}".format(env_vars["user"]) #for user in sftp - home_dir_powershell = r"C:\Users\{}".format(env_vars["user"]) #for use in powershell script - - # Find agent installation script path - current_script_dir = os.path.dirname(os.path.abspath(__file__)) - ps_installation_script_path = os.path.join(current_script_dir, 'install_windows.ps1') - - - # Copy built distribution package to remote host home dir - remote_host.put_file(local_path=full_path, remote_path=home_dir) #Eg: sftp to /C:/Users/Adminstrator/observe-agent_Windows_x86_64.zip - - # Copy observe-agent powershell installation script to remote host home dir - remote_host.put_file(local_path=ps_installation_script_path, remote_path=home_dir) #Eg: sftp to /C:/Users/Adminstrator/install_windows.ps1 + # Get built dist. installation package path for machine + filename, full_path = _get_installation_package(env_vars) + + # Set windows home dir paths for consistency + home_dir = r"/C:/Users/{}".format(env_vars["user"]) # for user in sftp + home_dir_powershell = r"C:\Users\{}".format( + env_vars["user"] + ) # for use in powershell script + + # Find agent installation script path + current_script_dir = os.path.dirname(os.path.abspath(__file__)) + ps_installation_script_path = os.path.join( + current_script_dir, "install_windows.ps1" + ) + + # Copy built distribution package to remote host home dir + remote_host.put_file( + local_path=full_path, remote_path=home_dir + ) # Eg: sftp to /C:/Users/Adminstrator/observe-agent_Windows_x86_64.zip + + # Copy observe-agent powershell installation script to remote host home dir + remote_host.put_file( + local_path=ps_installation_script_path, remote_path=home_dir + ) # Eg: sftp to /C:/Users/Adminstrator/install_windows.ps1 # Run install script and pass in distribution package path # Eg: .\install_windows.ps1 -local_installer C:\Users\Adminstrator\observe-agent_Windows_x86_64.zip - # observe-agent gets installed to C:\Program Files\observe-agent on ec2 machine - result = remote_host.run_command('.\install_windows.ps1 -local_installer {}\{}'.format(home_dir_powershell, filename)) + # observe-agent gets installed to C:\Program Files\observe-agent on ec2 machine + result = remote_host.run_command( + r".\install_windows.ps1 -local_installer {}\{}".format( + home_dir_powershell, filename + ) + ) print(result) - - if result.stderr: #Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr - raise RuntimeError("❌ Installation error in install_windows.ps1 powershell script") - else: + + if ( + result.stderr + ): # Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr + raise RuntimeError( + "❌ Installation error in install_windows.ps1 powershell script" + ) + else: print("✅ Installation test passed") - + @u.print_test_decorator -def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: +def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: - filename, full_path= _get_installation_package(env_vars) + filename, full_path = _get_installation_package(env_vars) home_dir = "/home/{}".format(env_vars["user"]) remote_host.put_file(full_path, home_dir) - result = remote_host.run_command('sudo docker load --input {}'.format(filename)) + result = remote_host.run_command("sudo docker load --input {}".format(filename)) if result.stderr: print(result) raise RuntimeError("❌ Installation error in docker load") @@ -107,51 +125,53 @@ def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: @u.print_test_decorator -def run_test_linux(remote_host: u.Host, env_vars: dict): +def run_test_linux(remote_host: u.Host, env_vars: dict): """ - Test to install local observe-agent on a linux ec2 instance and validate command ran successfully + Test to install local observe-agent on a linux ec2 instance and validate command ran successfully Args: - remote_host (Host): instance to ssh into + remote_host (Host): instance to ssh into env_vars (dict): environment variables passed into for testing Raises: - RuntimeError: Unknown distribution type passed + RuntimeError: Unknown distribution type passed """ - filename, full_path= _get_installation_package(env_vars) + filename, full_path = _get_installation_package(env_vars) home_dir = "/home/{}".format(env_vars["user"]) remote_host.put_file(full_path, home_dir) if "redhat" in env_vars["machine_config"]["distribution"]: - result = remote_host.run_command('cd ~ && sudo yum localinstall {} -y'.format(filename)) - elif "debian" in env_vars["machine_config"]["distribution"] : - result = remote_host.run_command('cd ~ && sudo dpkg -i {}'.format(filename)) + result = remote_host.run_command( + "cd ~ && sudo yum localinstall {} -y".format(filename) + ) + elif "debian" in env_vars["machine_config"]["distribution"]: + result = remote_host.run_command("cd ~ && sudo dpkg -i {}".format(filename)) else: - raise RuntimeError("❌ Unknown distribution type") - - print(result) + raise RuntimeError("❌ Unknown distribution type") + + print(result) print("✅ Installation test passed") +if __name__ == "__main__": -if __name__ == '__main__': - env_vars = u.get_env_vars() - remote_host = u.Host(host_ip=env_vars["host"], - username=env_vars["user"], - key_file_path=env_vars["key_filename"], - password=env_vars["password"]) - - #Test SSH Connection before starting test of interest - remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) - - if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]: + remote_host = u.Host( + host_ip=env_vars["host"], + username=env_vars["user"], + key_file_path=env_vars["key_filename"], + password=env_vars["password"], + ) + + # Test SSH Connection before starting test of interest + remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) + + if ( + "redhat" in env_vars["machine_config"]["distribution"] + or "debian" in env_vars["machine_config"]["distribution"] + ): run_test_linux(remote_host, env_vars) elif "windows" in env_vars["machine_config"]["distribution"]: run_test_windows(remote_host, env_vars) elif "docker" in env_vars["machine_config"]["distribution"]: run_test_docker(remote_host, env_vars) - - - - diff --git a/integration/scripts/test_start.py b/integration/scripts/test_start.py index 6f437a46f..cf7aab88f 100755 --- a/integration/scripts/test_start.py +++ b/integration/scripts/test_start.py @@ -2,161 +2,165 @@ import os import sys import re -import time +import time import pprint import utils as u from collections import defaultdict -def _check_status_loop(remote_host: u.Host, start_timeout: int, status_command: str) -> bool: + +def _check_status_loop( + remote_host: u.Host, start_timeout: int, status_command: str +) -> bool: """Run Check Status Command in a loop to wait for observe-agent to start Args: - remote_host (Host): instance to ssh into - start_timeout (int): timeout in seconds to wait for agent to start - status_command (str): windows/linux status command to run + remote_host (Host): instance to ssh into + start_timeout (int): timeout in seconds to wait for agent to start + status_command (str): windows/linux status command to run Returns: bool: agent_status """ - - - agent_status=False - for _ in range(start_timeout): - metrics_dict = defaultdict(list) + + agent_status = False + for _ in range(start_timeout): + metrics_dict = defaultdict(list) + try: result = remote_host.run_command(status_command) - for line in result.stdout.splitlines(): - if ":" in line: - metric, value = line.split(":", 1) - metric = metric.strip() - value = value.strip() - metrics_dict[metric].append(value) - print(line) - if metrics_dict["Status"] and metrics_dict["Status"][0] == "Running": - print("✅ Observe Agent is active and running without errors!") - agent_status=True - break - else: - print("❌ Observe Agent is not running. Retry Count is {}/{}...".format(_+1, start_timeout)) - time.sleep(1) + except Exception as e: + print("Ignoring exception: ", e) + time.sleep(1) + continue + for line in result.stdout.splitlines(): + if ":" in line: + metric, value = line.split(":", 1) + metric = metric.strip() + value = value.strip() + metrics_dict[metric].append(value) + print(line) + if metrics_dict["Status"] and metrics_dict["Status"][0] == "Running": + print("✅ Observe Agent is active and running without errors!") + agent_status = True + break + print( + "❌ Observe Agent is not running. Retry Count is {}/{}...".format( + _ + 1, start_timeout + ) + ) + time.sleep(1) return agent_status - + @u.print_test_decorator -def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: +def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: """ - Test to check if observe-agent is running correctly + Test to check if observe-agent is running correctly Args: - remote_host (Host): instance to ssh into - env_vars (dict): environment variables passed into for testing - - """ - - #status - start_command=r'.\start_agent_windows.ps1' - status_command=r'Get-Service ObserveAgent;Set-Location "${Env:Programfiles}\Observe\observe-agent"; ./observe-agent status' - start_timeout = 30 #how long to wait for observe-agent to start - - # Get windows home dir paths for consistency - home_dir = r"/C:/Users/{}".format(env_vars["user"]) #for user in sftp - - # Find start agent script path - current_script_dir = os.path.dirname(os.path.abspath(__file__)) - ps_installation_script_path = os.path.join(current_script_dir, 'start_agent_windows.ps1') - - #Copy start_agent powershell installation script to remote host home dir - remote_host.put_file(local_path=ps_installation_script_path, remote_path=home_dir) #Eg: sftp to /C:/Users/Adminstrator/install_windows.ps1 - # Run start_agent script + remote_host (Host): instance to ssh into + env_vars (dict): environment variables passed into for testing + + """ + + # status + start_command = r".\start_agent_windows.ps1" + status_command = r'Get-Service ObserveAgent;Set-Location "${Env:Programfiles}\Observe\observe-agent"; ./observe-agent status' + start_timeout = 30 # how long to wait for observe-agent to start + + # Get windows home dir paths for consistency + home_dir = r"/C:/Users/{}".format(env_vars["user"]) # for user in sftp + + # Find start agent script path + current_script_dir = os.path.dirname(os.path.abspath(__file__)) + ps_installation_script_path = os.path.join( + current_script_dir, "start_agent_windows.ps1" + ) + + # Copy start_agent powershell installation script to remote host home dir + remote_host.put_file( + local_path=ps_installation_script_path, remote_path=home_dir + ) # Eg: sftp to /C:/Users/Adminstrator/install_windows.ps1 + # Run start_agent script result = remote_host.run_command(start_command) print(result) - if result.stderr: #Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr - raise RuntimeError("❌ Error in start_agent_windows.ps1 powershell script") - - #Check Agent Status - agent_status=_check_status_loop(remote_host, start_timeout, status_command) + if ( + result.stderr + ): # Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr + raise RuntimeError("❌ Error in start_agent_windows.ps1 powershell script") + + # Check Agent Status + agent_status = _check_status_loop(remote_host, start_timeout, status_command) if not agent_status: - u.die("❌ Error in Observe Agent Status Test ") + u.die("❌ Error in Observe Agent Status Test ") + @u.print_test_decorator -def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: - - docker_prefix='sudo docker run -d --restart always \ - --mount type=bind,source=/proc,target=/hostfs/proc,readonly \ - --mount type=bind,source=/snap,target=/hostfs/snap,readonly \ - --mount type=bind,source=/boot,target=/hostfs/boot,readonly \ - --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \ - --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \ - --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \ - --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \ - --pid host \ - $(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep SNAPSHOT)' - start_command='start' - get_container_command = ( - "sudo docker ps --filter \"status=running\" --format \"{{.ID}} {{.Image}} {{.CreatedAt}}\" | " - "grep \"SNAPSHOT\" | sort -k3 -r | head -n 1 | awk '{print $1}'" - ) - start_timeout = 30 #how long to wait for observe-agent to start - - #Start Observe Agent - result = remote_host.run_command(docker_prefix + ' ' + start_command) - if result.stderr: +def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: + docker_prefix = u.get_docker_prefix(remote_host, True) + start_command = "start" + start_timeout = 30 # how long to wait for observe-agent to start + + # Start Observe Agent + result = remote_host.run_command(docker_prefix + " " + start_command) + if result.stderr: u.die("❌ Error starting observe-agent container") + else: + print("✅ Observe Agent started successfully: " + result.stdout) - #Get Observe Agent Container ID - container_id = remote_host.run_command(get_container_command) - status_command='sudo docker exec {} ./observe-agent status'.format(container_id.stdout.strip()) - if not container_id: - u.die("❌ Error in finding observe-agent container") + # Get Observe Agent Container ID + container_id = u.get_docker_container(remote_host) + status_command = f"sudo docker exec {container_id} ./observe-agent status" - #Check Agent Status - agent_status=_check_status_loop(remote_host, start_timeout, status_command) - if not agent_status: + # Check Agent Status + agent_status = _check_status_loop(remote_host, start_timeout, status_command) + if not agent_status: u.die("❌ Error in Observe Agent Status Test ") @u.print_test_decorator -def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: - - """ - Test to check if observe-agent is running correctly +def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: + """ + Test to check if observe-agent is running correctly Args: - remote_host (Host): instance to ssh into - env_vars (dict): environment variables passed into for testing + remote_host (Host): instance to ssh into + env_vars (dict): environment variables passed into for testing - """ + """ - start_command='sudo systemctl enable --now observe-agent' - status_command='observe-agent status' - start_timeout = 30 #how long to wait for observe-agent to start + start_command = "sudo systemctl enable --now observe-agent" + status_command = "observe-agent status" + start_timeout = 30 # how long to wait for observe-agent to start + # Start Observe Agent + remote_host.run_command(start_command) - #Start Observe Agent - remote_host.run_command(start_command) - - #Check Agent Status - agent_status=_check_status_loop(remote_host, start_timeout, status_command) - if not agent_status: + # Check Agent Status + agent_status = _check_status_loop(remote_host, start_timeout, status_command) + if not agent_status: u.die("❌ Error in Observe Agent Status Test ") - -if __name__ == '__main__': + +if __name__ == "__main__": env_vars = u.get_env_vars() - remote_host = u.Host(host_ip=env_vars["host"], - username=env_vars["user"], - key_file_path=env_vars["key_filename"], - password=env_vars["password"]) - - #Test SSH Connection before starting test of interest - remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) - - if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]: + remote_host = u.Host( + host_ip=env_vars["host"], + username=env_vars["user"], + key_file_path=env_vars["key_filename"], + password=env_vars["password"], + ) + + # Test SSH Connection before starting test of interest + remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) + + if ( + "redhat" in env_vars["machine_config"]["distribution"] + or "debian" in env_vars["machine_config"]["distribution"] + ): run_test_linux(remote_host, env_vars) elif "windows" in env_vars["machine_config"]["distribution"]: run_test_windows(remote_host, env_vars) elif "docker" in env_vars["machine_config"]["distribution"]: run_test_docker(remote_host, env_vars) - - diff --git a/integration/scripts/test_version.py b/integration/scripts/test_version.py index c8d44a59c..962f97806 100755 --- a/integration/scripts/test_version.py +++ b/integration/scripts/test_version.py @@ -1,142 +1,138 @@ #!/usr/bin/env python3 import os -import sys import re -import time import utils as u + def _extract_version_config(result: any) -> tuple: - """Extract version name and config file from ssh result output + """Extract version name and config file from ssh result output Args: result (any): ssh result output Returns: - tuple: config_file, version of the installed observe-agent package + tuple: config_file, version of the installed observe-agent package """ - - # Split the output by newlines and extract everything after the colon - for line in result.stdout.splitlines(): - if ":" in line: - _, version = line.split(":", 1) - version = version.strip() # Remove leading/trailing whitespace - print(f"Version: {version}") - for line in result.stderr.splitlines(): - if ":" in line: - _, config_file = line.split(":", 1) - config_file = config_file.strip() # Remove leading/trailing whitespace - print(f"Config File: {config_file}") + + # Split the output by newlines and extract everything after the colon + version_match = re.search(r"version: (.*)(?:\n|$)", result.stdout) + if version_match is not None: + version = version_match.group(1).strip() + else: + raise ValueError( + f"❌ Failed: observe-agent version output did not match regex. Output: {result.stdout}" + ) + + config_match = re.search(r"config file: (.*)(?:\n|$)", result.stdout) + if config_match is not None: + config_file = config_match.group(1).strip() + else: + raise ValueError( + f"❌ Failed: observe-agent version output did not match regex. Output: {result.stdout}" + ) + print(f"Version: {version}, Config File: {config_file}") return config_file, version @u.print_test_decorator -def run_test_windows(remote_host:u.Host, env_vars: dict) -> None: - +def run_test_windows(remote_host: u.Host, env_vars: dict) -> None: """ - Test to validate observe-agent version and config file loaded is correct + Test to validate observe-agent version and config file loaded is correct Args: - remote_host (Host): instance to ssh into + remote_host (Host): instance to ssh into env_vars (dict): environment variables passed into for testing Raises: ValueError: if version or config file is invalid """ - config_file_windows = 'C:\\Program Files\\Observe\\observe-agent\\observe-agent.yaml' - #Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2 - version_pattern = re.compile(r'^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$') + config_file_windows = ( + "C:\\Program Files\\Observe\\observe-agent\\observe-agent.yaml" + ) + # Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2 + version_pattern = re.compile(r"^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$") - result = remote_host.run_command('Set-Location "${Env:Programfiles}\\Observe\\observe-agent"; ./observe-agent version') + result = remote_host.run_command( + 'Set-Location "${Env:Programfiles}\\Observe\\observe-agent"; ./observe-agent version' + ) config_file, version = _extract_version_config(result) - + if config_file != config_file_windows: raise ValueError(f" ❌ Invalid config file: {config_file}") if not version_pattern.match(version): raise ValueError(f" ❌ Invalid version: {version}") - print (" ✅ Verified version and config file succesfully! ") + print(" ✅ Verified version and config file succesfully! ") - pass + pass @u.print_test_decorator -def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: - docker_prefix='sudo docker run \ - --mount type=bind,source=/proc,target=/hostfs/proc,readonly \ - --mount type=bind,source=/snap,target=/hostfs/snap,readonly \ - --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \ - --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \ - --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \ - --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \ - --pid host \ - $(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep SNAPSHOT)' - config_file_linux = '/etc/observe-agent/observe-agent.yaml' - version_pattern = re.compile(r'^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$') - home_dir = "/home/{}".format(env_vars["user"]) - - # Upload default observe-agent.yaml to remote host home dir - # mount via $(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml - observe_agent_file_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'packaging/linux/config/observe-agent.yaml')) - print(f"Path to 'observe-agent.yaml' file: {observe_agent_file_path }") - remote_host.put_file(local_path=observe_agent_file_path, remote_path=home_dir) - - #Run command to get version & config-file info - result = remote_host.run_command('{} version'.format(docker_prefix)) +def run_test_docker(remote_host: u.Host, env_vars: dict) -> None: + u.upload_default_docker_config(env_vars, remote_host) + docker_prefix = u.get_docker_prefix(remote_host, False) + config_file_linux = "/etc/observe-agent/observe-agent.yaml" + version_pattern = re.compile(r"^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$") + + # Run command to get version & config-file info + result = remote_host.run_command(docker_prefix + " version") config_file, version = _extract_version_config(result) - - if config_file != config_file_linux: + if config_file != config_file_linux: raise ValueError(f" ❌ Invalid config file: {config_file}") if not version_pattern.match(version): raise ValueError(f" ❌ Invalid version: {version}") - print (" ✅ Verified version and config file succesfully! ") + print(" ✅ Verified version and config file succesfully! ") -@u.print_test_decorator -def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: +@u.print_test_decorator +def run_test_linux(remote_host: u.Host, env_vars: dict) -> None: """ - Test to validate observe-agent version and config file loaded is correct + Test to validate observe-agent version and config file loaded is correct Args: - remote_host (Host): instance to ssh into + remote_host (Host): instance to ssh into env_vars (dict): environment variables passed into for testing Raises: ValueError: if version or config file is invalid """ - config_file_linux = '/etc/observe-agent/observe-agent.yaml' - #Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2 - version_pattern = re.compile(r'^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$') - - result = remote_host.run_command('observe-agent version') + config_file_linux = "/etc/observe-agent/observe-agent.yaml" + # Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2 + version_pattern = re.compile(r"^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$") + + result = remote_host.run_command("observe-agent version") config_file, version = _extract_version_config(result) - + if config_file != config_file_linux: raise ValueError(f" ❌ Invalid config file: {config_file}") if not version_pattern.match(version): raise ValueError(f" ❌ Invalid version: {version}") - print (" ✅ Verified version and config file succesfully! ") + print(" ✅ Verified version and config file succesfully! ") -if __name__ == '__main__': +if __name__ == "__main__": env_vars = u.get_env_vars() - remote_host = u.Host(host_ip=env_vars["host"], - username=env_vars["user"], - key_file_path=env_vars["key_filename"], - password=env_vars["password"]) - - #Test SSH Connection before starting test of interest - remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) - - if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]: + remote_host = u.Host( + host_ip=env_vars["host"], + username=env_vars["user"], + key_file_path=env_vars["key_filename"], + password=env_vars["password"], + ) + + # Test SSH Connection before starting test of interest + remote_host.test_conection(int(env_vars["machine_config"]["sleep"])) + + if ( + "redhat" in env_vars["machine_config"]["distribution"] + or "debian" in env_vars["machine_config"]["distribution"] + ): run_test_linux(remote_host, env_vars) elif "windows" in env_vars["machine_config"]["distribution"]: run_test_windows(remote_host, env_vars) elif "docker" in env_vars["machine_config"]["distribution"]: run_test_docker(remote_host, env_vars) - - diff --git a/integration/scripts/utils.py b/integration/scripts/utils.py index 6fe677d11..2bb2f6090 100644 --- a/integration/scripts/utils.py +++ b/integration/scripts/utils.py @@ -1,37 +1,47 @@ +from typing import Any, Dict from socket import error as socket_error -from fabric import Connection +from fabric import Connection, Result from paramiko.ssh_exception import AuthenticationException, NoValidConnectionsError import os import sys -import re -import time -import json -import pprint +import time def die(message: str) -> None: print(message, file=sys.stderr) sys.exit(1) -def mask_credentials(env_vars): + +def print_remote_result(result: Result) -> None: + print(str(result)) + + +def mask_credentials(env_vars: Dict[str, Any]) -> Dict[str, Any]: masked_env_vars = env_vars.copy() - #Only mask if vars exist - if masked_env_vars["password"] and masked_env_vars["password"] is not None and masked_env_vars["password"] != "None" : - masked_env_vars["password"] = '*' * 5 - if masked_env_vars["observe_token"] and masked_env_vars["observe_token"] is not None and masked_env_vars["observe_token"] != "None": - masked_env_vars["observe_token"] = '*' * 5 + # Only mask if vars exist + if ( + masked_env_vars["password"] + and masked_env_vars["password"] is not None + and masked_env_vars["password"] != "None" + ): + masked_env_vars["password"] = "*" * 5 + if ( + masked_env_vars["observe_token"] + and masked_env_vars["observe_token"] is not None + and masked_env_vars["observe_token"] != "None" + ): + masked_env_vars["observe_token"] = "*" * 5 return masked_env_vars -def get_env_vars(need_observe: bool = False) -> dict: - +def get_env_vars(need_observe: bool = False) -> Dict[str, Any]: """Gets environmental variables from OS and returns a dict of env_vars Args: need_observe (bool, optional): whether or not to require observe url/token variables. - Defaults to False. + Defaults to False. Returns: _type_: dict of environment variables @@ -39,45 +49,60 @@ def get_env_vars(need_observe: bool = False) -> dict: host = os.environ.get("HOST") user = os.environ.get("USER") key_filename = os.environ.get("KEY_FILENAME") - password=os.environ.get("PASSWORD") - machine_name=os.environ.get("MACHINE_NAME") - machine_config_string=os.environ.get("MACHINE_CONFIG") - observe_url=os.environ.get("OBSERVE_URL") - observe_token=os.environ.get("OBSERVE_TOKEN") + password = os.environ.get("PASSWORD") + machine_name = os.environ.get("MACHINE_NAME") + machine_config_string = os.environ.get("MACHINE_CONFIG") + observe_url = os.environ.get("OBSERVE_URL") + observe_token = os.environ.get("OBSERVE_TOKEN") mask = os.getenv("MASK", "True").lower() not in ("false", "0", "f", "no", "n") - if host is None: - die("Error: HOST environment variable is not set. This should be an output variable from create_ec2 module") + die( + "Error: HOST environment variable is not set. This should be an output variable from create_ec2 module" + ) if user is None: - die("Error: USER environment variable is not set. This should be an output variable from create_ec2 module") + die( + "Error: USER environment variable is not set. This should be an output variable from create_ec2 module" + ) if key_filename is None: - die("Error: KEY_FILENAME environment variable is not set. This should be an output variable from create_ec2 module") + die( + "Error: KEY_FILENAME environment variable is not set. This should be an output variable from create_ec2 module" + ) - if (password == 'None' or password is None) and "WINDOWS" in machine_name: - die("Error: Windows is specified but PASSWORD environment variable is not set. This should be an output variable from create_ec2 module") + if (password == "None" or password is None) and "WINDOWS" in machine_name: + die( + "Error: Windows is specified but PASSWORD environment variable is not set. This should be an output variable from create_ec2 module" + ) if machine_name is None: - die("Error: MACHINE_NAME environment variable is not set. This should be an output variable from create_ec2 module") + die( + "Error: MACHINE_NAME environment variable is not set. This should be an output variable from create_ec2 module" + ) if machine_config_string is None: - die("Error: MACHINE_CONFIG environment variable is not set. This should be an output variable from create_ec2 module") + die( + "Error: MACHINE_CONFIG environment variable is not set. This should be an output variable from create_ec2 module" + ) if observe_url is None and need_observe: - die("Error: OBSERVE_URL environment variable is not set. This should be an output variable from setup_observe_variables module") + die( + "Error: OBSERVE_URL environment variable is not set. This should be an output variable from setup_observe_variables module" + ) if observe_token is None and need_observe: - die("Error: OBSERVE_TOKEN environment variable is not set. This should be an output variable from setup_observe_variables module") + die( + "Error: OBSERVE_TOKEN environment variable is not set. This should be an output variable from setup_observe_variables module" + ) - # Split the string into key-value pairs - pairs = machine_config_string.split(',') + # Split the string into key-value pairs + pairs = machine_config_string.split(",") data = {} for pair in pairs: - key, value = pair.split(':', 1) # + key, value = pair.split(":", 1) # data[key] = value - + env_vars = { "host": host, "user": user, @@ -86,20 +111,20 @@ def get_env_vars(need_observe: bool = False) -> dict: "machine_name": machine_name, "machine_config": data, "observe_url": observe_url, - "observe_token": observe_token + "observe_token": observe_token, } # Mask sensitive vars before printing masked_env_vars = mask_credentials(env_vars) - print("-"*30) + print("-" * 30) if mask: print("Masking Enabled") - print("Env vars set to: \n", masked_env_vars ) + print("Env vars set to: \n", masked_env_vars) else: print("Masking Disabled") - print("Env vars set to: \n", env_vars ) - print("-"*30) + print("Env vars set to: \n", env_vars) + print("-" * 30) return env_vars @@ -112,51 +137,65 @@ def wrapper(*args, **kwargs): result = func(*args, **kwargs) print("*" * 30) return result + return wrapper -class ExampleException(Exception): #We can put our custom exceptions here + +class ExampleException(Exception): # We can put our custom exceptions here pass class Host(object): + """Host class for SSH into EC2 instances""" - """Host class for SSH into EC2 instances - """ - def __init__(self, host_ip, username, key_file_path,password=None): + def __init__(self, host_ip, username, key_file_path, password=None): self.host_ip = host_ip self.username = username self.key_file_path = key_file_path - self.password=password + self.password = password def _get_connection(self) -> Connection: - connect_kwargs = {'key_filename': self.key_file_path, - 'password': self.password , - 'timeout': 60, - } - return Connection(host=self.host_ip, user=self.username, port=22, - connect_kwargs=connect_kwargs) - - def run_command(self, command): + connect_kwargs = { + "key_filename": self.key_file_path, + "password": self.password, + "timeout": 60, + } + return Connection( + host=self.host_ip, + user=self.username, + port=22, + connect_kwargs=connect_kwargs, + ) + + def run_command(self, command) -> Result: try: with self._get_connection() as connection: - print('Running `{0}` on {1}'.format(command, self.host_ip)) - result = connection.run(command, warn=True, hide=True) + print("Running `{0}` on {1}".format(command, self.host_ip)) + result = connection.run(command, warn=True, hide=True) except (socket_error, AuthenticationException) as exc: self._raise_authentication_err(exc) if result.failed: raise ExampleException( - 'The command `{0}` on host {1} failed with the error: ' - '{2}'.format(command, self.host_ip, str(result.stderr))) - - return result + "The command `{0}` on host {1} failed with the error: " + "{2}\n\nCommand output: {3}".format( + command, + self.host_ip, + str(result.stderr) or "", + str(result.stdout) or "", + ) + ) + return result def put_file(self, local_path, remote_path) -> None: try: with self._get_connection() as connection: - print('Copying {0} to {1} on host {2}'.format( - local_path, remote_path, self.host_ip)) + print( + "Copying {0} to {1} on host {2}".format( + local_path, remote_path, self.host_ip + ) + ) connection.put(local_path, remote_path) except (socket_error, AuthenticationException) as exc: self._raise_authentication_err(exc) @@ -164,21 +203,25 @@ def put_file(self, local_path, remote_path) -> None: def get_file(self, remote_path, local_path) -> None: try: with self._get_connection() as connection: - print('Copying {0} to {1} from host {2}'.format( - remote_path, local_path, self.host_ip)) + print( + "Copying {0} to {1} from host {2}".format( + remote_path, local_path, self.host_ip + ) + ) connection.get(remote_path, local_path) except (socket_error, AuthenticationException) as exc: self._raise_authentication_err(exc) - def _raise_authentication_err(self, exc): + def _raise_authentication_err(self, exc) -> None: raise ExampleException( "SSH: could not connect to {host} " "(username: {user}, key: {key}): {exc}".format( - host=self.host_ip, user=self.username, - key=self.key_file_path, exc=exc)) - - def test_conection(self, timeout=60): - """Tests SSH connection to the host + host=self.host_ip, user=self.username, key=self.key_file_path, exc=exc + ) + ) + + def test_conection(self, timeout=60) -> None: + """Tests SSH connection to the host Args: timeout (int, optional): how long to wait for the connection to be established. Defaults to 60. @@ -186,16 +229,92 @@ def test_conection(self, timeout=60): Raises: RuntimeError: SSH connection failures if the timeout is reached and no valid connection found """ - print("Testing SSH connection to host {} with timeout {}s".format(self.host_ip, timeout)) + print( + "Testing SSH connection to host {} with timeout {}s".format( + self.host_ip, timeout + ) + ) for _ in range(timeout): connection = self._get_connection() try: connection.open() print("✅ SSH connection successful") connection.close() - return + return except (socket_error, NoValidConnectionsError) as exc: print(f"❌ SSH connection failed: {exc}") time.sleep(1) raise RuntimeError(" ❌ The SSH connection failed") + +def get_docker_image(remote_host: Host) -> str: + result = remote_host.run_command( + 'sudo docker images --format "{{.Repository}}:{{.Tag}}"' + ) + images = [line.strip() for line in result.stdout.splitlines() if "SNAPSHOT" in line] + if len(images) != 1: + die("❌ Error in finding observe-agent image\n" + str(result)) + + return images[0] + + +def get_docker_prefix(remote_host: Host, detach: bool) -> str: + image = get_docker_image(remote_host) + return f'sudo docker run {"-d --restart on-failure" if detach else ""} \ + --mount type=bind,source=/proc,target=/hostfs/proc,readonly \ + --mount type=bind,source=/snap,target=/hostfs/snap,readonly \ + --mount type=bind,source=/boot,target=/hostfs/boot,readonly \ + --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \ + --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \ + --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \ + --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \ + --pid host {image}' + + +def upload_default_docker_config(env_vars: dict, remote_host: Host) -> None: + home_dir = "/home/{}".format(env_vars["user"]) + # Upload default observe-agent.yaml to remote host home dir + # mount via $(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml + observe_agent_file_path = os.path.abspath( + os.path.join(os.getcwd(), "..", "packaging/linux/config/observe-agent.yaml") + ) + print(f"Path to 'observe-agent.yaml' file: {observe_agent_file_path }") + remote_host.put_file(local_path=observe_agent_file_path, remote_path=home_dir) + + +def get_docker_container(remote_host: Host) -> str: + get_container_command = 'sudo docker ps --filter "status=running" --format "{{.ID}} {{.Image}} {{.CreatedAt}}"' + result = remote_host.run_command(get_container_command) + running = [ + line.strip() for line in result.stdout.splitlines() if "SNAPSHOT" in line + ] + if len(running) == 0: + # No container matched our filter. Get logs from all containers to help debug. + result = remote_host.run_command('sudo docker ps --format "{{.ID}}"') + if result.stdout != "": + container_ids = result.stdout.splitlines() + for container_id in container_ids: + print( + "Logs for container {}:".format(container_id), + file=sys.stderr, + ) + result = remote_host.run_command( + "sudo docker logs {}".format(container_id) + ) + print_remote_result(result) + else: + print_remote_result(result) + die( + "❌ Error in finding observe-agent container; command output:\n{}\ncommand error:\n{}".format( + result.stdout or "", + result.stderr or "", + ) + ) + return "" + if len(running) > 1: + die( + "❌ Error in finding observe-agent container, too many snapshots running:\n" + + result.stdout + ) + # Only one snapshot running; return the ID from the first line. + return running[0].split()[0] diff --git a/integration/tests/integration.tftest.hcl b/integration/tests/integration.tftest.hcl index 01fac5a6b..7e69c7f46 100644 --- a/integration/tests/integration.tftest.hcl +++ b/integration/tests/integration.tftest.hcl @@ -15,7 +15,6 @@ run "setup_observe_variables" { } - run "test_ec2_connection" { module { source = "observeinc/collection/aws//modules/testing/exec" @@ -41,8 +40,6 @@ run "test_ec2_connection" { } - - run "test_install" { module { source = "observeinc/collection/aws//modules/testing/exec" @@ -68,8 +65,6 @@ run "test_install" { } - - run "test_version" { module { source = "observeinc/collection/aws//modules/testing/exec" @@ -95,8 +90,6 @@ run "test_version" { } - - run "test_configure" { module { source = "observeinc/collection/aws//modules/testing/exec" @@ -123,6 +116,7 @@ run "test_configure" { } } + run "test_start" { module { source = "observeinc/collection/aws//modules/testing/exec" @@ -147,3 +141,27 @@ run "test_start" { } } + +run "test_diagnose" { + module { + source = "observeinc/collection/aws//modules/testing/exec" + version = "2.9.0" + } + + variables { + command = "python3 ./scripts/test_diagnose.py" + env_vars = { + HOST = run.setup_ec2.public_ip + USER = run.setup_ec2.user_name + KEY_FILENAME = run.setup_ec2.private_key_path + PASSWORD = run.setup_ec2.password + MACHINE_NAME = run.setup_ec2.machine_name + MACHINE_CONFIG = run.setup_ec2.machine_config + } + } + + assert { + condition = output.error == "" + error_message = "Error in Diagnose Test" + } +} diff --git a/internal/commands/diagnose/agentstatuscheck.go b/internal/commands/diagnose/agentstatuscheck.go new file mode 100644 index 000000000..87a46a21f --- /dev/null +++ b/internal/commands/diagnose/agentstatuscheck.go @@ -0,0 +1,59 @@ +package diagnose + +import ( + "embed" + + "github.com/observeinc/observe-agent/internal/commands/status" + "github.com/spf13/viper" +) + +type StatusTestResult struct { + Passed bool + AgentRunning bool + Error string +} + +func checkStatus(_ *viper.Viper) (bool, any, error) { + data, err := status.GetStatusData() + if err != nil { + return false, StatusTestResult{ + Passed: false, + AgentRunning: false, + Error: err.Error(), + }, nil + } + if data.Status != status.Running.String() { + return false, StatusTestResult{ + Passed: false, + AgentRunning: false, + Error: "agent is not running", + }, nil + } + if data.AgentMetrics == (status.AgentMetrics{}) { + return false, StatusTestResult{ + Passed: false, + AgentRunning: true, + Error: "agent metrics are not available", + }, nil + } + return true, StatusTestResult{ + Passed: true, + AgentRunning: true, + }, nil +} + +const agentStatusCheckTemplate = "agentstatuscheck.tmpl" + +var ( + //go:embed agentstatuscheck.tmpl + agentStatusCheckTemplateFS embed.FS +) + +func agentstatusDiagnostic() Diagnostic { + return Diagnostic{ + check: checkStatus, + checkName: "Agent Status Check", + templateName: agentStatusCheckTemplate, + templateFS: agentStatusCheckTemplateFS, + } +} diff --git a/internal/commands/diagnose/agentstatuscheck.tmpl b/internal/commands/diagnose/agentstatuscheck.tmpl new file mode 100644 index 000000000..eef7b8efb --- /dev/null +++ b/internal/commands/diagnose/agentstatuscheck.tmpl @@ -0,0 +1,7 @@ +{{- if .Passed -}} +Observe agent is running and metrics are available. +{{- else if not .AgentRunning -}} +⚠️ Observe agent is not running. {{- if .Error }} Error: {{ .Error }}{{ end }} +{{- else -}} +⚠️ Observe agent status check failed with error: {{ .Error }} +{{- end -}} diff --git a/internal/commands/diagnose/authcheck.go b/internal/commands/diagnose/authcheck.go index e6c559eb8..09733682a 100644 --- a/internal/commands/diagnose/authcheck.go +++ b/internal/commands/diagnose/authcheck.go @@ -9,11 +9,6 @@ import ( "github.com/spf13/viper" ) -const ( - ChallengeURL = "https://175914298205.collect.observeinc.com/.well-known/fastly/logging/challenge" - AuthCheckURL = "https://175914298205.collect.observeinc.com/status" -) - type NetworkTestResult struct { URL string ResponseCode int @@ -69,11 +64,11 @@ func makeTestRequest(URL string, headers map[string]string) NetworkTestResult { } } -func makeAuthTestRequest(v *viper.Viper) (any, error) { +func makeAuthTestRequest(v *viper.Viper) (bool, any, error) { collector_url := v.GetString("observe_url") authToken := fmt.Sprintf("Bearer %s", v.GetString("token")) authTestResponse := makeTestRequest(collector_url, map[string]string{"Authorization": authToken}) - return authTestResponse, nil + return authTestResponse.Passed, authTestResponse, nil } // const networkcheckTemplate = "networkcheck.tmpl" diff --git a/internal/commands/diagnose/authcheck.tmpl b/internal/commands/diagnose/authcheck.tmpl index 9433bc852..4581ce6ac 100644 --- a/internal/commands/diagnose/authcheck.tmpl +++ b/internal/commands/diagnose/authcheck.tmpl @@ -1,13 +1,12 @@ Running auth check against {{ .URL }} -{{- if .Passed }} + +{{- if .Passed -}} Request to test URL responded with response code {{ .ResponseCode }} -{{- else }} -{{- if eq .ResponseCode 401 }} +{{- else if eq .ResponseCode 401 -}} ⚠️ Request to test URL failed with error {{ .Error }}. Remediation Please check that the token is present in the `observe-agent.yaml` config file and that the token is valid. -{{- else }} +{{- else -}} ⚠️ Request to test URL failed with error {{ .Error }} and response code {{ .ResponseCode }}. -{{- end }} -{{ end }} +{{- end -}} diff --git a/internal/commands/diagnose/configcheck.go b/internal/commands/diagnose/configcheck.go index 736cdfa35..9e4857b23 100644 --- a/internal/commands/diagnose/configcheck.go +++ b/internal/commands/diagnose/configcheck.go @@ -17,18 +17,18 @@ type ConfigTestResult struct { Error string } -func checkConfig(v *viper.Viper) (any, error) { +func checkConfig(v *viper.Viper) (bool, any, error) { configFile := v.ConfigFileUsed() if configFile == "" { - return nil, fmt.Errorf("no config file defined") + return false, nil, fmt.Errorf("no config file defined") } contents, err := os.ReadFile(configFile) if err != nil { - return nil, err + return false, nil, err } var conf config.AgentConfig if err = yaml.Unmarshal(contents, &conf); err != nil { - return ConfigTestResult{ + return false, ConfigTestResult{ ConfigFile: configFile, ParseSucceeded: false, IsValid: false, @@ -36,14 +36,14 @@ func checkConfig(v *viper.Viper) (any, error) { }, nil } if err = conf.Validate(); err != nil { - return ConfigTestResult{ + return false, ConfigTestResult{ ConfigFile: configFile, ParseSucceeded: true, IsValid: false, Error: err.Error(), }, nil } - return ConfigTestResult{ + return true, ConfigTestResult{ ConfigFile: configFile, ParseSucceeded: true, IsValid: true, diff --git a/internal/commands/diagnose/configcheck.tmpl b/internal/commands/diagnose/configcheck.tmpl index 6f804f8bd..e65d294ce 100644 --- a/internal/commands/diagnose/configcheck.tmpl +++ b/internal/commands/diagnose/configcheck.tmpl @@ -1,9 +1,10 @@ Running check on observe-agent config file {{ .ConfigFile }} -{{- if .IsValid }} + +{{- if .IsValid -}} Config file is valid. -{{- else if .ParseSucceeded}} +{{- else if .ParseSucceeded -}} ⚠️ Config file validation failed with error {{ .Error }} -{{- else }} +{{- else -}} ⚠️ Config file could not be parsed as YAML {{ .Error }} -{{- end }} +{{- end -}} diff --git a/internal/commands/diagnose/configcheck_test.go b/internal/commands/diagnose/configcheck_test.go index 01db3ccc7..686df8e1d 100644 --- a/internal/commands/diagnose/configcheck_test.go +++ b/internal/commands/diagnose/configcheck_test.go @@ -66,7 +66,7 @@ func Test_checkConfig(t *testing.T) { v := viper.New() v.SetConfigFile(f.Name()) - resultAny, err := checkConfig(v) + success, resultAny, err := checkConfig(v) assert.NoError(t, err) result, ok := resultAny.(ConfigTestResult) assert.True(t, ok) @@ -77,6 +77,7 @@ func Test_checkConfig(t *testing.T) { } assert.Equal(t, tc.shouldParse, result.ParseSucceeded) assert.Equal(t, tc.isValid, result.IsValid) + assert.Equal(t, tc.isValid && tc.shouldParse, success) assert.Equal(t, f.Name(), result.ConfigFile) } } diff --git a/internal/commands/diagnose/diagnose.go b/internal/commands/diagnose/diagnose.go index 16d65ae19..c2db1bccb 100644 --- a/internal/commands/diagnose/diagnose.go +++ b/internal/commands/diagnose/diagnose.go @@ -15,7 +15,7 @@ import ( ) type Diagnostic struct { - check func(*viper.Viper) (any, error) + check func(*viper.Viper) (bool, any, error) checkName string templateName string templateFS embed.FS @@ -24,6 +24,7 @@ type Diagnostic struct { var diagnostics = []Diagnostic{ configDiagnostic(), otelconfigDiagnostic(), + agentstatusDiagnostic(), authDiagnostic(), } @@ -35,10 +36,14 @@ var diagnoseCmd = &cobra.Command{ to attempt to identify issues that could cause the agent to function improperly.`, Run: func(cmd *cobra.Command, args []string) { v := viper.GetViper() - fmt.Print("Running diagnosis checks...\n") + fmt.Print("Running diagnosis checks...") + var failedChecks []string for _, diagnostic := range diagnostics { - fmt.Printf("\n%s\n================\n\n", diagnostic.checkName) - data, err := diagnostic.check(v) + fmt.Printf("\n\n\n%s\n==================\n", diagnostic.checkName) + success, data, err := diagnostic.check(v) + if !success { + failedChecks = append(failedChecks, diagnostic.checkName) + } if err != nil { fmt.Printf("⚠️ Failed to run check: %s\n", err.Error()) continue @@ -51,6 +56,15 @@ to attempt to identify issues that could cause the agent to function improperly. continue } } + if len(failedChecks) > 0 { + fmt.Printf("\n\n\n❌ %d out of %d checks failed:\n", len(failedChecks), len(diagnostics)) + for _, check := range failedChecks { + fmt.Printf(" - %s\n", check) + } + os.Exit(1) + } else { + fmt.Printf("\n✅ All %d checks passed!\n", len(diagnostics)) + } }, } diff --git a/internal/commands/diagnose/otelconfigcheck.go b/internal/commands/diagnose/otelconfigcheck.go index 5a33976b5..72a6886ce 100644 --- a/internal/commands/diagnose/otelconfigcheck.go +++ b/internal/commands/diagnose/otelconfigcheck.go @@ -5,6 +5,7 @@ import ( "embed" "github.com/observeinc/observe-agent/internal/commands/start" + logger "github.com/observeinc/observe-agent/internal/commands/util" "github.com/spf13/viper" "go.opentelemetry.io/collector/otelcol" ) @@ -14,10 +15,10 @@ type OtelConfigTestResult struct { Error string } -func checkOtelConfig(_ *viper.Viper) (any, error) { - colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings() +func checkOtelConfig(_ *viper.Viper) (bool, any, error) { + colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings(logger.WithCtx(context.Background(), logger.GetNop())) if err != nil { - return nil, err + return false, nil, err } if cleanup != nil { defer cleanup() @@ -26,16 +27,16 @@ func checkOtelConfig(_ *viper.Viper) (any, error) { // https://github.com/open-telemetry/opentelemetry-collector/blob/main/otelcol/command_validate.go col, err := otelcol.NewCollector(*colSettings) if err != nil { - return nil, err + return false, nil, err } err = col.DryRun(context.Background()) if err != nil { - return OtelConfigTestResult{ + return false, OtelConfigTestResult{ Passed: false, Error: err.Error(), }, nil } - return OtelConfigTestResult{ + return true, OtelConfigTestResult{ Passed: true, }, nil } diff --git a/internal/commands/diagnose/otelconfigcheck.tmpl b/internal/commands/diagnose/otelconfigcheck.tmpl index dbe90e51d..f2879ff02 100644 --- a/internal/commands/diagnose/otelconfigcheck.tmpl +++ b/internal/commands/diagnose/otelconfigcheck.tmpl @@ -1,5 +1,5 @@ -{{- if .Passed }} +{{- if .Passed -}} OTEL configuration is valid. -{{- else }} +{{- else -}} ⚠️ OTEL configuration validation failed with error {{ .Error }} -{{- end }} +{{- end -}} diff --git a/internal/commands/start/start.go b/internal/commands/start/start.go index a19b161cc..17a9308d6 100644 --- a/internal/commands/start/start.go +++ b/internal/commands/start/start.go @@ -41,8 +41,11 @@ func SetupAndGetConfigFiles(ctx context.Context) ([]string, func(), error) { return configFilePaths, cleanup, nil } -func SetupAndGenerateCollectorSettings() (*collector.CollectorSettings, func(), error) { - ctx := logger.WithCtx(context.Background(), logger.Get()) +func DefaultLoggerCtx() context.Context { + return logger.WithCtx(context.Background(), logger.Get()) +} + +func SetupAndGenerateCollectorSettings(ctx context.Context) (*collector.CollectorSettings, func(), error) { configFilePaths, cleanup, err := SetupAndGetConfigFiles(ctx) if err != nil { return nil, cleanup, err @@ -59,7 +62,7 @@ var startCmd = &cobra.Command{ This command reads in the local config and env vars and starts the collector on the current host.`, RunE: func(cmd *cobra.Command, args []string) error { - colSettings, cleanup, err := SetupAndGenerateCollectorSettings() + colSettings, cleanup, err := SetupAndGenerateCollectorSettings(DefaultLoggerCtx()) if err != nil { return err } diff --git a/internal/commands/version/version.go b/internal/commands/version/version.go index e71164421..527dbdf20 100644 --- a/internal/commands/version/version.go +++ b/internal/commands/version/version.go @@ -9,6 +9,7 @@ import ( "github.com/observeinc/observe-agent/build" "github.com/observeinc/observe-agent/internal/root" "github.com/spf13/cobra" + "github.com/spf13/viper" ) // versionCmd represents the version command @@ -18,8 +19,8 @@ var versionCmd = &cobra.Command{ Long: `Display the currently installed version of the observe-agent. This version is based on the package release.`, Run: func(cmd *cobra.Command, args []string) { - version := getVersion() - fmt.Printf("observe-agent version: %s\n", version) + fmt.Printf("observe-agent version: %s\n", getVersion()) + fmt.Printf("observe-agent config file: %s\n", getConfigFile()) }, } @@ -43,3 +44,11 @@ func getVersion() string { } return build.Version } + +func getConfigFile() string { + configFile := viper.ConfigFileUsed() + if configFile == "" { + configFile = "[none]" + } + return configFile +} diff --git a/internal/connections/confighandler.go b/internal/connections/confighandler.go index 88cf65036..6ebeb6c11 100644 --- a/internal/connections/confighandler.go +++ b/internal/connections/confighandler.go @@ -52,7 +52,7 @@ func GetAllOtelConfigFilePaths(ctx context.Context, tmpDir string) ([]string, st } configFilePaths = append(configFilePaths, overridePath) } - logger.FromCtx(ctx).Info(fmt.Sprint("Config file paths:", configFilePaths)) + logger.FromCtx(ctx).Debug(fmt.Sprint("Config file paths:", configFilePaths)) return configFilePaths, overridePath, nil } diff --git a/internal/root/root.go b/internal/root/root.go index 5361ab33c..6eeb847cd 100644 --- a/internal/root/root.go +++ b/internal/root/root.go @@ -55,7 +55,11 @@ func InitConfig() { viper.AutomaticEnv() // read in environment variables that match // If a config file is found, read it in. - if err := viper.ReadInConfig(); err == nil { - fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed()) + if err := viper.ReadInConfig(); err != nil { + if _, ok := err.(viper.ConfigFileNotFoundError); ok { + // Config file not found; ignore this error. + } else { + fmt.Fprintln(os.Stderr, "error reading config file:", err) + } } } diff --git a/main_windows.go b/main_windows.go index cee0ca631..f0c8fb7dc 100644 --- a/main_windows.go +++ b/main_windows.go @@ -27,7 +27,7 @@ func run() error { } root.CfgFile = os.Args[1] root.InitConfig() - colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings() + colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings(start.DefaultLoggerCtx()) if err != nil { return err } diff --git a/packaging/linux/config/observe-agent.yaml b/packaging/linux/config/observe-agent.yaml index 37b2880da..c68c408de 100644 --- a/packaging/linux/config/observe-agent.yaml +++ b/packaging/linux/config/observe-agent.yaml @@ -1,8 +1,8 @@ # Observe data token -token: "${OBSERVE_TOKEN}" +token: "" # Target Observe collection url -observe_url: "${OBSERVE_COLLECTION_ENDPOINT}" +observe_url: "" # Debug mode - Sets agent log level to debug debug: false