From 9ade1cf7aa25482afeed7c5353a719cd44d372b2 Mon Sep 17 00:00:00 2001
From: Matt Cotter <matthew.cotter@observeinc.com>
Date: Thu, 21 Nov 2024 15:21:39 -0600
Subject: [PATCH] feat: add new diagnostic check to validate the agent status
 (#126)

### Description

OB-32327 add new diagnostic check to validate the agent status, update
integration tests to separate config initialization from running the
diagnose check

### Checklist
- [x] Created tests which fail without the change (if possible)
- [ ] Extended the README / documentation, if necessary
---
 integration/README.md                         | 121 ++++-----
 integration/scripts/test_configure.py         | 146 +++-------
 integration/scripts/test_diagnose.py          |  69 +++++
 integration/scripts/test_ec2_connection.py    | 134 +++++----
 integration/scripts/test_install.py           | 146 +++++-----
 integration/scripts/test_start.py             | 228 ++++++++--------
 integration/scripts/test_version.py           | 146 +++++-----
 integration/scripts/utils.py                  | 257 +++++++++++++-----
 integration/tests/integration.tftest.hcl      |  32 ++-
 .../commands/diagnose/agentstatuscheck.go     |  59 ++++
 .../commands/diagnose/agentstatuscheck.tmpl   |   7 +
 internal/commands/diagnose/authcheck.go       |   9 +-
 internal/commands/diagnose/authcheck.tmpl     |  11 +-
 internal/commands/diagnose/configcheck.go     |  12 +-
 internal/commands/diagnose/configcheck.tmpl   |   9 +-
 .../commands/diagnose/configcheck_test.go     |   3 +-
 internal/commands/diagnose/diagnose.go        |  22 +-
 internal/commands/diagnose/otelconfigcheck.go |  13 +-
 .../commands/diagnose/otelconfigcheck.tmpl    |   6 +-
 internal/commands/start/start.go              |   9 +-
 internal/commands/version/version.go          |  13 +-
 internal/connections/confighandler.go         |   2 +-
 internal/root/root.go                         |   8 +-
 main_windows.go                               |   2 +-
 packaging/linux/config/observe-agent.yaml     |   4 +-
 25 files changed, 863 insertions(+), 605 deletions(-)
 create mode 100755 integration/scripts/test_diagnose.py
 create mode 100644 internal/commands/diagnose/agentstatuscheck.go
 create mode 100644 internal/commands/diagnose/agentstatuscheck.tmpl
diff --git a/integration/README.md b/integration/README.md
index b01a54c9c..fcf6a68e0 100644
--- a/integration/README.md
+++ b/integration/README.md
@@ -1,35 +1,35 @@
-## Integration Tests 
-
+# Integration Tests
 
 The root of this module location is intended to run integration tests using the terraform test framework. The tests are located at `integration/tests`
 
-The tests are run using the `terraform test -verbose` command from this folder `observe-agent/integration` 
+The tests are run using the `terraform test -verbose` command from this folder `observe-agent/integration`
 
-When the above command is run, the tests in the `integration/tests` directory are ran using the variables provided. The tests are ran in the order of the run blocks provided in `<test>.tftest.hcl` 
+When the above command is run, the tests in the `integration/tests` directory are run using the variables provided. The tests are run in the order of the run blocks provided in `<test>.tftest.hcl`
 
 Generally a test will do the following for any given EC2 Machine:
+
 - Create a machine using the variables provided below in `us-west-1`
-- Run a test using `observeinc/collection/aws//modules/testing/exec` module to accept python scripts located at `integration/tests/scripts` 
+- Run a test using the `observeinc/collection/aws//modules/testing/exec` module, which accepts python scripts located at `integration/scripts`
 
-### Pre-requisites
+## Pre-requisites
 
-Ensure you have  the following:
-- Built version of the agent ( in `observe-repos/observe-agent/dist`) using `go-releaser` 
+Ensure you have the following:
+
+- Built version of the agent (in `observe-agent/dist`) using `goreleaser`
 - Blunderdome Admin Access in AWS (used to assume the `gh-observe_agent-repo` role for testing )
-- Observe Collection URL & Datastream Token to test with 
-- Generated Private & public key pair ( name to `test_key.pub` & `test_key.pem`) 
+- Observe Collection URL & Datastream Token to test with
+- Generated Private & public key pair (named `test_key.pub` & `test_key.pem`)
 - Terraform provider overide and terraform variables (see below section on how to do this)
 
-
 **Building the agent**:
 
-```
+```sh
 observe-agent git:(nikhil/update-RM) ✗ goreleaser release --snapshot --clean --verbose
 ```
 
 If agent distributable is not built, you may get the following message:
 
-```
+```txt
 │ Error: Test assertion failed
 │ 
 │   on tests/integration.tftest.hcl line 65, in run "test_install":
@@ -40,78 +40,69 @@ If agent distributable is not built, you may get the following message:
 │ Error in Installation Test
 ```
 
-
-**SSH Key Pairs**: 
+**SSH Key Pairs**:
 
 Generate in PEM format for the OpenSSH Key that will be used by Terraform EC2 Modules:
 
-```
+```sh
 ssh-keygen -m PEM
 ```
 
 For more info on generating SSH keys, see [here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-key-pairs.html#how-to-generate-your-own-key-and-import-it-to-aws)
 
-Ensure you have  `./test_key.pub`  and `./test_key.pem` in the `integration` directory. This can also be changed and specifed in the `integration/tests.auto.tfvars` file. 
-
-Ensure the extensions are correct! 
+Ensure you have `./test_key.pub` and `./test_key.pem` in the `integration` directory. This can also be changed and specified in the `integration/tests.auto.tfvars` file.
 
+Ensure the extensions are correct!
 
 **AWS UI Access (optional)**:
 
-For AWS UI access for viewing machines: 
-1. Login to Britive Blunderdome 
+For AWS UI access for viewing machines:
 
-2. Navigate to AWS Console 
+1. Login to Britive Blunderdome
 
-<p align="left">
-  <img src="screenshots/aws-console.png" alt="AWS Console" width="300"/>
-</p>
+2. Navigate to AWS Console
 
-3. Click Switch role and input details for `nikhil-ps-account` which is the member account for integration testing within Blunderdome. Details are the following: 
-<p align="left">
-  <img src="screenshots/member-account.png" alt="Member Account" width="500"/>
-</p>
+   ![AWS Console](screenshots/aws-console.png)
 
-4. You can now access EC2 machines and download key pairs (same as what Github Actions workflow uses)
+3. Click Switch role and input details for `nikhil-ps-account` which is the member account for integration testing within Blunderdome. Details are the following:
 
-5. Ensure you're in `us-west-1` 
+    ![Member Account](screenshots/member-account.png)
 
+4. You can now access EC2 machines and download key pairs (same as what Github Actions workflow uses)
 
+5. Ensure you're in `us-west-1`
 
-### Terraform Variables 
+## Terraform Variables
 
-The tests are run using the following variables. These can be set in the `integration/tests.auto.tfvars` file for local testing. 
+The tests are run using the following variables. These can be set in the `integration/tests.auto.tfvars` file for local testing.
 
-```
-name_format        = "tf-observe-agent-test-%s"
-AWS_MACHINE= "AMAZON_LINUX_2023"  #Choose the AWS Machine to run the tests on 
+```terraform
+name_format      = "tf-observe-agent-test-%s"
+AWS_MACHINE      = "AMAZON_LINUX_2023"  #Choose the AWS Machine to run the tests on 
 PUBLIC_KEY_PATH  = "./test_key.pub" #Path to Public Key for EC2
 PRIVATE_KEY_PATH = "./test_key.pem" #Path to Private Key for EC2
-OBSERVE_URL = "https://<TENANT_ID.collect.observe-staging.com>" #Observe URL to use for testing
-OBSERVE_TOKEN ="<DATASTREAM_TOKEN_TO_TEST_OBSERVE_AGENT_WITH">
-``` 
+OBSERVE_URL      = "https://<TENANT_ID>.collect.observe-staging.com" #Observe URL to use for testing
+OBSERVE_TOKEN    = "<DATASTREAM_TOKEN_TO_TEST_OBSERVE_AGENT_WITH>"
+```
 
-The PUBLIC & PRIVATE key pair can be generated by following the instructions in the "SSH Key Pairs" section above 
+The PUBLIC & PRIVATE key pair can be generated by following the instructions in the "SSH Key Pairs" section above
 
 Example of what the `integrations` folder contains after creating `.*tfvars` file and SSH Key Pair:
 
+![SSH Example](screenshots/ssh-example.png)
 
-<p align="left">
-  <img src="screenshots/ssh-example.png" alt="SSH Example" width="200"/>
-</p>
-
-
-### Terraform Provider
+## Terraform Provider
 
 Note: You must also set the provider correctly. We use the following settings:
+
 - Region: `us-west-1`
 - Profile: `blunderdome`
-- IAM Role Assumed: `gh-observe_agent-repo` 
+- IAM Role Assumed: `gh-observe_agent-repo`
   - The above role has permissions to create and destroy EC2 instances. See `modules/setup_aws_backend/role.tf` for more details.
 
 The provider can be directly set in the `integration/tests/integration.tftest.hcl` as below:
 
-```
+```terraform
 provider "aws" {
   region  = "us-west-1" # Specify the AWS region
   profile = "blunderdome"
@@ -126,19 +117,14 @@ or through a `provider_override.tf` placed in `modules/create_ec2` directory.
 
 Example of this:
 
-<p align="left">
-  <img src="screenshots/provider-example.png" alt="SSH Example" width="200"/>
-</p>
-
-
-
+![Provider Example](screenshots/provider-example.png)
 
 > [!NOTE]  
-> For Terraform to access and assume the role properly, you MUST be logged into Blunderdome Admin in console and have the correct permissions! 
+> For Terraform to access and assume the role properly, you MUST be logged into Blunderdome Admin in console and have the correct permissions!
 
 Example of this:
 
-```
+```sh
 observe git:(master) ✗ s/aws-creds checkout blunderdome
 
 Checked out 'AWS Blunderdome Organization/460044344528 (observe-blunderdome)/BritiveBlunderdome-FullAWSAdmin' into awscli profile 'blunderdome'
@@ -146,14 +132,13 @@ Checked out 'AWS Blunderdome Organization/460044344528 (observe-blunderdome)/Bri
 observe git:(master) ✗ export AWS_PROFILE=blunderdome         
 ```
 
-
-### Local Testing (without terraform test)
+## Local Testing (without terraform test)
 
 Any of the python scripts in the `/scripts` directory can be tested by running them directly, granted an EC2 Machine exists. As the scripts rely on the outputs of `create_ec2` and `setup_observe_variables` modules to be passed in as environment variables, these environment variables can be manually set if the set up modules are not ran.
 
-The `/scripts/<test_xyz.py` expects the following environment variables to be set:
+The `/scripts/<test_xyz>.py` expects the following environment variables to be set:
 
-```
+```sh
 HOST="54.177.249.99" #HOST IP Address 
 USER="ubuntu" #HOST user to login as 
 KEY_FILENAME="./test_key.pem" #Private path to key 
@@ -162,19 +147,20 @@ MACHINE_CONFIG="ami_description:Ubuntu Server 22.04 LTS (HVM)- EBS General Purpo
 OBSERVE_URL="" #Observe URL to use for testing
 OBSERVE_TOKEN="" #Observe Token to use for testing
 PASSWORD="WindowsPassword to be used for testing" # Set to None for testing 
-
 ```
 
 Run the scripts from the folder as below:
-```
+
+```sh
 ➜  integration git:(nikhil/integration-testing-windows) ✗ pwd
 /Users/nikhil.dua/Documents/observe-repos/observe-agent/integration
-➜  integration git:(nikhil/integration-testing-windows) ✗ python3 scripts/test_installation.py
+➜  integration git:(nikhil/integration-testing-windows) ✗ python3 scripts/test_install.py
 ```
 
-Note: If testing Windows machines, the RDP password is redacted by default in the python scripts. 
+Note: If testing Windows machines, the RDP password is redacted by default in the python scripts.
 This can be turned off when disabling mask by setting below environment variable to `False` before running these scripts
-```
+
+```sh
 export MASK=False
 python3 scripts/test_ec2_connection.py
 ------------------------------
@@ -185,9 +171,6 @@ Env vars set to:
 Testing SSH connection to host 54.177.26.178 with timeout 120s
 ```
 
-### Architecture
+## Architecture
 
 The architecture diagram can be found ![here](screenshots/Observe-Agent.png)
-
-
-  
\ No newline at end of file
diff --git a/integration/scripts/test_configure.py b/integration/scripts/test_configure.py
index 3d1be31f0..7fff2ec36 100755
--- a/integration/scripts/test_configure.py
+++ b/integration/scripts/test_configure.py
@@ -1,128 +1,66 @@
 #!/usr/bin/env python3
-import os
-import sys
-import re
-import time 
 import utils as u
 
-@u.print_test_decorator
-def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:  
-
-    """
-    Test to validate connection of observe-agent to Observe 
 
-    Args:
-        remote_host (Host): instance to ssh into 
-        env_vars (dict): environment variables passed into for testing
+@u.print_test_decorator
+def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:
+    init_command = r'Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent init-config --token {} --observe_url {}'.format(
+        env_vars["observe_token"], env_vars["observe_url"]
+    )
 
-    Raises:
-        ValueError: Something failed with initial config or observe-agent -> observe connection 
-    """
- 
-    init_command='Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent init-config --token {} --observe_url {}'.format(env_vars["observe_token"], env_vars["observe_url"])
-    diagnose_command='Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent diagnose'
-    
-    #Set up correct config with observe url and token 
+    # Set up correct config with observe url and token
     result = remote_host.run_command(init_command)
+    if result.exited != 0 or result.stderr:
+        u.print_remote_result(result)
+        raise ValueError("❌ Error in init-config")
 
-    #Check diagnose command
-    result = remote_host.run_command(diagnose_command)
-    observe_val = False
-    for line in result.stdout.splitlines():      
-        if "Request to test URL responded with response code 200" in line:
-            print (" ✅ observe-agent -> observe validation passed! ")
-            observe_val = True
-            break        
-    if not observe_val:
-        print(result)
-        raise ValueError(f"❌ Failed: observe-agent -> observe validation")
-    
-    pass   
 
 @u.print_test_decorator
-def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:  
-    docker_prefix='sudo docker run \
-        --mount type=bind,source=/proc,target=/hostfs/proc,readonly \
-        --mount type=bind,source=/snap,target=/hostfs/snap,readonly \
-        --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \
-        --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \
-        --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \
-        --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \
-        --pid host \
-        $(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep SNAPSHOT)'
-     
-    init_command='init-config --token {} --observe_url {}'.format(env_vars["observe_token"], env_vars["observe_url"])
-    diagnose_command='diagnose'
-
-    #Set up correct config with observe url and token 
-    result = remote_host.run_command(docker_prefix + ' ' + init_command)
-
-    #Check diagnose command
-    result = remote_host.run_command(docker_prefix + ' ' + diagnose_command)
-    observe_val = False
-    for line in result.stdout.splitlines():      
-        if "Request to test URL responded with response code 200" in line:
-            print (" ✅ observe-agent -> observe validation passed! ")
-            observe_val = True
-            break        
-    if not observe_val:
-        print(result)
-        raise ValueError(f"❌ Failed: observe-agent -> observe validation")
+def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
+    docker_prefix = u.get_docker_prefix(remote_host, False)
+    init_command = "{} init-config --token {} --observe_url {}".format(
+        docker_prefix, env_vars["observe_token"], env_vars["observe_url"]
+    )
 
+    # Set up correct config with observe url and token
+    result = remote_host.run_command(init_command)
+    if result.exited != 0 or result.stderr:
+        u.print_remote_result(result)
+        raise ValueError("❌ Error in init-config")
 
-    pass
 
 @u.print_test_decorator
-def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:    
-
-    """
-    Test to validate connection of observe-agent to Observe 
+def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:
+    init_command = "sudo observe-agent init-config --token {} --observe_url {}".format(
+        env_vars["observe_token"], env_vars["observe_url"]
+    )
 
-    Args:
-        remote_host (Host): instance to ssh into 
-        env_vars (dict): environment variables passed into for testing
-
-    Raises:
-        ValueError: Something failed with initial config or observe-agent -> observe connection 
-    """
-
-    init_command='sudo observe-agent init-config --token {} --observe_url {}'.format(env_vars["observe_token"], env_vars["observe_url"])
-    diagnose_command='observe-agent diagnose'
-
-    #Set up correct config with observe url and token 
+    # Set up correct config with observe url and token
     result = remote_host.run_command(init_command)
+    if result.exited != 0 or result.stderr:
+        u.print_remote_result(result)
+        raise ValueError("❌ Error in init-config")
 
-    #Check diagnose command
-    result = remote_host.run_command(diagnose_command)
-    observe_val = False
-    for line in result.stdout.splitlines():      
-        if "Request to test URL responded with response code 200" in line:
-            print (" ✅ observe-agent -> observe validation passed! ")
-            observe_val = True
-            break        
-    if not observe_val:
-        print(result)
-        raise ValueError(f"❌ Failed: observe-agent -> observe validation")
-   
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     env_vars = u.get_env_vars(need_observe=True)
-    remote_host = u.Host(host_ip=env_vars["host"],
-                       username=env_vars["user"],
-                       key_file_path=env_vars["key_filename"],
-                       password=env_vars["password"])    
-    
-    #Test SSH Connection before starting test of interest 
-    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))   
-
-    if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]:
+    remote_host = u.Host(
+        host_ip=env_vars["host"],
+        username=env_vars["user"],
+        key_file_path=env_vars["key_filename"],
+        password=env_vars["password"],
+    )
+
+    # Test SSH Connection before starting test of interest
+    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))
+
+    if (
+        "redhat" in env_vars["machine_config"]["distribution"]
+        or "debian" in env_vars["machine_config"]["distribution"]
+    ):
         run_test_linux(remote_host, env_vars)
     elif "windows" in env_vars["machine_config"]["distribution"]:
         run_test_windows(remote_host, env_vars)
     elif "docker" in env_vars["machine_config"]["distribution"]:
         run_test_docker(remote_host, env_vars)
-
-    pass 
-
-
diff --git a/integration/scripts/test_diagnose.py b/integration/scripts/test_diagnose.py
new file mode 100755
index 000000000..3abe96724
--- /dev/null
+++ b/integration/scripts/test_diagnose.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+from fabric import Result
+
+import re
+import utils as u
+
+
+def _check_diagnose_result(result: Result) -> bool:
+    passed = re.search(r"All \d+ checks passed", result.stdout) is not None
+    if passed:
+        print(" ✅ observe-agent -> observe validation passed! ")
+    else:
+        u.print_remote_result(result)
+        raise ValueError(
+            f"❌ Failed: observe-agent -> observe validation (regex on diagnose output did not match)"
+        )
+
+
+@u.print_test_decorator
+def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:
+    diagnose_command = r'Set-Location "C:\Program Files\Observe\observe-agent"; ./observe-agent diagnose'
+
+    # Check diagnose command
+    result = remote_host.run_command(diagnose_command)
+    _check_diagnose_result(result)
+
+
+@u.print_test_decorator
+def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
+    container_id = u.get_docker_container(remote_host)
+    exec_prefix = f"sudo docker exec {container_id} ./observe-agent"
+    diagnose_command = exec_prefix + " diagnose"
+
+    # Check diagnose command
+    result = remote_host.run_command(diagnose_command)
+    _check_diagnose_result(result)
+
+
+@u.print_test_decorator
+def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:
+    diagnose_command = "observe-agent diagnose"
+
+    # Check diagnose command
+    result = remote_host.run_command(diagnose_command)
+    _check_diagnose_result(result)
+
+
+if __name__ == "__main__":
+
+    env_vars = u.get_env_vars()
+    remote_host = u.Host(
+        host_ip=env_vars["host"],
+        username=env_vars["user"],
+        key_file_path=env_vars["key_filename"],
+        password=env_vars["password"],
+    )
+
+    # Test SSH Connection before starting test of interest
+    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))
+
+    if (
+        "redhat" in env_vars["machine_config"]["distribution"]
+        or "debian" in env_vars["machine_config"]["distribution"]
+    ):
+        run_test_linux(remote_host, env_vars)
+    elif "windows" in env_vars["machine_config"]["distribution"]:
+        run_test_windows(remote_host, env_vars)
+    elif "docker" in env_vars["machine_config"]["distribution"]:
+        run_test_docker(remote_host, env_vars)
diff --git a/integration/scripts/test_ec2_connection.py b/integration/scripts/test_ec2_connection.py
index 7670b3364..e03592506 100755
--- a/integration/scripts/test_ec2_connection.py
+++ b/integration/scripts/test_ec2_connection.py
@@ -4,77 +4,88 @@
 import os
 import sys
 import re
-import time 
+import time
 import utils as u
 
-    
-@u.print_test_decorator
-def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:  
 
+@u.print_test_decorator
+def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:
     """
-    This test validates that the UserdataExecution.log finished successfully 
+    This test validates that the UserdataExecution.log finished successfully
     and ec2 instance is in stable state prior to running other
 
 
     Args:
-        remote_host (Host): instance to ssh into 
+        remote_host (Host): instance to ssh into
         env_vars (dict): environment variables passed into for testing
 
     Raises:
         RuntimeError: Failed to verify UserdataExecution.log or agent.logfile
     """
-    
-  
+
     tmp_file = "/tmp/UserdataExecution.log"
-    cloud_init_file_timeout = 240 # 4 minutes    
-    
-    if "2022" in env_vars["machine_name"]: #Windows 2022 -  Test windows cloud-init file finished successfully
+    cloud_init_file_timeout = 240  # 4 minutes
+
+    if (
+        "2022" in env_vars["machine_name"]
+    ):  # Windows 2022 -  Test windows cloud-init file finished successfully
         print("Windows 2022 detected")
-        cloud_init_file = r'/C:/ProgramData/Amazon/EC2Launch/log/agent.log'
-      
-        for _ in range(cloud_init_file_timeout):        
-            remote_host.get_file(cloud_init_file, tmp_file) # This command will automatically test connection 
-            with open(tmp_file) as file: #No encoding for windows 2022 needed 
+        cloud_init_file = r"/C:/ProgramData/Amazon/EC2Launch/log/agent.log"
+
+        for _ in range(cloud_init_file_timeout):
+            remote_host.get_file(
+                cloud_init_file, tmp_file
+            )  # This command will automatically test connection
+            with open(tmp_file) as file:  # No encoding for windows 2022 needed
                 content = file.read().lower()
-                if "script execution finished successfully"  in content:
+                if "script execution finished successfully" in content:
                     print(" ✅ Verified agent.log had completed successfully!")
-                    return 
+                    return
                 else:
                     print(" Looking for the agent.log file to finish completing...")
-            time.sleep(1)        
-        raise RuntimeError("❌ The agent.log file did not finish successfully in time")  
-    else: # Windows 2016/2019 -   Test windows cloud-init file finished successfully
+            time.sleep(1)
+        raise RuntimeError("❌ The agent.log file did not finish successfully in time")
+    else:  # Windows 2016/2019 -   Test windows cloud-init file finished successfully
         print("Windows 2016 or 2019 detected")
-        cloud_init_file = r'/C:/ProgramData/Amazon/EC2-Windows/Launch/Log/UserdataExecution.log'
-
-        for _ in range(cloud_init_file_timeout):        
-            remote_host.get_file(cloud_init_file, tmp_file) # This command will automatically test connection 
+        cloud_init_file = (
+            r"/C:/ProgramData/Amazon/EC2-Windows/Launch/Log/UserdataExecution.log"
+        )
+
+        for _ in range(cloud_init_file_timeout):
+            remote_host.get_file(
+                cloud_init_file, tmp_file
+            )  # This command will automatically test connection
             with open(tmp_file, encoding="utf-16") as file:
                 content = file.read().lower()
-                if "user data script completed"  in content:
+                if "user data script completed" in content:
                     print(" ✅ Verified UserdataExecution had completed successfully!")
-                    return 
+                    return
                 else:
-                    print(" Looking for the UserdataExecution.log file to finish completing...")
-            time.sleep(1)        
-        raise RuntimeError("❌ The UserdataExecution file did not finish successfully in time")  
+                    print(
+                        " Looking for the UserdataExecution.log file to finish completing..."
+                    )
+            time.sleep(1)
+        raise RuntimeError(
+            "❌ The UserdataExecution file did not finish successfully in time"
+        )
+
 
 @u.print_test_decorator
-def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:  
-    #Since our test is being done on a linux EC2, we can just check it initializes and runs similar to linux test
+def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
+    # Since our test is being done on a linux EC2, we can just check it initializes and runs similar to linux test
     run_test_linux(remote_host, env_vars)
-    pass 
-    
+    pass
+
 
 @u.print_test_decorator
-def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:    
+def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:
     """
-    This test validates that the cloud-init file finished successfully 
+    This test validates that the cloud-init file finished successfully
     and ec2 instance is in stable state prior to running other
 
 
     Args:
-        remote_host (Host): instance to ssh into 
+        remote_host (Host): instance to ssh into
         env_vars (dict): environment variables passed into for testing
 
     Raises:
@@ -83,34 +94,41 @@ def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:
 
     cloud_init_file = "/var/log/cloud-init-output.log"
     tmp_file = "/tmp/cloud-init-output.log"
-    cloud_init_file_timeout = 240 # 4 minutes
+    cloud_init_file_timeout = 240  # 4 minutes
 
-    #Test cloud-init file finished successfully
-    for _ in range(cloud_init_file_timeout):        
-        remote_host.get_file(cloud_init_file, tmp_file) # This command will automatically test connection 
+    # Test cloud-init file finished successfully
+    for _ in range(cloud_init_file_timeout):
+        remote_host.get_file(
+            cloud_init_file, tmp_file
+        )  # This command will automatically test connection
         with open(tmp_file, "r") as file:
             content = file.read().lower()
-            if "finished at"  in content:
+            if "finished at" in content:
                 print(" ✅ Verified cloud-init file had completed successfully!")
-                return 
+                return
             else:
-               print(" Looking for the cloud-init file to finish completing...")
-        time.sleep(1)        
-    raise RuntimeError("❌ The cloud-init file did not finish successfully in time")  
+                print(" Looking for the cloud-init file to finish completing...")
+        time.sleep(1)
+    raise RuntimeError("❌ The cloud-init file did not finish successfully in time")
+
+
+if __name__ == "__main__":
 
-if __name__ == '__main__':
-    
     env_vars = u.get_env_vars()
-    remote_host = u.Host(host_ip=env_vars["host"],
-                       username=env_vars["user"],
-                       key_file_path=env_vars["key_filename"],
-                       password=env_vars["password"])    
-    
-    #Test SSH Connection before starting test of interest 
-    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))   
-    
-
-    if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]:
+    remote_host = u.Host(
+        host_ip=env_vars["host"],
+        username=env_vars["user"],
+        key_file_path=env_vars["key_filename"],
+        password=env_vars["password"],
+    )
+
+    # Test SSH Connection before starting test of interest
+    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))
+
+    if (
+        "redhat" in env_vars["machine_config"]["distribution"]
+        or "debian" in env_vars["machine_config"]["distribution"]
+    ):
         run_test_linux(remote_host, env_vars)
     elif "windows" in env_vars["machine_config"]["distribution"]:
         run_test_windows(remote_host, env_vars)
diff --git a/integration/scripts/test_install.py b/integration/scripts/test_install.py
index 9a622c6a6..503f6a5a8 100755
--- a/integration/scripts/test_install.py
+++ b/integration/scripts/test_install.py
@@ -4,8 +4,8 @@
 import os
 import sys
 import re
-import time 
-import inspect 
+import time
+import inspect
 import utils as u
 
 
@@ -23,18 +23,20 @@ def _get_installation_package(env_vars: dict) -> tuple:
 
     """
     current_dir = os.getcwd()
-    dist_directory = os.path.abspath(os.path.join(current_dir, '..',  'dist'))
+    dist_directory = os.path.abspath(os.path.join(current_dir, "..", "dist"))
     print(f"Path to 'dist' directory: {dist_directory}")
 
     # List files in the directory
-    files = os.listdir(dist_directory)   
+    files = os.listdir(dist_directory)
 
     # Search criteria
     package_type = env_vars["machine_config"]["package_type"]
     architecture = env_vars["machine_config"]["architecture"]
     distribution = env_vars["machine_config"]["distribution"]
 
-    print(f"Looking for installation package '{package_type}' and architecture '{architecture}'")
+    print(
+        f"Looking for installation package '{package_type}' and architecture '{architecture}'"
+    )
 
     # Iterate through files and find matches
     for filename in files:
@@ -45,60 +47,76 @@ def _get_installation_package(env_vars: dict) -> tuple:
             full_path = os.path.join(dist_directory, filename)
             print(f"Found matching file {filename} at: {full_path}")
             return filename, full_path
-    u.die(f"❌ No matching file found for {distribution},{architecture},{package_type} in {dist_directory}: {', '.join(files)}")
+    u.die(
+        f"❌ No matching file found for {distribution},{architecture},{package_type} in {dist_directory}: {', '.join(files)}"
+    )
 
 
 @u.print_test_decorator
-def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:  
-
+def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:
     """
-    Test to install local observe-agent on a windows ec2 instance and validate command ran successfully 
+    Test to install local observe-agent on a windows ec2 instance and validate command ran successfully
 
     Args:
-        remote_host (Host): instance to ssh into 
+        remote_host (Host): instance to ssh into
         env_vars (dict): environment variables passed into for testing
 
     Raises:
         RuntimeError: Installation error in powershell script
     """
-    # Get built dist. installation package path for machine 
-    filename, full_path = _get_installation_package(env_vars)    
-
-    # Set windows home dir paths for consistency 
-    home_dir = r"/C:/Users/{}".format(env_vars["user"]) #for user in sftp 
-    home_dir_powershell = r"C:\Users\{}".format(env_vars["user"]) #for use in powershell script 
-    
-    # Find agent installation script path 
-    current_script_dir = os.path.dirname(os.path.abspath(__file__))   
-    ps_installation_script_path = os.path.join(current_script_dir, 'install_windows.ps1')
-
-
-    # Copy built distribution package to remote host home dir 
-    remote_host.put_file(local_path=full_path, remote_path=home_dir) #Eg: sftp to /C:/Users/Adminstrator/observe-agent_Windows_x86_64.zip
-
-    # Copy observe-agent powershell installation script to remote host home dir 
-    remote_host.put_file(local_path=ps_installation_script_path, remote_path=home_dir) #Eg: sftp to /C:/Users/Adminstrator/install_windows.ps1
+    # Get built dist. installation package path for machine
+    filename, full_path = _get_installation_package(env_vars)
+
+    # Set windows home dir paths for consistency
+    home_dir = r"/C:/Users/{}".format(env_vars["user"])  # for use in sftp
+    home_dir_powershell = r"C:\Users\{}".format(
+        env_vars["user"]
+    )  # for use in powershell script
+
+    # Find agent installation script path
+    current_script_dir = os.path.dirname(os.path.abspath(__file__))
+    ps_installation_script_path = os.path.join(
+        current_script_dir, "install_windows.ps1"
+    )
+
+    # Copy built distribution package to remote host home dir
+    remote_host.put_file(
+        local_path=full_path, remote_path=home_dir
+    )  # E.g. sftp to /C:/Users/Administrator/observe-agent_Windows_x86_64.zip
+
+    # Copy observe-agent powershell installation script to remote host home dir
+    remote_host.put_file(
+        local_path=ps_installation_script_path, remote_path=home_dir
+    )  # E.g. sftp to /C:/Users/Administrator/install_windows.ps1
 
     # Run install script and pass in distribution package path
     # Eg: .\install_windows.ps1 -local_installer C:\Users\Adminstrator\observe-agent_Windows_x86_64.zip
-    # observe-agent gets installed to C:\Program Files\observe-agent on ec2 machine 
-    result = remote_host.run_command('.\install_windows.ps1 -local_installer {}\{}'.format(home_dir_powershell, filename))
+    # observe-agent gets installed to C:\Program Files\observe-agent on ec2 machine
+    result = remote_host.run_command(
+        r".\install_windows.ps1 -local_installer {}\{}".format(
+            home_dir_powershell, filename
+        )
+    )
     print(result)
-    
-    if result.stderr: #Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr  
-        raise RuntimeError("❌ Installation error in install_windows.ps1 powershell script")  
-    else:        
+
+    if (
+        result.stderr
+    ):  # Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr
+        raise RuntimeError(
+            "❌ Installation error in install_windows.ps1 powershell script"
+        )
+    else:
         print("✅ Installation test passed")
-        
+
 
 @u.print_test_decorator
-def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:  
+def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
 
-    filename, full_path= _get_installation_package(env_vars)
+    filename, full_path = _get_installation_package(env_vars)
     home_dir = "/home/{}".format(env_vars["user"])
 
     remote_host.put_file(full_path, home_dir)
-    result = remote_host.run_command('sudo docker load --input {}'.format(filename))
+    result = remote_host.run_command("sudo docker load --input {}".format(filename))
     if result.stderr:
         print(result)
         raise RuntimeError("❌ Installation error in docker load")
@@ -107,51 +125,53 @@ def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
 
 
 @u.print_test_decorator
-def run_test_linux(remote_host: u.Host, env_vars: dict):       
+def run_test_linux(remote_host: u.Host, env_vars: dict):
     """
-    Test to install local observe-agent on a linux ec2 instance and validate command ran successfully 
+    Test to install local observe-agent on a linux ec2 instance and validate command ran successfully
 
     Args:
-        remote_host (Host): instance to ssh into 
+        remote_host (Host): instance to ssh into
         env_vars (dict): environment variables passed into for testing
 
     Raises:
-        RuntimeError: Unknown distribution type passed  
+        RuntimeError: Unknown distribution type passed
     """
-    filename, full_path= _get_installation_package(env_vars)
+    filename, full_path = _get_installation_package(env_vars)
     home_dir = "/home/{}".format(env_vars["user"])
 
     remote_host.put_file(full_path, home_dir)
     if "redhat" in env_vars["machine_config"]["distribution"]:
-        result = remote_host.run_command('cd ~ && sudo yum localinstall {} -y'.format(filename))
-    elif "debian" in env_vars["machine_config"]["distribution"] :
-        result = remote_host.run_command('cd ~ && sudo dpkg -i {}'.format(filename))
+        result = remote_host.run_command(
+            "cd ~ && sudo yum localinstall {} -y".format(filename)
+        )
+    elif "debian" in env_vars["machine_config"]["distribution"]:
+        result = remote_host.run_command("cd ~ && sudo dpkg -i {}".format(filename))
     else:
-        raise RuntimeError("❌ Unknown distribution type")  
-    
-    print(result)    
+        raise RuntimeError("❌ Unknown distribution type")
+
+    print(result)
     print("✅ Installation test passed")
 
 
+if __name__ == "__main__":
 
-if __name__ == '__main__':
-    
     env_vars = u.get_env_vars()
-    remote_host = u.Host(host_ip=env_vars["host"],
-                       username=env_vars["user"],
-                       key_file_path=env_vars["key_filename"],
-                       password=env_vars["password"])    
-    
-    #Test SSH Connection before starting test of interest 
-    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))   
-
-    if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]:
+    remote_host = u.Host(
+        host_ip=env_vars["host"],
+        username=env_vars["user"],
+        key_file_path=env_vars["key_filename"],
+        password=env_vars["password"],
+    )
+
+    # Test SSH Connection before starting test of interest
+    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))
+
+    if (
+        "redhat" in env_vars["machine_config"]["distribution"]
+        or "debian" in env_vars["machine_config"]["distribution"]
+    ):
         run_test_linux(remote_host, env_vars)
     elif "windows" in env_vars["machine_config"]["distribution"]:
         run_test_windows(remote_host, env_vars)
     elif "docker" in env_vars["machine_config"]["distribution"]:
         run_test_docker(remote_host, env_vars)
-
-
-
-
diff --git a/integration/scripts/test_start.py b/integration/scripts/test_start.py
index 6f437a46f..cf7aab88f 100755
--- a/integration/scripts/test_start.py
+++ b/integration/scripts/test_start.py
@@ -2,161 +2,165 @@
 import os
 import sys
 import re
-import time 
+import time
 import pprint
 import utils as u
 from collections import defaultdict
 
-def _check_status_loop(remote_host: u.Host, start_timeout: int, status_command: str) -> bool:
+
+def _check_status_loop(
+    remote_host: u.Host, start_timeout: int, status_command: str
+) -> bool:
     """Run Check Status Command in a loop to wait for observe-agent to start
 
     Args:
-        remote_host (Host): instance to ssh into 
-        start_timeout (int): timeout in seconds to wait for agent to start 
-        status_command (str): windows/linux status command to run 
+        remote_host (Host): instance to ssh into
+        start_timeout (int): timeout in seconds to wait for agent to start
+        status_command (str): windows/linux status command to run
 
     Returns:
         bool: agent_status
     """
-    
-   
-    agent_status=False
-    for _ in range(start_timeout):       
-            metrics_dict = defaultdict(list)
+
+    agent_status = False
+    for _ in range(start_timeout):
+        metrics_dict = defaultdict(list)
+        try:
             result = remote_host.run_command(status_command)
-            for line in result.stdout.splitlines():      
-                if ":" in line:
-                    metric, value = line.split(":", 1)
-                    metric = metric.strip()
-                    value = value.strip()                    
-                    metrics_dict[metric].append(value)
-                print(line)        
-            if metrics_dict["Status"] and metrics_dict["Status"][0] == "Running": 
-                print("✅ Observe Agent is active and running without errors!")
-                agent_status=True
-                break     
-            else:
-                print("❌ Observe Agent is not running. Retry Count is {}/{}...".format(_+1, start_timeout))
-                time.sleep(1)
+        except Exception as e:
+            print("Ignoring exception: ", e)
+            time.sleep(1)
+            continue
+        for line in result.stdout.splitlines():
+            if ":" in line:
+                metric, value = line.split(":", 1)
+                metric = metric.strip()
+                value = value.strip()
+                metrics_dict[metric].append(value)
+            print(line)
+        if metrics_dict["Status"] and metrics_dict["Status"][0] == "Running":
+            print("✅ Observe Agent is active and running without errors!")
+            agent_status = True
+            break
+        print(
+            "❌ Observe Agent is not running. Retry Count is {}/{}...".format(
+                _ + 1, start_timeout
+            )
+        )
+        time.sleep(1)
     return agent_status
-    
+
 
 @u.print_test_decorator
-def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:  
+def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:
     """
-    Test to check if observe-agent is running correctly 
+    Test to check if observe-agent is running correctly
 
     Args:
-        remote_host (Host): instance to ssh into 
-        env_vars (dict): environment variables passed into for testing 
-
-    """ 
-
-    #status
-    start_command=r'.\start_agent_windows.ps1'
-    status_command=r'Get-Service ObserveAgent;Set-Location "${Env:Programfiles}\Observe\observe-agent"; ./observe-agent status'
-    start_timeout = 30 #how long to wait for observe-agent to start
-
-    # Get windows home dir paths for consistency 
-    home_dir = r"/C:/Users/{}".format(env_vars["user"]) #for user in sftp 
-    
-    # Find start agent script path  
-    current_script_dir = os.path.dirname(os.path.abspath(__file__))  
-    ps_installation_script_path = os.path.join(current_script_dir, 'start_agent_windows.ps1')
-
-    #Copy start_agent powershell installation script to remote host home dir 
-    remote_host.put_file(local_path=ps_installation_script_path, remote_path=home_dir) #Eg: sftp to /C:/Users/Adminstrator/install_windows.ps1
-    # Run start_agent script 
+        remote_host (Host): instance to ssh into
+        env_vars (dict): environment variables passed into for testing
+
+    """
+
+    # Start/status commands and start-up timeout
+    start_command = r".\start_agent_windows.ps1"
+    status_command = r'Get-Service ObserveAgent;Set-Location "${Env:Programfiles}\Observe\observe-agent"; ./observe-agent status'
+    start_timeout = 30  # how long to wait for observe-agent to start
+
+    # Get windows home dir paths for consistency
+    home_dir = r"/C:/Users/{}".format(env_vars["user"])  # for use in sftp
+
+    # Find start agent script path
+    current_script_dir = os.path.dirname(os.path.abspath(__file__))
+    ps_installation_script_path = os.path.join(
+        current_script_dir, "start_agent_windows.ps1"
+    )
+
+    # Copy the start_agent PowerShell script to remote host home dir
+    remote_host.put_file(
+        local_path=ps_installation_script_path, remote_path=home_dir
+    )  # E.g. sftp to /C:/Users/Administrator/start_agent_windows.ps1
+    # Run start_agent script
     result = remote_host.run_command(start_command)
     print(result)
 
-    if result.stderr: #Powershell script failure does not cause command failure as the installation command succeeds so we need to check the stderr  
-        raise RuntimeError("❌ Error in start_agent_windows.ps1 powershell script")  
-    
-    #Check Agent Status 
-    agent_status=_check_status_loop(remote_host, start_timeout, status_command)
+    if (
+        result.stderr
+    ):  # A failure inside the PowerShell script does not fail the remote command itself, so we need to check stderr
+        raise RuntimeError("❌ Error in start_agent_windows.ps1 powershell script")
+
+    # Check Agent Status
+    agent_status = _check_status_loop(remote_host, start_timeout, status_command)
     if not agent_status:
-            u.die("❌ Error in Observe Agent Status Test ")
+        u.die("❌ Error in Observe Agent Status Test ")
+
 
 @u.print_test_decorator
-def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:  
-
-   docker_prefix='sudo docker run -d --restart always \
-        --mount type=bind,source=/proc,target=/hostfs/proc,readonly \
-        --mount type=bind,source=/snap,target=/hostfs/snap,readonly \
-        --mount type=bind,source=/boot,target=/hostfs/boot,readonly \
-        --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \
-        --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \
-        --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \
-        --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \
-        --pid host \
-        $(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep SNAPSHOT)'
-   start_command='start'
-   get_container_command = (
-    "sudo docker ps --filter \"status=running\" --format \"{{.ID}} {{.Image}} {{.CreatedAt}}\" | "
-    "grep \"SNAPSHOT\" | sort -k3 -r | head -n 1 | awk '{print $1}'"
-    )  
-   start_timeout = 30 #how long to wait for observe-agent to start   
-
-   #Start Observe Agent 
-   result = remote_host.run_command(docker_prefix + ' ' + start_command)  
-   if result.stderr:
+def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
+    docker_prefix = u.get_docker_prefix(remote_host, True)
+    start_command = "start"
+    start_timeout = 30  # how long to wait for observe-agent to start
+
+    # Start Observe Agent
+    result = remote_host.run_command(docker_prefix + " " + start_command)
+    if result.stderr:
         u.die("❌ Error starting observe-agent container")
+    else:
+        print("✅ Observe Agent started successfully: " + result.stdout)
 
-   #Get Observe Agent Container ID
-   container_id = remote_host.run_command(get_container_command)
-   status_command='sudo docker exec {} ./observe-agent status'.format(container_id.stdout.strip()) 
-   if not container_id:
-        u.die("❌ Error in finding observe-agent container")
+    # Get Observe Agent Container ID
+    container_id = u.get_docker_container(remote_host)
+    status_command = f"sudo docker exec {container_id} ./observe-agent status"
 
-   #Check Agent Status 
-   agent_status=_check_status_loop(remote_host, start_timeout, status_command)
-   if not agent_status:
+    # Check Agent Status
+    agent_status = _check_status_loop(remote_host, start_timeout, status_command)
+    if not agent_status:
         u.die("❌ Error in Observe Agent Status Test ")
 
 
 @u.print_test_decorator
-def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:    
-
-   """
-    Test to check if observe-agent is running correctly 
+def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:
+    """
+    Test to check if observe-agent is running correctly
 
     Args:
-        remote_host (Host): instance to ssh into 
-        env_vars (dict): environment variables passed into for testing 
+        remote_host (Host): instance to ssh into
+        env_vars (dict): environment variables passed into for testing
 
-    """ 
+    """
 
-   start_command='sudo systemctl enable --now observe-agent'
-   status_command='observe-agent status'
-   start_timeout = 30 #how long to wait for observe-agent to start
+    start_command = "sudo systemctl enable --now observe-agent"
+    status_command = "observe-agent status"
+    start_timeout = 30  # how long to wait for observe-agent to start
 
+    # Start Observe Agent
+    remote_host.run_command(start_command)
 
-   #Start Observe Agent 
-   remote_host.run_command(start_command)
-   
-   #Check Agent Status 
-   agent_status=_check_status_loop(remote_host, start_timeout, status_command)
-   if not agent_status:
+    # Check Agent Status
+    agent_status = _check_status_loop(remote_host, start_timeout, status_command)
+    if not agent_status:
         u.die("❌ Error in Observe Agent Status Test ")
-        
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
 
     env_vars = u.get_env_vars()
-    remote_host = u.Host(host_ip=env_vars["host"],
-                       username=env_vars["user"],
-                       key_file_path=env_vars["key_filename"],
-                       password=env_vars["password"])    
-    
-    #Test SSH Connection before starting test of interest 
-    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))   
-
-    if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]:
+    remote_host = u.Host(
+        host_ip=env_vars["host"],
+        username=env_vars["user"],
+        key_file_path=env_vars["key_filename"],
+        password=env_vars["password"],
+    )
+
+    # Test SSH Connection before starting test of interest
+    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))
+
+    if (
+        "redhat" in env_vars["machine_config"]["distribution"]
+        or "debian" in env_vars["machine_config"]["distribution"]
+    ):
         run_test_linux(remote_host, env_vars)
     elif "windows" in env_vars["machine_config"]["distribution"]:
         run_test_windows(remote_host, env_vars)
     elif "docker" in env_vars["machine_config"]["distribution"]:
         run_test_docker(remote_host, env_vars)
-
-
diff --git a/integration/scripts/test_version.py b/integration/scripts/test_version.py
index c8d44a59c..962f97806 100755
--- a/integration/scripts/test_version.py
+++ b/integration/scripts/test_version.py
@@ -1,142 +1,138 @@
 #!/usr/bin/env python3
 import os
-import sys
 import re
-import time 
 import utils as u
 
+
 def _extract_version_config(result: any) -> tuple:
-    """Extract version name and config file from ssh result output 
+    """Extract version name and config file from ssh result output
 
     Args:
         result (any): ssh result output
 
     Returns:
-        tuple: config_file, version of the installed observe-agent package 
+        tuple: config_file, version of the installed observe-agent package
     """
- 
-     # Split the output by newlines and extract everything after the colon
-    for line in result.stdout.splitlines():      
-        if ":" in line:
-            _, version = line.split(":", 1)
-            version = version.strip()  # Remove leading/trailing whitespace
-        print(f"Version: {version}")
-    for line in result.stderr.splitlines():      
-        if ":" in line:
-            _, config_file = line.split(":", 1)
-            config_file = config_file.strip()  # Remove leading/trailing whitespace
-        print(f"Config File: {config_file}")
+
+    # Pull the values that follow the "version:" and "config file:" labels out of stdout
+    version_match = re.search(r"version: (.*)(?:\n|$)", result.stdout)
+    if version_match is not None:
+        version = version_match.group(1).strip()
+    else:
+        raise ValueError(
+            f"❌ Failed: observe-agent version output did not match regex. Output: {result.stdout}"
+        )
+
+    config_match = re.search(r"config file: (.*)(?:\n|$)", result.stdout)
+    if config_match is not None:
+        config_file = config_match.group(1).strip()
+    else:
+        raise ValueError(
+            f"❌ Failed: observe-agent version output did not match regex. Output: {result.stdout}"
+        )
+    print(f"Version: {version}, Config File: {config_file}")
     return config_file, version
 
 
 @u.print_test_decorator
-def run_test_windows(remote_host:u.Host, env_vars: dict) -> None:  
-
+def run_test_windows(remote_host: u.Host, env_vars: dict) -> None:
     """
-    Test to validate observe-agent version and config file loaded is correct 
+    Test to validate observe-agent version and config file loaded is correct
 
     Args:
-        remote_host (Host): instance to ssh into 
+        remote_host (Host): instance to ssh into
         env_vars (dict): environment variables passed into for testing
 
     Raises:
         ValueError: if version or config file is invalid
     """
 
-    config_file_windows = 'C:\\Program Files\\Observe\\observe-agent\\observe-agent.yaml'
-    #Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2 
-    version_pattern = re.compile(r'^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$')
+    config_file_windows = (
+        "C:\\Program Files\\Observe\\observe-agent\\observe-agent.yaml"
+    )
+    # Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2
+    version_pattern = re.compile(r"^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$")
 
-    result = remote_host.run_command('Set-Location "${Env:Programfiles}\\Observe\\observe-agent"; ./observe-agent version')    
+    result = remote_host.run_command(
+        'Set-Location "${Env:Programfiles}\\Observe\\observe-agent"; ./observe-agent version'
+    )
     config_file, version = _extract_version_config(result)
-     
+
     if config_file != config_file_windows:
         raise ValueError(f" ❌ Invalid config file: {config_file}")
     if not version_pattern.match(version):
         raise ValueError(f" ❌ Invalid version: {version}")
 
-    print (" ✅ Verified version and config file succesfully! ")
+    print(" ✅ Verified version and config file succesfully! ")
 
-    pass   
+    pass
 
 
 @u.print_test_decorator
-def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:  
-    docker_prefix='sudo docker run \
-        --mount type=bind,source=/proc,target=/hostfs/proc,readonly \
-        --mount type=bind,source=/snap,target=/hostfs/snap,readonly \
-        --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \
-        --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \
-        --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \
-        --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \
-        --pid host \
-        $(sudo docker images --format "{{.Repository}}:{{.Tag}}" | grep SNAPSHOT)'
-    config_file_linux = '/etc/observe-agent/observe-agent.yaml'
-    version_pattern = re.compile(r'^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$')
-    home_dir = "/home/{}".format(env_vars["user"])
-
-    # Upload default observe-agent.yaml to remote host home dir 
-    # mount via $(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml
-    observe_agent_file_path = os.path.abspath(os.path.join(os.getcwd(), '..',  'packaging/linux/config/observe-agent.yaml'))
-    print(f"Path to 'observe-agent.yaml' file: {observe_agent_file_path }")
-    remote_host.put_file(local_path=observe_agent_file_path, remote_path=home_dir)
-
-    #Run command to get version & config-file info 
-    result = remote_host.run_command('{} version'.format(docker_prefix))
+def run_test_docker(remote_host: u.Host, env_vars: dict) -> None:
+    u.upload_default_docker_config(env_vars, remote_host)
+    docker_prefix = u.get_docker_prefix(remote_host, False)
+    config_file_linux = "/etc/observe-agent/observe-agent.yaml"
+    version_pattern = re.compile(r"^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$")
+
+    # Run command to get version & config-file info
+    result = remote_host.run_command(docker_prefix + " version")
     config_file, version = _extract_version_config(result)
 
-    
-    if config_file != config_file_linux: 
+    if config_file != config_file_linux:
         raise ValueError(f" ❌ Invalid config file: {config_file}")
     if not version_pattern.match(version):
         raise ValueError(f" ❌ Invalid version: {version}")
 
-    print (" ✅ Verified version and config file succesfully! ")    
+    print(" ✅ Verified version and config file succesfully! ")
 
-@u.print_test_decorator
-def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:    
 
+@u.print_test_decorator
+def run_test_linux(remote_host: u.Host, env_vars: dict) -> None:
     """
-    Test to validate observe-agent version and config file loaded is correct 
+    Test to validate observe-agent version and config file loaded is correct
 
     Args:
-        remote_host (Host): instance to ssh into 
+        remote_host (Host): instance to ssh into
         env_vars (dict): environment variables passed into for testing
 
     Raises:
         ValueError: if version or config file is invalid
     """
-    config_file_linux = '/etc/observe-agent/observe-agent.yaml'
-    #Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2 
-    version_pattern = re.compile(r'^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$')
-  
-    result = remote_host.run_command('observe-agent version')    
+    config_file_linux = "/etc/observe-agent/observe-agent.yaml"
+    # Can match 0.2.2-SNAPSHOT-b6e1491 or 0.2.2
+    version_pattern = re.compile(r"^\d+\.\d+\.\d+(-[A-Za-z0-9-]+)?$")
+
+    result = remote_host.run_command("observe-agent version")
     config_file, version = _extract_version_config(result)
-    
+
     if config_file != config_file_linux:
         raise ValueError(f" ❌ Invalid config file: {config_file}")
     if not version_pattern.match(version):
         raise ValueError(f" ❌ Invalid version: {version}")
 
-    print (" ✅ Verified version and config file succesfully! ")
+    print(" ✅ Verified version and config file succesfully! ")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
 
     env_vars = u.get_env_vars()
-    remote_host = u.Host(host_ip=env_vars["host"],
-                       username=env_vars["user"],
-                       key_file_path=env_vars["key_filename"],
-                       password=env_vars["password"])    
-    
-    #Test SSH Connection before starting test of interest 
-    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))   
-
-    if "redhat" in env_vars["machine_config"]["distribution"] or "debian" in env_vars["machine_config"]["distribution"]:
+    remote_host = u.Host(
+        host_ip=env_vars["host"],
+        username=env_vars["user"],
+        key_file_path=env_vars["key_filename"],
+        password=env_vars["password"],
+    )
+
+    # Test SSH Connection before starting test of interest
+    remote_host.test_conection(int(env_vars["machine_config"]["sleep"]))
+
+    if (
+        "redhat" in env_vars["machine_config"]["distribution"]
+        or "debian" in env_vars["machine_config"]["distribution"]
+    ):
         run_test_linux(remote_host, env_vars)
     elif "windows" in env_vars["machine_config"]["distribution"]:
         run_test_windows(remote_host, env_vars)
     elif "docker" in env_vars["machine_config"]["distribution"]:
         run_test_docker(remote_host, env_vars)
-
-
diff --git a/integration/scripts/utils.py b/integration/scripts/utils.py
index 6fe677d11..2bb2f6090 100644
--- a/integration/scripts/utils.py
+++ b/integration/scripts/utils.py
@@ -1,37 +1,47 @@
+from typing import Any, Dict
 from socket import error as socket_error
 
-from fabric import Connection
+from fabric import Connection, Result
 from paramiko.ssh_exception import AuthenticationException, NoValidConnectionsError
 
 import os
 import sys
-import re
-import time 
-import json 
-import pprint 
+import time
 
 
 def die(message: str) -> None:
     print(message, file=sys.stderr)
     sys.exit(1)
 
-def mask_credentials(env_vars):
+
+def print_remote_result(result: Result) -> None:
+    print(str(result))
+
+
+def mask_credentials(env_vars: Dict[str, Any]) -> Dict[str, Any]:
     masked_env_vars = env_vars.copy()
-    #Only mask if vars exist 
-    if masked_env_vars["password"] and masked_env_vars["password"] is not None and masked_env_vars["password"] != "None" :
-        masked_env_vars["password"] = '*' * 5
-    if masked_env_vars["observe_token"] and masked_env_vars["observe_token"] is not None and masked_env_vars["observe_token"] != "None": 
-        masked_env_vars["observe_token"] = '*' * 5
+    # Only mask if vars exist
+    if (
+        masked_env_vars["password"]
+        and masked_env_vars["password"] is not None
+        and masked_env_vars["password"] != "None"
+    ):
+        masked_env_vars["password"] = "*" * 5
+    if (
+        masked_env_vars["observe_token"]
+        and masked_env_vars["observe_token"] is not None
+        and masked_env_vars["observe_token"] != "None"
+    ):
+        masked_env_vars["observe_token"] = "*" * 5
     return masked_env_vars
 
-def get_env_vars(need_observe: bool = False) -> dict:
-    
 
+def get_env_vars(need_observe: bool = False) -> Dict[str, Any]:
     """Gets environmental variables from OS and returns a dict of env_vars
 
     Args:
         need_observe (bool, optional): whether or not to require observe url/token variables.
-          Defaults to False.       
+          Defaults to False.
 
     Returns:
         _type_: dict of environment variables
@@ -39,45 +49,60 @@ def get_env_vars(need_observe: bool = False) -> dict:
     host = os.environ.get("HOST")
     user = os.environ.get("USER")
     key_filename = os.environ.get("KEY_FILENAME")
-    password=os.environ.get("PASSWORD")
-    machine_name=os.environ.get("MACHINE_NAME")
-    machine_config_string=os.environ.get("MACHINE_CONFIG")
-    observe_url=os.environ.get("OBSERVE_URL")
-    observe_token=os.environ.get("OBSERVE_TOKEN")
+    password = os.environ.get("PASSWORD")
+    machine_name = os.environ.get("MACHINE_NAME")
+    machine_config_string = os.environ.get("MACHINE_CONFIG")
+    observe_url = os.environ.get("OBSERVE_URL")
+    observe_token = os.environ.get("OBSERVE_TOKEN")
 
     mask = os.getenv("MASK", "True").lower() not in ("false", "0", "f", "no", "n")
 
-
     if host is None:
-        die("Error: HOST environment variable is not set. This should be an output variable from create_ec2 module")
+        die(
+            "Error: HOST environment variable is not set. This should be an output variable from create_ec2 module"
+        )
 
     if user is None:
-        die("Error: USER environment variable is not set. This should be an output variable from create_ec2 module")
+        die(
+            "Error: USER environment variable is not set. This should be an output variable from create_ec2 module"
+        )
 
     if key_filename is None:
-        die("Error: KEY_FILENAME environment variable is not set. This should be an output variable from create_ec2 module")
+        die(
+            "Error: KEY_FILENAME environment variable is not set. This should be an output variable from create_ec2 module"
+        )
 
-    if (password == 'None' or password is None) and "WINDOWS" in machine_name:
-        die("Error: Windows is specified but PASSWORD environment variable is not set. This should be an output variable from create_ec2 module")
+    if (password == "None" or password is None) and "WINDOWS" in machine_name:
+        die(
+            "Error: Windows is specified but PASSWORD environment variable is not set. This should be an output variable from create_ec2 module"
+        )
 
     if machine_name is None:
-        die("Error: MACHINE_NAME environment variable is not set. This should be an output variable from create_ec2 module")
+        die(
+            "Error: MACHINE_NAME environment variable is not set. This should be an output variable from create_ec2 module"
+        )
 
     if machine_config_string is None:
-        die("Error: MACHINE_CONFIG environment variable is not set. This should be an output variable from create_ec2 module")
+        die(
+            "Error: MACHINE_CONFIG environment variable is not set. This should be an output variable from create_ec2 module"
+        )
 
     if observe_url is None and need_observe:
-        die("Error: OBSERVE_URL environment variable is not set. This should be an output variable from setup_observe_variables module")
+        die(
+            "Error: OBSERVE_URL environment variable is not set. This should be an output variable from setup_observe_variables module"
+        )
     if observe_token is None and need_observe:
-        die("Error: OBSERVE_TOKEN environment variable is not set. This should be an output variable from setup_observe_variables module")
+        die(
+            "Error: OBSERVE_TOKEN environment variable is not set. This should be an output variable from setup_observe_variables module"
+        )
 
-     # Split the string into key-value pairs
-    pairs = machine_config_string.split(',')
+    # Split the string into key-value pairs
+    pairs = machine_config_string.split(",")
     data = {}
     for pair in pairs:
-        key, value = pair.split(':', 1)  #
+        key, value = pair.split(":", 1)  #
         data[key] = value
-    
+
     env_vars = {
         "host": host,
         "user": user,
@@ -86,20 +111,20 @@ def get_env_vars(need_observe: bool = False) -> dict:
         "machine_name": machine_name,
         "machine_config": data,
         "observe_url": observe_url,
-        "observe_token": observe_token
+        "observe_token": observe_token,
     }
 
     # Mask sensitive vars before printing
     masked_env_vars = mask_credentials(env_vars)
 
-    print("-"*30)
+    print("-" * 30)
     if mask:
         print("Masking Enabled")
-        print("Env vars set to: \n",  masked_env_vars )
+        print("Env vars set to: \n", masked_env_vars)
     else:
         print("Masking Disabled")
-        print("Env vars set to: \n",  env_vars )
-    print("-"*30)
+        print("Env vars set to: \n", env_vars)
+    print("-" * 30)
 
     return env_vars
 
@@ -112,51 +137,65 @@ def wrapper(*args, **kwargs):
         result = func(*args, **kwargs)
         print("*" * 30)
         return result
+
     return wrapper
 
-class ExampleException(Exception):  #We can put our custom exceptions here 
+
+class ExampleException(Exception):  # We can put our custom exceptions here
     pass
 
 
 class Host(object):
+    """Host class for SSH into EC2 instances"""
 
-    """Host class for SSH into EC2 instances 
-    """
-    def __init__(self, host_ip, username, key_file_path,password=None):
+    def __init__(self, host_ip, username, key_file_path, password=None):
         self.host_ip = host_ip
         self.username = username
         self.key_file_path = key_file_path
-        self.password=password
+        self.password = password
 
     def _get_connection(self) -> Connection:
-        connect_kwargs = {'key_filename': self.key_file_path,                          
-                          'password': self.password ,
-                          'timeout': 60,                      
-                          }
-        return Connection(host=self.host_ip, user=self.username, port=22,
-                          connect_kwargs=connect_kwargs)
-
-    def run_command(self, command):
+        connect_kwargs = {
+            "key_filename": self.key_file_path,
+            "password": self.password,
+            "timeout": 60,
+        }
+        return Connection(
+            host=self.host_ip,
+            user=self.username,
+            port=22,
+            connect_kwargs=connect_kwargs,
+        )
+
+    def run_command(self, command) -> Result:
         try:
             with self._get_connection() as connection:
-                print('Running `{0}` on {1}'.format(command, self.host_ip))
-                result = connection.run(command, warn=True, hide=True)                
+                print("Running `{0}` on {1}".format(command, self.host_ip))
+                result = connection.run(command, warn=True, hide=True)
         except (socket_error, AuthenticationException) as exc:
             self._raise_authentication_err(exc)
 
         if result.failed:
             raise ExampleException(
-                'The command `{0}` on host {1} failed with the error: '
-                '{2}'.format(command, self.host_ip, str(result.stderr)))
-        
-        return result
+                "The command `{0}` on host {1} failed with the error: "
+                "{2}\n\nCommand output: {3}".format(
+                    command,
+                    self.host_ip,
+                    str(result.stderr) or "<empty>",
+                    str(result.stdout) or "<empty>",
+                )
+            )
 
+        return result
 
     def put_file(self, local_path, remote_path) -> None:
         try:
             with self._get_connection() as connection:
-                print('Copying {0} to {1} on host {2}'.format(
-                    local_path, remote_path, self.host_ip))
+                print(
+                    "Copying {0} to {1} on host {2}".format(
+                        local_path, remote_path, self.host_ip
+                    )
+                )
                 connection.put(local_path, remote_path)
         except (socket_error, AuthenticationException) as exc:
             self._raise_authentication_err(exc)
@@ -164,21 +203,25 @@ def put_file(self, local_path, remote_path) -> None:
     def get_file(self, remote_path, local_path) -> None:
         try:
             with self._get_connection() as connection:
-                print('Copying {0} to {1} from host {2}'.format(
-                    remote_path, local_path, self.host_ip))
+                print(
+                    "Copying {0} to {1} from host {2}".format(
+                        remote_path, local_path, self.host_ip
+                    )
+                )
                 connection.get(remote_path, local_path)
         except (socket_error, AuthenticationException) as exc:
             self._raise_authentication_err(exc)
 
-    def _raise_authentication_err(self, exc):
+    def _raise_authentication_err(self, exc) -> None:
         raise ExampleException(
             "SSH: could not connect to {host} "
             "(username: {user}, key: {key}): {exc}".format(
-                host=self.host_ip, user=self.username,
-                key=self.key_file_path, exc=exc))
-    
-    def test_conection(self, timeout=60):
-        """Tests SSH connection to the host 
+                host=self.host_ip, user=self.username, key=self.key_file_path, exc=exc
+            )
+        )
+
+    def test_conection(self, timeout=60) -> None:
+        """Tests SSH connection to the host
 
         Args:
             timeout (int, optional): how long to wait for the connection to be established. Defaults to 60.
@@ -186,16 +229,92 @@ def test_conection(self, timeout=60):
         Raises:
             RuntimeError: SSH connection failures if the timeout is reached and no valid connection found
         """
-        print("Testing SSH connection to host {} with timeout {}s".format(self.host_ip, timeout))
+        print(
+            "Testing SSH connection to host {} with timeout {}s".format(
+                self.host_ip, timeout
+            )
+        )
         for _ in range(timeout):
             connection = self._get_connection()
             try:
                 connection.open()
                 print("✅ SSH connection successful")
                 connection.close()
-                return 
+                return
             except (socket_error, NoValidConnectionsError) as exc:
                 print(f"❌ SSH connection failed: {exc}")
             time.sleep(1)
         raise RuntimeError(" ❌ The SSH connection failed")
 
+
+def get_docker_image(remote_host: Host) -> str:
+    result = remote_host.run_command(
+        'sudo docker images --format "{{.Repository}}:{{.Tag}}"'
+    )
+    images = [line.strip() for line in result.stdout.splitlines() if "SNAPSHOT" in line]
+    if len(images) != 1:
+        die("❌ Error in finding observe-agent image\n" + str(result))
+
+    return images[0]
+
+
+def get_docker_prefix(remote_host: Host, detach: bool) -> str:
+    image = get_docker_image(remote_host)
+    return f'sudo docker run {"-d --restart on-failure" if detach else ""} \
+        --mount type=bind,source=/proc,target=/hostfs/proc,readonly \
+        --mount type=bind,source=/snap,target=/hostfs/snap,readonly \
+        --mount type=bind,source=/boot,target=/hostfs/boot,readonly \
+        --mount type=bind,source=/var/lib,target=/hostfs/var/lib,readonly \
+        --mount type=bind,source=/var/log,target=/hostfs/var/log,readonly \
+        --mount type=bind,source=/var/lib/docker/containers,target=/var/lib/docker/containers,readonly \
+        --mount type=bind,source=$(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml \
+        --pid host {image}'
+
+
+def upload_default_docker_config(env_vars: dict, remote_host: Host) -> None:
+    home_dir = "/home/{}".format(env_vars["user"])
+    # Upload default observe-agent.yaml to remote host home dir
+    # mount via $(pwd)/observe-agent.yaml,target=/etc/observe-agent/observe-agent.yaml
+    observe_agent_file_path = os.path.abspath(
+        os.path.join(os.getcwd(), "..", "packaging/linux/config/observe-agent.yaml")
+    )
+    print(f"Path to 'observe-agent.yaml' file: {observe_agent_file_path }")
+    remote_host.put_file(local_path=observe_agent_file_path, remote_path=home_dir)
+
+
+def get_docker_container(remote_host: Host) -> str:
+    get_container_command = 'sudo docker ps --filter "status=running" --format "{{.ID}} {{.Image}} {{.CreatedAt}}"'
+    result = remote_host.run_command(get_container_command)
+    running = [
+        line.strip() for line in result.stdout.splitlines() if "SNAPSHOT" in line
+    ]
+    if len(running) == 0:
+        # No container matched our filter. Get logs from all containers to help debug.
+        result = remote_host.run_command('sudo docker ps --format "{{.ID}}"')
+        if result.stdout != "":
+            container_ids = result.stdout.splitlines()
+            for container_id in container_ids:
+                print(
+                    "Logs for container {}:".format(container_id),
+                    file=sys.stderr,
+                )
+                result = remote_host.run_command(
+                    "sudo docker logs {}".format(container_id)
+                )
+                print_remote_result(result)
+        else:
+            print_remote_result(result)
+        die(
+            "❌ Error in finding observe-agent container; command output:\n{}\ncommand error:\n{}".format(
+                result.stdout or "<empty>",
+                result.stderr or "<empty>",
+            )
+        )
+        return ""
+    if len(running) > 1:
+        die(
+            "❌ Error in finding observe-agent container, too many snapshots running:\n"
+            + result.stdout
+        )
+    # Only one snapshot running; return the ID from the first line.
+    return running[0].split()[0]
diff --git a/integration/tests/integration.tftest.hcl b/integration/tests/integration.tftest.hcl
index 01fac5a6b..7e69c7f46 100644
--- a/integration/tests/integration.tftest.hcl
+++ b/integration/tests/integration.tftest.hcl
@@ -15,7 +15,6 @@ run "setup_observe_variables" {
 }
 
 
-
 run "test_ec2_connection" {
   module {
     source  = "observeinc/collection/aws//modules/testing/exec"
@@ -41,8 +40,6 @@ run "test_ec2_connection" {
 }
 
 
-
-
 run "test_install" {
   module {
     source  = "observeinc/collection/aws//modules/testing/exec"
@@ -68,8 +65,6 @@ run "test_install" {
 }
 
 
-
-
 run "test_version" {
   module {
     source  = "observeinc/collection/aws//modules/testing/exec"
@@ -95,8 +90,6 @@ run "test_version" {
 }
 
 
-
-
 run "test_configure" {
   module {
     source  = "observeinc/collection/aws//modules/testing/exec"
@@ -123,6 +116,7 @@ run "test_configure" {
   }
 }
 
+
 run "test_start" {
   module {
     source  = "observeinc/collection/aws//modules/testing/exec"
@@ -147,3 +141,27 @@ run "test_start" {
   }
 }
 
+
+run "test_diagnose" {
+  module {
+    source  = "observeinc/collection/aws//modules/testing/exec"
+    version = "2.9.0"
+  }
+
+  variables {
+    command = "python3 ./scripts/test_diagnose.py"
+    env_vars = {
+      HOST           = run.setup_ec2.public_ip
+      USER           = run.setup_ec2.user_name
+      KEY_FILENAME   = run.setup_ec2.private_key_path
+      PASSWORD       = run.setup_ec2.password
+      MACHINE_NAME   = run.setup_ec2.machine_name
+      MACHINE_CONFIG = run.setup_ec2.machine_config
+    }
+  }
+
+  assert {
+    condition     = output.error == ""
+    error_message = "Error in Diagnose Test"
+  }
+}
diff --git a/internal/commands/diagnose/agentstatuscheck.go b/internal/commands/diagnose/agentstatuscheck.go
new file mode 100644
index 000000000..87a46a21f
--- /dev/null
+++ b/internal/commands/diagnose/agentstatuscheck.go
@@ -0,0 +1,59 @@
+package diagnose
+
+import (
+	"embed"
+
+	"github.com/observeinc/observe-agent/internal/commands/status"
+	"github.com/spf13/viper"
+)
+
+type StatusTestResult struct {
+	Passed       bool   // true only when the agent is running and reports metrics
+	AgentRunning bool   // true when the reported status equals status.Running
+	Error        string // failure reason; left empty when the check passes
+}
+
+// checkStatus is the diagnostic check behind "Agent Status Check". It asks
+// status.GetStatusData for the agent's state and passes only when the agent
+// reports the Running status and a non-zero AgentMetrics value.
+//
+// The boolean mirrors StatusTestResult.Passed. The returned error is always
+// nil: every failure, including an error from GetStatusData, is described in
+// the StatusTestResult (Error, AgentRunning) that the template renders.
+func checkStatus(_ *viper.Viper) (bool, any, error) {
+	// failure builds the result triple shared by all unsuccessful outcomes.
+	failure := func(agentRunning bool, reason string) (bool, any, error) {
+		return false, StatusTestResult{
+			Passed:       false,
+			AgentRunning: agentRunning,
+			Error:        reason,
+		}, nil
+	}
+
+	statusData, err := status.GetStatusData()
+	switch {
+	case err != nil:
+		return failure(false, err.Error())
+	case statusData.Status != status.Running.String():
+		return failure(false, "agent is not running")
+	case statusData.AgentMetrics == (status.AgentMetrics{}):
+		return failure(true, "agent metrics are not available")
+	}
+	return true, StatusTestResult{Passed: true, AgentRunning: true}, nil
+}
+
+const agentStatusCheckTemplate = "agentstatuscheck.tmpl"
+
+var (
+	//go:embed agentstatuscheck.tmpl
+	agentStatusCheckTemplateFS embed.FS
+)
+
+func agentstatusDiagnostic() Diagnostic {
+	return Diagnostic{
+		check:        checkStatus,
+		checkName:    "Agent Status Check",
+		templateName: agentStatusCheckTemplate,
+		templateFS:   agentStatusCheckTemplateFS,
+	}
+}
diff --git a/internal/commands/diagnose/agentstatuscheck.tmpl b/internal/commands/diagnose/agentstatuscheck.tmpl
new file mode 100644
index 000000000..eef7b8efb
--- /dev/null
+++ b/internal/commands/diagnose/agentstatuscheck.tmpl
@@ -0,0 +1,7 @@
+{{- if .Passed -}}
+Observe agent is running and metrics are available.
+{{- else if not .AgentRunning -}}
+⚠️ Observe agent is not running. {{- if .Error }} Error: {{ .Error }}{{ end }}
+{{- else -}}
+⚠️ Observe agent status check failed with error: {{ .Error }}
+{{- end -}}
diff --git a/internal/commands/diagnose/authcheck.go b/internal/commands/diagnose/authcheck.go
index e6c559eb8..09733682a 100644
--- a/internal/commands/diagnose/authcheck.go
+++ b/internal/commands/diagnose/authcheck.go
@@ -9,11 +9,6 @@ import (
 	"github.com/spf13/viper"
 )
 
-const (
-	ChallengeURL = "https://175914298205.collect.observeinc.com/.well-known/fastly/logging/challenge"
-	AuthCheckURL = "https://175914298205.collect.observeinc.com/status"
-)
-
 type NetworkTestResult struct {
 	URL          string
 	ResponseCode int
@@ -69,11 +64,11 @@ func makeTestRequest(URL string, headers map[string]string) NetworkTestResult {
 	}
 }
 
-func makeAuthTestRequest(v *viper.Viper) (any, error) {
+func makeAuthTestRequest(v *viper.Viper) (bool, any, error) {
 	collector_url := v.GetString("observe_url")
 	authToken := fmt.Sprintf("Bearer %s", v.GetString("token"))
 	authTestResponse := makeTestRequest(collector_url, map[string]string{"Authorization": authToken})
-	return authTestResponse, nil
+	return authTestResponse.Passed, authTestResponse, nil
 }
 
 // const networkcheckTemplate = "networkcheck.tmpl"
diff --git a/internal/commands/diagnose/authcheck.tmpl b/internal/commands/diagnose/authcheck.tmpl
index 9433bc852..4581ce6ac 100644
--- a/internal/commands/diagnose/authcheck.tmpl
+++ b/internal/commands/diagnose/authcheck.tmpl
@@ -1,13 +1,12 @@
 Running auth check against {{ .URL }}
-{{- if .Passed }}
+
+{{ if .Passed -}}
 Request to test URL responded with response code {{ .ResponseCode }}
-{{- else }}
-{{- if eq .ResponseCode 401 }} 
+{{- else if eq .ResponseCode 401 -}}
 ⚠️ Request to test URL failed with error {{ .Error }}.
 
 Remediation
 Please check that the token is present in the `observe-agent.yaml` config file and that the token is valid. 
-{{- else }}
+{{- else -}}
 ⚠️ Request to test URL failed with error {{ .Error }} and response code {{ .ResponseCode }}.
-{{- end }}
-{{ end }}
+{{- end -}}
diff --git a/internal/commands/diagnose/configcheck.go b/internal/commands/diagnose/configcheck.go
index 736cdfa35..9e4857b23 100644
--- a/internal/commands/diagnose/configcheck.go
+++ b/internal/commands/diagnose/configcheck.go
@@ -17,18 +17,18 @@ type ConfigTestResult struct {
 	Error          string
 }
 
-func checkConfig(v *viper.Viper) (any, error) {
+func checkConfig(v *viper.Viper) (bool, any, error) {
 	configFile := v.ConfigFileUsed()
 	if configFile == "" {
-		return nil, fmt.Errorf("no config file defined")
+		return false, nil, fmt.Errorf("no config file defined")
 	}
 	contents, err := os.ReadFile(configFile)
 	if err != nil {
-		return nil, err
+		return false, nil, err
 	}
 	var conf config.AgentConfig
 	if err = yaml.Unmarshal(contents, &conf); err != nil {
-		return ConfigTestResult{
+		return false, ConfigTestResult{
 			ConfigFile:     configFile,
 			ParseSucceeded: false,
 			IsValid:        false,
@@ -36,14 +36,14 @@ func checkConfig(v *viper.Viper) (any, error) {
 		}, nil
 	}
 	if err = conf.Validate(); err != nil {
-		return ConfigTestResult{
+		return false, ConfigTestResult{
 			ConfigFile:     configFile,
 			ParseSucceeded: true,
 			IsValid:        false,
 			Error:          err.Error(),
 		}, nil
 	}
-	return ConfigTestResult{
+	return true, ConfigTestResult{
 		ConfigFile:     configFile,
 		ParseSucceeded: true,
 		IsValid:        true,
diff --git a/internal/commands/diagnose/configcheck.tmpl b/internal/commands/diagnose/configcheck.tmpl
index 6f804f8bd..e65d294ce 100644
--- a/internal/commands/diagnose/configcheck.tmpl
+++ b/internal/commands/diagnose/configcheck.tmpl
@@ -1,9 +1,10 @@
 Running check on observe-agent config file {{ .ConfigFile }}
-{{- if .IsValid }}
+
+{{ if .IsValid -}}
 Config file is valid.
-{{- else if .ParseSucceeded}}
+{{- else if .ParseSucceeded -}}
 ⚠️ Config file validation failed with error {{ .Error }}
-{{- else }}
+{{- else -}}
 ⚠️ Config file could not be parsed as YAML
 {{ .Error }}
-{{- end }}
+{{- end -}}
diff --git a/internal/commands/diagnose/configcheck_test.go b/internal/commands/diagnose/configcheck_test.go
index 01db3ccc7..686df8e1d 100644
--- a/internal/commands/diagnose/configcheck_test.go
+++ b/internal/commands/diagnose/configcheck_test.go
@@ -66,7 +66,7 @@ func Test_checkConfig(t *testing.T) {
 
 		v := viper.New()
 		v.SetConfigFile(f.Name())
-		resultAny, err := checkConfig(v)
+		success, resultAny, err := checkConfig(v)
 		assert.NoError(t, err)
 		result, ok := resultAny.(ConfigTestResult)
 		assert.True(t, ok)
@@ -77,6 +77,7 @@ func Test_checkConfig(t *testing.T) {
 		}
 		assert.Equal(t, tc.shouldParse, result.ParseSucceeded)
 		assert.Equal(t, tc.isValid, result.IsValid)
+		assert.Equal(t, tc.isValid && tc.shouldParse, success)
 		assert.Equal(t, f.Name(), result.ConfigFile)
 	}
 }
diff --git a/internal/commands/diagnose/diagnose.go b/internal/commands/diagnose/diagnose.go
index 16d65ae19..c2db1bccb 100644
--- a/internal/commands/diagnose/diagnose.go
+++ b/internal/commands/diagnose/diagnose.go
@@ -15,7 +15,7 @@ import (
 )
 
 type Diagnostic struct {
-	check        func(*viper.Viper) (any, error)
+	check        func(*viper.Viper) (bool, any, error)
 	checkName    string
 	templateName string
 	templateFS   embed.FS
@@ -24,6 +24,7 @@ type Diagnostic struct {
 var diagnostics = []Diagnostic{
 	configDiagnostic(),
 	otelconfigDiagnostic(),
+	agentstatusDiagnostic(),
 	authDiagnostic(),
 }
 
@@ -35,10 +36,14 @@ var diagnoseCmd = &cobra.Command{
 to attempt to identify issues that could cause the agent to function improperly.`,
 	Run: func(cmd *cobra.Command, args []string) {
 		v := viper.GetViper()
-		fmt.Print("Running diagnosis checks...\n")
+		fmt.Print("Running diagnosis checks...")
+		var failedChecks []string
 		for _, diagnostic := range diagnostics {
-			fmt.Printf("\n%s\n================\n\n", diagnostic.checkName)
-			data, err := diagnostic.check(v)
+			fmt.Printf("\n\n\n%s\n==================\n", diagnostic.checkName)
+			success, data, err := diagnostic.check(v)
+			if !success {
+				failedChecks = append(failedChecks, diagnostic.checkName)
+			}
 			if err != nil {
 				fmt.Printf("⚠️ Failed to run check: %s\n", err.Error())
 				continue
@@ -51,6 +56,15 @@ to attempt to identify issues that could cause the agent to function improperly.
 				continue
 			}
 		}
+		if len(failedChecks) > 0 {
+			fmt.Printf("\n\n\n❌ %d out of %d checks failed:\n", len(failedChecks), len(diagnostics))
+			for _, check := range failedChecks {
+				fmt.Printf("  - %s\n", check)
+			}
+			os.Exit(1)
+		} else {
+			fmt.Printf("\n✅ All %d checks passed!\n", len(diagnostics))
+		}
 	},
 }
 
diff --git a/internal/commands/diagnose/otelconfigcheck.go b/internal/commands/diagnose/otelconfigcheck.go
index 5a33976b5..72a6886ce 100644
--- a/internal/commands/diagnose/otelconfigcheck.go
+++ b/internal/commands/diagnose/otelconfigcheck.go
@@ -5,6 +5,7 @@ import (
 	"embed"
 
 	"github.com/observeinc/observe-agent/internal/commands/start"
+	logger "github.com/observeinc/observe-agent/internal/commands/util"
 	"github.com/spf13/viper"
 	"go.opentelemetry.io/collector/otelcol"
 )
@@ -14,10 +15,10 @@ type OtelConfigTestResult struct {
 	Error  string
 }
 
-func checkOtelConfig(_ *viper.Viper) (any, error) {
-	colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings()
+func checkOtelConfig(_ *viper.Viper) (bool, any, error) {
+	colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings(logger.WithCtx(context.Background(), logger.GetNop()))
 	if err != nil {
-		return nil, err
+		return false, nil, err
 	}
 	if cleanup != nil {
 		defer cleanup()
@@ -26,16 +27,16 @@ func checkOtelConfig(_ *viper.Viper) (any, error) {
 	// https://github.com/open-telemetry/opentelemetry-collector/blob/main/otelcol/command_validate.go
 	col, err := otelcol.NewCollector(*colSettings)
 	if err != nil {
-		return nil, err
+		return false, nil, err
 	}
 	err = col.DryRun(context.Background())
 	if err != nil {
-		return OtelConfigTestResult{
+		return false, OtelConfigTestResult{
 			Passed: false,
 			Error:  err.Error(),
 		}, nil
 	}
-	return OtelConfigTestResult{
+	return true, OtelConfigTestResult{
 		Passed: true,
 	}, nil
 }
diff --git a/internal/commands/diagnose/otelconfigcheck.tmpl b/internal/commands/diagnose/otelconfigcheck.tmpl
index dbe90e51d..f2879ff02 100644
--- a/internal/commands/diagnose/otelconfigcheck.tmpl
+++ b/internal/commands/diagnose/otelconfigcheck.tmpl
@@ -1,5 +1,5 @@
-{{- if .Passed }}
+{{- if .Passed -}}
 OTEL configuration is valid.
-{{- else }}
+{{- else -}}
 ⚠️ OTEL configuration validation failed with error {{ .Error }}
-{{- end }}
+{{- end -}}
diff --git a/internal/commands/start/start.go b/internal/commands/start/start.go
index a19b161cc..17a9308d6 100644
--- a/internal/commands/start/start.go
+++ b/internal/commands/start/start.go
@@ -41,8 +41,11 @@ func SetupAndGetConfigFiles(ctx context.Context) ([]string, func(), error) {
 	return configFilePaths, cleanup, nil
 }
 
-func SetupAndGenerateCollectorSettings() (*collector.CollectorSettings, func(), error) {
-	ctx := logger.WithCtx(context.Background(), logger.Get())
+// DefaultLoggerCtx returns a background context that carries the logger
+// obtained from logger.Get().
+func DefaultLoggerCtx() context.Context { return logger.WithCtx(context.Background(), logger.Get()) }
+
+func SetupAndGenerateCollectorSettings(ctx context.Context) (*collector.CollectorSettings, func(), error) {
 	configFilePaths, cleanup, err := SetupAndGetConfigFiles(ctx)
 	if err != nil {
 		return nil, cleanup, err
@@ -59,7 +62,7 @@ var startCmd = &cobra.Command{
 This command reads in the local config and env vars and starts the 
 collector on the current host.`,
 	RunE: func(cmd *cobra.Command, args []string) error {
-		colSettings, cleanup, err := SetupAndGenerateCollectorSettings()
+		colSettings, cleanup, err := SetupAndGenerateCollectorSettings(DefaultLoggerCtx())
 		if err != nil {
 			return err
 		}
diff --git a/internal/commands/version/version.go b/internal/commands/version/version.go
index e71164421..527dbdf20 100644
--- a/internal/commands/version/version.go
+++ b/internal/commands/version/version.go
@@ -9,6 +9,7 @@ import (
 	"github.com/observeinc/observe-agent/build"
 	"github.com/observeinc/observe-agent/internal/root"
 	"github.com/spf13/cobra"
+	"github.com/spf13/viper"
 )
 
 // versionCmd represents the version command
@@ -18,8 +19,8 @@ var versionCmd = &cobra.Command{
 	Long: `Display the currently installed version of the observe-agent. This version
 is based on the package release.`,
 	Run: func(cmd *cobra.Command, args []string) {
-		version := getVersion()
-		fmt.Printf("observe-agent version: %s\n", version)
+		fmt.Printf("observe-agent version: %s\n", getVersion())
+		fmt.Printf("observe-agent config file: %s\n", getConfigFile())
 	},
 }
 
@@ -43,3 +44,11 @@ func getVersion() string {
 	}
 	return build.Version
 }
+
+// getConfigFile returns the config file viper loaded, or "[none]" when there is none.
+func getConfigFile() string {
+	if used := viper.ConfigFileUsed(); used != "" {
+		return used
+	}
+	return "[none]"
+}
diff --git a/internal/connections/confighandler.go b/internal/connections/confighandler.go
index 88cf65036..6ebeb6c11 100644
--- a/internal/connections/confighandler.go
+++ b/internal/connections/confighandler.go
@@ -52,7 +52,7 @@ func GetAllOtelConfigFilePaths(ctx context.Context, tmpDir string) ([]string, st
 		}
 		configFilePaths = append(configFilePaths, overridePath)
 	}
-	logger.FromCtx(ctx).Info(fmt.Sprint("Config file paths:", configFilePaths))
+	logger.FromCtx(ctx).Debug(fmt.Sprint("Config file paths:", configFilePaths))
 	return configFilePaths, overridePath, nil
 }
 
diff --git a/internal/root/root.go b/internal/root/root.go
index 5361ab33c..6eeb847cd 100644
--- a/internal/root/root.go
+++ b/internal/root/root.go
@@ -55,7 +55,11 @@ func InitConfig() {
 	viper.AutomaticEnv() // read in environment variables that match
 
 	// If a config file is found, read it in.
-	if err := viper.ReadInConfig(); err == nil {
-		fmt.Fprintln(os.Stderr, "Using config file:", viper.ConfigFileUsed())
+	if err := viper.ReadInConfig(); err != nil {
+		if _, ok := err.(viper.ConfigFileNotFoundError); ok {
+			// Config file not found; ignore this error.
+		} else {
+			fmt.Fprintln(os.Stderr, "error reading config file:", err)
+		}
 	}
 }
diff --git a/main_windows.go b/main_windows.go
index cee0ca631..f0c8fb7dc 100644
--- a/main_windows.go
+++ b/main_windows.go
@@ -27,7 +27,7 @@ func run() error {
 		}
 		root.CfgFile = os.Args[1]
 		root.InitConfig()
-		colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings()
+		colSettings, cleanup, err := start.SetupAndGenerateCollectorSettings(start.DefaultLoggerCtx())
 		if err != nil {
 			return err
 		}
diff --git a/packaging/linux/config/observe-agent.yaml b/packaging/linux/config/observe-agent.yaml
index 37b2880da..c68c408de 100644
--- a/packaging/linux/config/observe-agent.yaml
+++ b/packaging/linux/config/observe-agent.yaml
@@ -1,8 +1,8 @@
 # Observe data token
-token: "${OBSERVE_TOKEN}"
+token: "<OBSERVE TOKEN>"
 
 # Target Observe collection url
-observe_url: "${OBSERVE_COLLECTION_ENDPOINT}"
+observe_url: "<OBSERVE COLLECTION ENDPOINT>"
 
 # Debug mode - Sets agent log level to debug
 debug: false