diff --git a/.github/workflows/k8s-scaling-test.yml b/.github/workflows/k8s-scaling-test.yml index 9453a8031..f3e563bde 100644 --- a/.github/workflows/k8s-scaling-test.yml +++ b/.github/workflows/k8s-scaling-test.yml @@ -22,6 +22,19 @@ jobs: matrix: include: - k8s-version: 'v1.31.2' + test-strategy: chart_test_autoscaling_job_count_chaos + cluster: 'minikube' + helm-version: 'v3.16.3' + docker-version: '27.3.1' + python-version: '3.13' + - k8s-version: 'v1.31.2' + test-strategy: chart_test_autoscaling_job_count_max_sessions + cluster: 'minikube' + helm-version: 'v3.16.3' + docker-version: '27.3.1' + python-version: '3.13' + - k8s-version: 'v1.31.2' + test-strategy: chart_test_autoscaling_job_count cluster: 'minikube' helm-version: 'v3.16.3' docker-version: '27.3.1' @@ -103,54 +116,12 @@ jobs: timeout_minutes: 30 max_attempts: 3 command: | - NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count_chaos - - name: Upload results - if: always() - uses: actions/upload-artifact@main - with: - name: chart_test_autoscaling_job_count_chaos - path: ./tests/tests/*.md - if-no-files-found: ignore - - name: Test Selenium Grid on Kubernetes with Autoscaling - uses: nick-invision/retry@master - with: - timeout_minutes: 30 - max_attempts: 3 - command: | - NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count_max_sessions - - name: Upload results - if: always() - uses: actions/upload-artifact@main - with: - name: chart_test_autoscaling_job_count_max_sessions - path: ./tests/tests/*.md - if-no-files-found: ignore - - name: Test Selenium Grid on Kubernetes with Autoscaling - uses: nick-invision/retry@master - with: - timeout_minutes: 30 - max_attempts: 3 - command: | - NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count_strategy_accurate - - name: Upload results 
- if: always() - uses: actions/upload-artifact@main - with: - name: chart_test_autoscaling_job_count_strategy_accurate - path: ./tests/tests/*.md - if-no-files-found: ignore - - name: Test Selenium Grid on Kubernetes with Autoscaling - uses: nick-invision/retry@master - with: - timeout_minutes: 30 - max_attempts: 3 - command: | - NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count + NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make ${{ matrix.test-strategy }} - name: Upload results if: always() uses: actions/upload-artifact@main with: - name: chart_test_autoscaling_job_count + name: ${{ matrix.test-strategy }} path: ./tests/tests/*.md if-no-files-found: ignore - name: Cleanup Kubernetes cluster diff --git a/Makefile b/Makefile index 01443f95f..ba038566c 100644 --- a/Makefile +++ b/Makefile @@ -29,7 +29,7 @@ SBOM_OUTPUT := $(or $(SBOM_OUTPUT),$(SBOM_OUTPUT),package_versions.txt) KEDA_TAG_PREV_VERSION := $(or $(KEDA_TAG_PREV_VERSION),$(KEDA_TAG_PREV_VERSION),2.16.0-selenium-grid) KEDA_TAG_VERSION := $(or $(KEDA_TAG_VERSION),$(KEDA_TAG_VERSION),2.16.0-selenium-grid) KEDA_BASED_NAME := $(or $(KEDA_BASED_NAME),$(KEDA_BASED_NAME),ndviet) -KEDA_BASED_TAG := $(or $(KEDA_BASED_TAG),$(KEDA_BASED_TAG),2.16.0-selenium-grid-20241127) +KEDA_BASED_TAG := $(or $(KEDA_BASED_TAG),$(KEDA_BASED_TAG),2.16.0-selenium-grid-20241128) all: hub \ distributor \ @@ -961,6 +961,36 @@ chart_test_autoscaling_playwright_connect_grid: TEMPLATE_OUTPUT_FILENAME="k8s_playwright_connect_grid_basicAuth_secureIngress_ingressPublicIP_autoScaling_patchKEDA.yaml" \ ./tests/charts/make/chart_test.sh JobAutoscaling +chart_test_autoscaling_job_count_chaos: + MATRIX_TESTS=AutoScalingTestsScaleChaos \ + make chart_test_autoscaling_job_count + +chart_test_autoscaling_job_count_max_sessions: + MAX_SESSIONS_FIREFOX=2 MAX_SESSIONS_EDGE=2 MAX_SESSIONS_CHROME=2 \ + make 
chart_test_autoscaling_job_count + +chart_test_autoscaling_job_count: + MATRIX_TESTS=$(or $(MATRIX_TESTS), "AutoscalingTestsScaleUp") SCALING_STRATEGY=$(or $(SCALING_STRATEGY), "default") \ + PLATFORMS=$(PLATFORMS) RELEASE_NAME=selenium TEST_PATCHED_KEDA=true SELENIUM_GRID_PROTOCOL=http SELENIUM_GRID_HOST=localhost SELENIUM_GRID_PORT=80 \ + SELENIUM_GRID_MONITORING=false CLEAR_POD_HISTORY=true SET_MAX_REPLICAS=100 ENABLE_VIDEO_RECORDER=false \ + VERSION=$(TAG_VERSION) VIDEO_TAG=$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) KEDA_BASED_NAME=$(KEDA_BASED_NAME) KEDA_BASED_TAG=$(KEDA_BASED_TAG) NAMESPACE=$(NAMESPACE) BINDING_VERSION=$(BINDING_VERSION) BASE_VERSION=$(BASE_VERSION) \ + ./tests/charts/make/chart_test.sh JobAutoscaling + +chart_test_autoscaling_deployment_count_chaos: + MATRIX_TESTS=AutoScalingTestsScaleChaos \ + make chart_test_autoscaling_deployment_count + +chart_test_autoscaling_deployment_count_max_sessions: + MAX_SESSIONS_FIREFOX=3 MAX_SESSIONS_EDGE=2 MAX_SESSIONS_CHROME=2 \ + make chart_test_autoscaling_deployment_count + +chart_test_autoscaling_deployment_count: + MATRIX_TESTS=$(or $(MATRIX_TESTS), "AutoscalingTestsScaleUp") \ + PLATFORMS=$(PLATFORMS) RELEASE_NAME=selenium TEST_PATCHED_KEDA=true SELENIUM_GRID_PROTOCOL=http SELENIUM_GRID_HOST=localhost SELENIUM_GRID_PORT=80 \ + SELENIUM_GRID_MONITORING=false CLEAR_POD_HISTORY=true SET_MAX_REPLICAS=100 ENABLE_VIDEO_RECORDER=false \ + VERSION=$(TAG_VERSION) VIDEO_TAG=$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) KEDA_BASED_NAME=$(KEDA_BASED_NAME) KEDA_BASED_TAG=$(KEDA_BASED_TAG) NAMESPACE=$(NAMESPACE) BINDING_VERSION=$(BINDING_VERSION) BASE_VERSION=$(BASE_VERSION) \ + ./tests/charts/make/chart_test.sh DeploymentAutoscaling + chart_test_delete: helm del test -n selenium || true helm del selenium -n selenium || true diff --git a/tests/AutoscalingTests/__init__.py b/tests/AutoscalingTests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/tests/AutoscalingTests/common.py b/tests/AutoscalingTests/common.py new file mode 100644 index 000000000..3a425a375 --- /dev/null +++ b/tests/AutoscalingTests/common.py @@ -0,0 +1,90 @@ +import unittest +import random +import time +import subprocess +import signal +import concurrent.futures +import csv +import os +from selenium import webdriver +from selenium.webdriver.firefox.options import Options as FirefoxOptions +from selenium.webdriver.edge.options import Options as EdgeOptions +from selenium.webdriver.chrome.options import Options as ChromeOptions +from selenium.webdriver.remote.client_config import ClientConfig +from csv2md.table import Table + +BROWSER = { + "chrome": ChromeOptions(), + "firefox": FirefoxOptions(), + "edge": EdgeOptions(), +} + +CLIENT_CONFIG = ClientConfig( + remote_server_addr=f"http://localhost/selenium/wd/hub", + keep_alive=True, + timeout=3600, +) + +FIELD_NAMES = ["Iteration", "New request sessions", "Requests accepted time", "Sessions failed", "New scaled pods", "Total sessions", "Total pods", "Gaps"] + +def get_pod_count(): + result = subprocess.run(["kubectl", "get", "pods", "-A", "--no-headers"], capture_output=True, text=True) + return len([line for line in result.stdout.splitlines() if "selenium-node-" in line and "Running" in line]) + +def create_session(browser_name): + return webdriver.Remote(command_executor=CLIENT_CONFIG.remote_server_addr, options=BROWSER[browser_name], client_config=CLIENT_CONFIG) + +def wait_for_count_matches(sessions, timeout=10, interval=5): + elapsed = 0 + while elapsed < timeout: + pod_count = get_pod_count() + if pod_count == len(sessions): + break + print(f"VALIDATING: Waiting for pods to match sessions... ({elapsed}/{timeout} seconds elapsed)") + time.sleep(interval) + elapsed += interval + if pod_count != len(sessions): + print(f"WARN: Mismatch between pod count and session count after {timeout} seconds. 
Gaps: {pod_count - len(sessions)}") + else: + print(f"PASS: Pod count matches session count after {elapsed} seconds.") + +def close_all_sessions(sessions): + for session in sessions: + session.quit() + sessions.clear() + return sessions + +def create_sessions_in_parallel(new_request_sessions): + failed_jobs = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(create_session, random.choice(list(BROWSER.keys()))) for _ in range(new_request_sessions)] + sessions = [] + for future in concurrent.futures.as_completed(futures): + try: + sessions.append(future.result()) + except Exception as e: + print(f"ERROR: Failed to create session: {e}") + failed_jobs += 1 + print(f"Total failed jobs: {failed_jobs}") + return sessions + +def randomly_quit_sessions(sessions, sublist_size): + if sessions: + sessions_to_quit = random.sample(sessions, min(sublist_size, len(sessions))) + for session in sessions_to_quit: + session.quit() + sessions.remove(session) + print(f"QUIT: {len(sessions_to_quit)} sessions have been randomly quit.") + return sessions + +def export_results_to_csv(output_file, field_names, results): + with open(output_file, mode="w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=field_names) + writer.writeheader() + writer.writerows(results) + +def export_results_csv_to_md(csv_file, md_file): + with open(csv_file) as f: + table = Table.parse_csv(f) + with open(md_file, mode="w") as f: + f.write(table.markdown()) diff --git a/tests/AutoscalingTests/test_scale_chaos.py b/tests/AutoscalingTests/test_scale_chaos.py new file mode 100644 index 000000000..f82dae18b --- /dev/null +++ b/tests/AutoscalingTests/test_scale_chaos.py @@ -0,0 +1,58 @@ +import unittest +import random +import time +import signal +import csv +from csv2md.table import Table +from .common import * + +SESSIONS = [] +RESULTS = [] + +def signal_handler(signum, frame): + print("Signal received, quitting all sessions...") + close_all_sessions(SESSIONS) + 
+signal.signal(signal.SIGTERM, signal_handler) +signal.signal(signal.SIGINT, signal_handler) + +class SeleniumAutoscalingTests(unittest.TestCase): + def test_run_tests(self): + try: + for iteration in range(10): + new_request_sessions = random.randint(2, 15) + start_time = time.time() + start_pods = get_pod_count() + new_sessions = create_sessions_in_parallel(new_request_sessions) + failed_sessions = new_request_sessions - len(new_sessions) + end_time = time.time() + stop_pods = get_pod_count() + SESSIONS.extend(new_sessions) + elapsed_time = end_time - start_time + new_scaled_pods = stop_pods - start_pods + total_sessions = len(SESSIONS) + total_pods = get_pod_count() + RESULTS.append({ + FIELD_NAMES[0]: iteration + 1, + FIELD_NAMES[1]: new_request_sessions, + FIELD_NAMES[2]: f"{elapsed_time:.2f} s", + FIELD_NAMES[3]: failed_sessions, + FIELD_NAMES[4]: new_scaled_pods, + FIELD_NAMES[5]: total_sessions, + FIELD_NAMES[6]: total_pods, + FIELD_NAMES[7]: total_pods - total_sessions, + }) + print(f"ADDING: Created {new_request_sessions} new sessions in {elapsed_time:.2f} seconds.") + print(f"INFO: Total sessions: {total_sessions}") + print(f"INFO: Total pods: {total_pods}") + randomly_quit_sessions(SESSIONS, 10) + time.sleep(15) + finally: + print(f"FINISH: Closing {len(SESSIONS)} sessions.") + close_all_sessions(SESSIONS) + output_file = f"tests/scale_chaos_results_{random.randint(1, 10000)}" + export_results_to_csv(f"{output_file}.csv", FIELD_NAMES, RESULTS) + export_results_csv_to_md(f"{output_file}.csv", f"{output_file}.md") + +if __name__ == "__main__": + unittest.main() diff --git a/tests/AutoscalingTests/test_scale_up.py b/tests/AutoscalingTests/test_scale_up.py new file mode 100644 index 000000000..8faad820a --- /dev/null +++ b/tests/AutoscalingTests/test_scale_up.py @@ -0,0 +1,59 @@ +import unittest +import random +import time +import signal +import csv +from csv2md.table import Table +from .common import * + +SESSIONS = [] +RESULTS = [] + +def 
signal_handler(signum, frame): + print("Signal received, quitting all sessions...") + close_all_sessions(SESSIONS) + +signal.signal(signal.SIGTERM, signal_handler) +signal.signal(signal.SIGINT, signal_handler) + +class SeleniumAutoscalingTests(unittest.TestCase): + def test_run_tests(self): + try: + for iteration in range(10): + new_request_sessions = random.randint(1, 3) + start_time = time.time() + start_pods = get_pod_count() + new_sessions = create_sessions_in_parallel(new_request_sessions) + failed_sessions = new_request_sessions - len(new_sessions) + end_time = time.time() + stop_pods = get_pod_count() + SESSIONS.extend(new_sessions) + elapsed_time = end_time - start_time + new_scaled_pods = stop_pods - start_pods + total_sessions = len(SESSIONS) + total_pods = get_pod_count() + RESULTS.append({ + FIELD_NAMES[0]: iteration + 1, + FIELD_NAMES[1]: new_request_sessions, + FIELD_NAMES[2]: f"{elapsed_time:.2f} s", + FIELD_NAMES[3]: failed_sessions, + FIELD_NAMES[4]: new_scaled_pods, + FIELD_NAMES[5]: total_sessions, + FIELD_NAMES[6]: total_pods, + FIELD_NAMES[7]: total_pods - total_sessions, + }) + print(f"ADDING: Created {new_request_sessions} new sessions in {elapsed_time:.2f} seconds.") + print(f"INFO: Total sessions: {total_sessions}") + print(f"INFO: Total pods: {total_pods}") + if iteration % 4 == 0: + randomly_quit_sessions(SESSIONS, 15) + time.sleep(15) + finally: + print(f"FINISH: Closing {len(SESSIONS)} sessions.") + close_all_sessions(SESSIONS) + output_file = f"tests/scale_up_results_{random.randint(1, 10000)}" + export_results_to_csv(f"{output_file}.csv", FIELD_NAMES, RESULTS) + export_results_csv_to_md(f"{output_file}.csv", f"{output_file}.md") + +if __name__ == "__main__": + unittest.main() diff --git a/tests/bootstrap.sh b/tests/bootstrap.sh index 6b4ec608e..df42805ca 100755 --- a/tests/bootstrap.sh +++ b/tests/bootstrap.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash +set -o xtrace + +MATRIX_TESTS=${MATRIX_TESTS:-"default"} + cd tests || true if [ 
"${CI:-false}" = "false" ]; then @@ -14,10 +18,18 @@ else python3 -m pip install selenium==${BINDING_VERSION} | grep -v 'Requirement already satisfied' fi -python3 -m pip install docker requests chardet | grep -v 'Requirement already satisfied' +python3 -m pip install -r requirements.txt | grep -v 'Requirement already satisfied' -python3 test.py $1 -ret_code=$? +if [ "$1" = "AutoscalingTestsScaleUp" ]; then + python3 -m unittest AutoscalingTests.test_scale_up + ret_code=$? +elif [ "$1" = "AutoScalingTestsScaleChaos" ]; then + python3 -m unittest AutoscalingTests.test_scale_chaos + ret_code=$? +else + python3 test.py $1 + ret_code=$? +fi if [ "${CI:-false}" = "false" ]; then deactivate diff --git a/tests/charts/ci/DeploymentAutoscaling-values.yaml b/tests/charts/ci/DeploymentAutoscaling-values.yaml index bf02551cc..5d4d1e26d 100644 --- a/tests/charts/ci/DeploymentAutoscaling-values.yaml +++ b/tests/charts/ci/DeploymentAutoscaling-values.yaml @@ -5,7 +5,7 @@ autoscaling: maxReplicaCount: 4 pollingInterval: 10 scaledObjectOptions: - cooldownPeriod: 30 + cooldownPeriod: ${AUTOSCALING_COOLDOWN_PERIOD} terminationGracePeriodSeconds: 360 # Configuration for chrome nodes @@ -47,6 +47,8 @@ chromeNode: value: "1080" - name: TZ value: "Asia/Saigon" + - name: SE_NODE_SESSION_TIMEOUT + value: "3600" readinessProbe: enabled: &readinessProbe true livenessProbe: diff --git a/tests/charts/ci/JobAutoscaling-values.yaml b/tests/charts/ci/JobAutoscaling-values.yaml index 118344258..e0e02a8cd 100644 --- a/tests/charts/ci/JobAutoscaling-values.yaml +++ b/tests/charts/ci/JobAutoscaling-values.yaml @@ -22,6 +22,8 @@ chromeNode: value: "1080" - name: TZ value: "Asia/Saigon" + - name: SE_NODE_SESSION_TIMEOUT + value: "3600" readinessProbe: enabled: &readinessProbe false livenessProbe: diff --git a/tests/charts/ci/base-recorder-values.yaml b/tests/charts/ci/base-recorder-values.yaml index 4e75424ba..bc688ee5a 100644 --- a/tests/charts/ci/base-recorder-values.yaml +++ 
b/tests/charts/ci/base-recorder-values.yaml @@ -11,7 +11,7 @@ # AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}" videoRecorder: - enabled: true + enabled: ${ENABLE_VIDEO_RECORDER} extraVolumes: # - name: videos # persistentVolumeClaim: diff --git a/tests/charts/make/chart_test.sh b/tests/charts/make/chart_test.sh index a5e11cd74..e14a9ccbc 100755 --- a/tests/charts/make/chart_test.sh +++ b/tests/charts/make/chart_test.sh @@ -23,6 +23,8 @@ HUB_CHECKS_INTERVAL=${HUB_CHECKS_INTERVAL:-45} HUB_CHECKS_MAX_ATTEMPTS=${HUB_CHECKS_MAX_ATTEMPTS:-6} WEB_DRIVER_WAIT_TIMEOUT=${WEB_DRIVER_WAIT_TIMEOUT:-120} AUTOSCALING_POLL_INTERVAL=${AUTOSCALING_POLL_INTERVAL:-20} +AUTOSCALING_COOLDOWN_PERIOD=${AUTOSCALING_COOLDOWN_PERIOD:-"1800"} +ENABLE_VIDEO_RECORDER=${ENABLE_VIDEO_RECORDER:-"true"} SCALING_STRATEGY=${SCALING_STRATEGY:-"default"} SKIP_CLEANUP=${SKIP_CLEANUP:-"true"} # For debugging purposes, retain the cluster after the test run CHART_CERT_PATH=${CHART_CERT_PATH:-"${CHART_PATH}/certs/tls.crt"} @@ -116,6 +118,8 @@ export SELENIUM_NAMESPACE=${SELENIUM_NAMESPACE} export TEST_PV_CLAIM_NAME=${TEST_PV_CLAIM_NAME} export HOST_PATH=$(realpath ./tests/videos) export SELENIUM_ENABLE_MANAGED_DOWNLOADS=${SELENIUM_ENABLE_MANAGED_DOWNLOADS} +export AUTOSCALING_COOLDOWN_PERIOD=${AUTOSCALING_COOLDOWN_PERIOD} +export ENABLE_VIDEO_RECORDER=${ENABLE_VIDEO_RECORDER} RECORDER_VALUES_FILE=${TEST_VALUES_PATH}/base-recorder-values.yaml MATRIX_BROWSER_VALUES_FILE=${TEST_VALUES_PATH}/${MATRIX_BROWSER}-values.yaml envsubst < ${RECORDER_VALUES_FILE} > ./tests/tests/base-recorder-values.yaml @@ -189,6 +193,13 @@ if [ "${SELENIUM_GRID_AUTOSCALING}" = "true" ] && [ -n "${SET_MAX_REPLICAS}" ]; " fi +if [ "${SELENIUM_GRID_AUTOSCALING}" = "true" ] && [ "${CLEAR_POD_HISTORY}" = "true" ]; then + HELM_COMMAND_SET_IMAGES="${HELM_COMMAND_SET_IMAGES} \ + --set autoscaling.scaledJobOptions.successfulJobsHistoryLimit=0 \ + --set autoscaling.scaledJobOptions.failedJobsHistoryLimit=0 \ + " +fi + if [ 
"${CHART_ENABLE_INGRESS_HOSTNAME}" = "true" ]; then if [[ ! $(cat /etc/hosts) == *"${HOSTNAME_ADDRESS}"* ]]; then sudo -- sh -c -e "echo \"$(hostname -I | cut -d' ' -f1) ${HOSTNAME_ADDRESS}\" >> /etc/hosts" @@ -440,6 +451,10 @@ elif [ "${MATRIX_TESTS}" = "CDPTests" ]; then if [ "${TEST_PLATFORMS}" = "linux/amd64" ]; then ./tests/CDPTests/bootstrap.sh "MicrosoftEdge" fi +elif [ "${MATRIX_TESTS}" = "AutoscalingTestsScaleUp" ]; then + ./tests/bootstrap.sh ${MATRIX_TESTS} +elif [ "${MATRIX_TESTS}" = "AutoScalingTestsScaleChaos" ]; then + ./tests/bootstrap.sh ${MATRIX_TESTS} else ./tests/bootstrap.sh ${MATRIX_BROWSER} fi diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 000000000..bac1b366e --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,4 @@ +docker +requests +chardet +csv2md