Skip to content

Commit

Permalink
Add test for autoscaling count
Browse files Browse the repository at this point in the history
[skip ci]

Signed-off-by: Viet Nguyen Duc <[email protected]>
  • Loading branch information
VietND96 committed Dec 1, 2024
1 parent 31fba3c commit d1ac580
Show file tree
Hide file tree
Showing 12 changed files with 293 additions and 50 deletions.
59 changes: 15 additions & 44 deletions .github/workflows/k8s-scaling-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,19 @@ jobs:
matrix:
include:
- k8s-version: 'v1.31.2'
test-strategy: chart_test_autoscaling_job_count_chaos
cluster: 'minikube'
helm-version: 'v3.16.3'
docker-version: '27.3.1'
python-version: '3.13'
- k8s-version: 'v1.31.2'
test-strategy: chart_test_autoscaling_job_count_max_sessions
cluster: 'minikube'
helm-version: 'v3.16.3'
docker-version: '27.3.1'
python-version: '3.13'
- k8s-version: 'v1.31.2'
test-strategy: chart_test_autoscaling_job_count
cluster: 'minikube'
helm-version: 'v3.16.3'
docker-version: '27.3.1'
Expand Down Expand Up @@ -103,54 +116,12 @@ jobs:
timeout_minutes: 30
max_attempts: 3
command: |
NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count_chaos
- name: Upload results
if: always()
uses: actions/upload-artifact@main
with:
name: chart_test_autoscaling_job_count_chaos
path: ./tests/tests/*.md
if-no-files-found: ignore
- name: Test Selenium Grid on Kubernetes with Autoscaling
uses: nick-invision/retry@master
with:
timeout_minutes: 30
max_attempts: 3
command: |
NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count_max_sessions
- name: Upload results
if: always()
uses: actions/upload-artifact@main
with:
name: chart_test_autoscaling_job_count_max_sessions
path: ./tests/tests/*.md
if-no-files-found: ignore
- name: Test Selenium Grid on Kubernetes with Autoscaling
uses: nick-invision/retry@master
with:
timeout_minutes: 30
max_attempts: 3
command: |
NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count_strategy_accurate
- name: Upload results
if: always()
uses: actions/upload-artifact@main
with:
name: chart_test_autoscaling_job_count_strategy_accurate
path: ./tests/tests/*.md
if-no-files-found: ignore
- name: Test Selenium Grid on Kubernetes with Autoscaling
uses: nick-invision/retry@master
with:
timeout_minutes: 30
max_attempts: 3
command: |
NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make chart_test_autoscaling_job_count
NAME=${IMAGE_REGISTRY} VERSION=${BRANCH} BUILD_DATE=${BUILD_DATE} TEST_UPGRADE_CHART=false make ${{ matrix.test-strategy }}
- name: Upload results
if: always()
uses: actions/upload-artifact@main
with:
name: chart_test_autoscaling_job_count
name: ${{ matrix.test-strategy }}
path: ./tests/tests/*.md
if-no-files-found: ignore
- name: Cleanup Kubernetes cluster
Expand Down
32 changes: 31 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ SBOM_OUTPUT := $(or $(SBOM_OUTPUT),$(SBOM_OUTPUT),package_versions.txt)
KEDA_TAG_PREV_VERSION := $(or $(KEDA_TAG_PREV_VERSION),$(KEDA_TAG_PREV_VERSION),2.16.0-selenium-grid)
KEDA_TAG_VERSION := $(or $(KEDA_TAG_VERSION),$(KEDA_TAG_VERSION),2.16.0-selenium-grid)
KEDA_BASED_NAME := $(or $(KEDA_BASED_NAME),$(KEDA_BASED_NAME),ndviet)
KEDA_BASED_TAG := $(or $(KEDA_BASED_TAG),$(KEDA_BASED_TAG),2.16.0-selenium-grid-20241127)
KEDA_BASED_TAG := $(or $(KEDA_BASED_TAG),$(KEDA_BASED_TAG),2.16.0-selenium-grid-20241128)

all: hub \
distributor \
Expand Down Expand Up @@ -961,6 +961,36 @@ chart_test_autoscaling_playwright_connect_grid:
TEMPLATE_OUTPUT_FILENAME="k8s_playwright_connect_grid_basicAuth_secureIngress_ingressPublicIP_autoScaling_patchKEDA.yaml" \
./tests/charts/make/chart_test.sh JobAutoscaling

# Job-based autoscaling count test, chaos flavour: the test suite randomly
# creates and quits sessions to exercise scale-up AND scale-down paths.
# Use $(MAKE) (not `make`) so -j/-n and the jobserver propagate correctly.
chart_test_autoscaling_job_count_chaos:
	MATRIX_TESTS=AutoScalingTestsScaleChaos \
	$(MAKE) chart_test_autoscaling_job_count

# Job-based autoscaling count test with multiple concurrent sessions per
# browser node (2 each for Chrome/Firefox/Edge).
# Use $(MAKE) (not `make`) so -j/-n and the jobserver propagate correctly.
chart_test_autoscaling_job_count_max_sessions:
	MAX_SESSIONS_FIREFOX=2 MAX_SESSIONS_EDGE=2 MAX_SESSIONS_CHROME=2 \
	$(MAKE) chart_test_autoscaling_job_count

# Core job-based autoscaling count test. MATRIX_TESTS selects the Python test
# module run by tests/bootstrap.sh; SCALING_STRATEGY is forwarded to the chart.
# Defaults follow the file's $(or VAR,default) convention (no quotes/spaces —
# the shell would strip the quotes anyway, so the effective value is the same).
# Fixed: SELENIUM_GRID_PORT=80 was set twice on consecutive lines.
chart_test_autoscaling_job_count:
	MATRIX_TESTS=$(or $(MATRIX_TESTS),AutoscalingTestsScaleUp) SCALING_STRATEGY=$(or $(SCALING_STRATEGY),default) \
	PLATFORMS=$(PLATFORMS) RELEASE_NAME=selenium TEST_PATCHED_KEDA=true SELENIUM_GRID_PROTOCOL=http SELENIUM_GRID_HOST=localhost SELENIUM_GRID_PORT=80 \
	SELENIUM_GRID_MONITORING=false CLEAR_POD_HISTORY=true SET_MAX_REPLICAS=100 ENABLE_VIDEO_RECORDER=false \
	VERSION=$(TAG_VERSION) VIDEO_TAG=$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) KEDA_BASED_NAME=$(KEDA_BASED_NAME) KEDA_BASED_TAG=$(KEDA_BASED_TAG) NAMESPACE=$(NAMESPACE) BINDING_VERSION=$(BINDING_VERSION) BASE_VERSION=$(BASE_VERSION) \
	./tests/charts/make/chart_test.sh JobAutoscaling

# Deployment-based autoscaling count test, chaos flavour (random session churn).
# Use $(MAKE) (not `make`) so -j/-n and the jobserver propagate correctly.
chart_test_autoscaling_deployment_count_chaos:
	MATRIX_TESTS=AutoScalingTestsScaleChaos \
	$(MAKE) chart_test_autoscaling_deployment_count

# Deployment-based autoscaling count test with multiple sessions per node
# (3 Firefox, 2 Edge, 2 Chrome).
# Use $(MAKE) (not `make`) so -j/-n and the jobserver propagate correctly.
chart_test_autoscaling_deployment_count_max_sessions:
	MAX_SESSIONS_FIREFOX=3 MAX_SESSIONS_EDGE=2 MAX_SESSIONS_CHROME=2 \
	$(MAKE) chart_test_autoscaling_deployment_count

# Core deployment-based autoscaling count test (DeploymentAutoscaling chart
# values). MATRIX_TESTS selects the Python test module run by bootstrap.sh.
# Fixed: SELENIUM_GRID_PORT=80 was set twice on consecutive lines.
chart_test_autoscaling_deployment_count:
	MATRIX_TESTS=$(or $(MATRIX_TESTS),AutoscalingTestsScaleUp) \
	PLATFORMS=$(PLATFORMS) RELEASE_NAME=selenium TEST_PATCHED_KEDA=true SELENIUM_GRID_PROTOCOL=http SELENIUM_GRID_HOST=localhost SELENIUM_GRID_PORT=80 \
	SELENIUM_GRID_MONITORING=false CLEAR_POD_HISTORY=true SET_MAX_REPLICAS=100 ENABLE_VIDEO_RECORDER=false \
	VERSION=$(TAG_VERSION) VIDEO_TAG=$(FFMPEG_TAG_VERSION)-$(BUILD_DATE) KEDA_BASED_NAME=$(KEDA_BASED_NAME) KEDA_BASED_TAG=$(KEDA_BASED_TAG) NAMESPACE=$(NAMESPACE) BINDING_VERSION=$(BINDING_VERSION) BASE_VERSION=$(BASE_VERSION) \
	./tests/charts/make/chart_test.sh DeploymentAutoscaling

chart_test_delete:
helm del test -n selenium || true
helm del selenium -n selenium || true
Expand Down
Empty file.
90 changes: 90 additions & 0 deletions tests/AutoscalingTests/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import unittest
import random
import time
import subprocess
import signal
import concurrent.futures
import csv
import os
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.remote.client_config import ClientConfig
from csv2md.table import Table

# Browser option templates, keyed by the name used when requesting a session.
BROWSER = {
    "chrome": ChromeOptions(),
    "firefox": FirefoxOptions(),
    "edge": EdgeOptions(),
}

# Shared Remote WebDriver client configuration. The grid is reached through a
# local ingress path; the long timeout accommodates slow pod scale-up.
# (Fix: dropped the needless `f` prefix — the literal has no placeholders.)
CLIENT_CONFIG = ClientConfig(
    remote_server_addr="http://localhost/selenium/wd/hub",
    keep_alive=True,
    timeout=3600,
)

# Column headers for the per-iteration scaling report (CSV, then Markdown).
FIELD_NAMES = ["Iteration", "New request sessions", "Requests accepted time", "Sessions failed", "New scaled pods", "Total sessions", "Total pods", "Gaps"]

def get_pod_count():
    """Return the number of Selenium node pods in ``Running`` state.

    Shells out to ``kubectl get pods -A --no-headers`` and counts lines that
    mention both ``selenium-node-`` and ``Running``.
    """
    listing = subprocess.run(
        ["kubectl", "get", "pods", "-A", "--no-headers"],
        capture_output=True,
        text=True,
    )
    running = 0
    for row in listing.stdout.splitlines():
        if "selenium-node-" in row and "Running" in row:
            running += 1
    return running

def create_session(browser_name):
    """Open a Remote WebDriver session for ``browser_name``.

    ``browser_name`` must be a key of ``BROWSER`` ("chrome", "firefox", "edge").
    """
    options = BROWSER[browser_name]
    return webdriver.Remote(
        command_executor=CLIENT_CONFIG.remote_server_addr,
        options=options,
        client_config=CLIENT_CONFIG,
    )

def wait_for_count_matches(sessions, timeout=10, interval=5):
    """Poll until the Running node-pod count equals ``len(sessions)``.

    Checks roughly every ``interval`` seconds for up to ``timeout`` seconds,
    then prints a PASS or WARN summary (it never raises on mismatch).

    Fix: ``pod_count`` is now initialized before the loop, so a non-positive
    ``timeout`` no longer raises UnboundLocalError in the final comparison.
    """
    elapsed = 0
    pod_count = get_pod_count()
    while pod_count != len(sessions) and elapsed < timeout:
        print(f"VALIDATING: Waiting for pods to match sessions... ({elapsed}/{timeout} seconds elapsed)")
        time.sleep(interval)
        elapsed += interval
        pod_count = get_pod_count()
    if pod_count != len(sessions):
        print(f"WARN: Mismatch between pod count and session count after {timeout} seconds. Gaps: {pod_count - len(sessions)}")
    else:
        print(f"PASS: Pod count matches session count after {elapsed} seconds.")

def close_all_sessions(sessions):
    """Quit every session in ``sessions`` and empty the list in place.

    Each ``quit()`` is individually guarded: teardown must be best-effort,
    and one dead/unreachable session should not leave the rest open
    (the original aborted on the first raising ``quit()``).

    Returns the now-empty list for caller convenience.
    """
    for session in sessions:
        try:
            session.quit()
        except Exception as e:
            print(f"WARN: Failed to quit session: {e}")
    sessions.clear()
    return sessions

def create_sessions_in_parallel(new_request_sessions):
    """Request ``new_request_sessions`` WebDriver sessions concurrently.

    Each session uses a randomly chosen browser from ``BROWSER``. Failures are
    counted and logged rather than raised; only successfully created sessions
    are returned.
    """
    failed_jobs = 0
    sessions = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        pending = [
            executor.submit(create_session, random.choice(list(BROWSER.keys())))
            for _ in range(new_request_sessions)
        ]
        for done in concurrent.futures.as_completed(pending):
            try:
                sessions.append(done.result())
            except Exception as e:
                print(f"ERROR: Failed to create session: {e}")
                failed_jobs += 1
        print(f"Total failed jobs: {failed_jobs}")
    return sessions

def randomly_quit_sessions(sessions, sublist_size):
    """Quit up to ``sublist_size`` randomly chosen sessions from ``sessions``.

    Quit sessions are removed from the list in place; the (mutated) list is
    returned. An empty list is returned unchanged without logging.
    """
    if not sessions:
        return sessions
    victims = random.sample(sessions, min(sublist_size, len(sessions)))
    for victim in victims:
        victim.quit()
        sessions.remove(victim)
    print(f"QUIT: {len(victims)} sessions have been randomly quit.")
    return sessions

def export_results_to_csv(output_file, field_names, results):
    """Write ``results`` (a list of dicts keyed by ``field_names``) to CSV.

    The file is opened with ``newline=""`` as required by the ``csv`` module;
    without it, platforms that translate newlines produce blank rows between
    records.
    """
    with open(output_file, mode="w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(results)

def export_results_csv_to_md(csv_file, md_file):
    """Render the CSV report at ``csv_file`` as a Markdown table in ``md_file``."""
    with open(csv_file) as source:
        parsed = Table.parse_csv(source)
    with open(md_file, mode="w") as sink:
        sink.write(parsed.markdown())
58 changes: 58 additions & 0 deletions tests/AutoscalingTests/test_scale_chaos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import unittest
import random
import time
import signal
import csv
from csv2md.table import Table
from .common import *

SESSIONS = []  # WebDriver sessions currently open across iterations
RESULTS = []   # one metrics row (dict keyed by FIELD_NAMES) per iteration

def signal_handler(signum, frame):
    """Best-effort cleanup: quit every open session when the run is killed."""
    print("Signal received, quitting all sessions...")
    close_all_sessions(SESSIONS)

# Release grid sessions on termination or Ctrl-C so pods can scale back down.
for _sig in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_sig, signal_handler)

class SeleniumAutoscalingTests(unittest.TestCase):
    """Chaos-style autoscaling test.

    Each of 10 iterations requests a random burst of sessions (2-15), records
    how the Running node-pod count reacts, then randomly quits up to 10
    sessions to also exercise scale-down. Metrics are exported to CSV and
    Markdown in a ``finally`` block so partial results survive failures.
    """

    def test_run_tests(self):
        try:
            for iteration in range(10):
                new_request_sessions = random.randint(2, 15)
                start_time = time.time()
                start_pods = get_pod_count()
                new_sessions = create_sessions_in_parallel(new_request_sessions)
                # Sessions the grid failed to start within this burst.
                failed_sessions = new_request_sessions - len(new_sessions)
                end_time = time.time()
                stop_pods = get_pod_count()
                SESSIONS.extend(new_sessions)
                elapsed_time = end_time - start_time
                new_scaled_pods = stop_pods - start_pods
                total_sessions = len(SESSIONS)
                total_pods = get_pod_count()
                RESULTS.append({
                    FIELD_NAMES[0]: iteration + 1,
                    FIELD_NAMES[1]: new_request_sessions,
                    FIELD_NAMES[2]: f"{elapsed_time:.2f} s",
                    FIELD_NAMES[3]: failed_sessions,
                    FIELD_NAMES[4]: new_scaled_pods,
                    FIELD_NAMES[5]: total_sessions,
                    FIELD_NAMES[6]: total_pods,
                    FIELD_NAMES[7]: total_pods - total_sessions,
                })
                print(f"ADDING: Created {new_request_sessions} new sessions in {elapsed_time:.2f} seconds.")
                print(f"INFO: Total sessions: {total_sessions}")
                print(f"INFO: Total pods: {total_pods}")
                # Chaos step: drop up to 10 random sessions every iteration.
                randomly_quit_sessions(SESSIONS, 10)
                time.sleep(15)
        finally:
            print(f"FINISH: Closing {len(SESSIONS)} sessions.")
            close_all_sessions(SESSIONS)
            # Fix: label the artifacts for THIS scenario; the original wrote
            # "scale_up_results_*", indistinguishable from test_scale_up's output.
            output_file = f"tests/scale_chaos_results_{random.randint(1, 10000)}"
            export_results_to_csv(f"{output_file}.csv", FIELD_NAMES, RESULTS)
            export_results_csv_to_md(f"{output_file}.csv", f"{output_file}.md")

if __name__ == "__main__":
    unittest.main()
59 changes: 59 additions & 0 deletions tests/AutoscalingTests/test_scale_up.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import unittest
import random
import time
import signal
import csv
from csv2md.table import Table
from .common import *

SESSIONS = []  # WebDriver sessions currently open across iterations
RESULTS = []   # one metrics row (dict keyed by FIELD_NAMES) per iteration

def signal_handler(signum, frame):
    """Best-effort cleanup: quit every open session when the run is killed."""
    print("Signal received, quitting all sessions...")
    close_all_sessions(SESSIONS)

# Release grid sessions on termination or Ctrl-C so pods can scale back down.
for _sig in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_sig, signal_handler)

class SeleniumAutoscalingTests(unittest.TestCase):
    """Scale-up autoscaling test.

    Each of 10 iterations requests a small random burst of sessions (1-3) and
    records how the Running node-pod count reacts; every 4th iteration quits a
    random batch of sessions. Metrics are exported to CSV and Markdown in a
    ``finally`` block so partial results survive failures.
    """

    def test_run_tests(self):
        try:
            for iteration in range(10):
                requested = random.randint(1, 3)
                started_at = time.time()
                pods_before = get_pod_count()
                created = create_sessions_in_parallel(requested)
                # Sessions the grid failed to start within this burst.
                not_created = requested - len(created)
                finished_at = time.time()
                pods_after = get_pod_count()
                SESSIONS.extend(created)
                duration = finished_at - started_at
                scaled = pods_after - pods_before
                session_total = len(SESSIONS)
                pod_total = get_pod_count()
                RESULTS.append(dict(zip(FIELD_NAMES, (
                    iteration + 1,
                    requested,
                    f"{duration:.2f} s",
                    not_created,
                    scaled,
                    session_total,
                    pod_total,
                    pod_total - session_total,
                ))))
                print(f"ADDING: Created {requested} new sessions in {duration:.2f} seconds.")
                print(f"INFO: Total sessions: {session_total}")
                print(f"INFO: Total pods: {pod_total}")
                # Periodically shed sessions so scale-down is also observed.
                if iteration % 4 == 0:
                    randomly_quit_sessions(SESSIONS, 15)
                time.sleep(15)
        finally:
            print(f"FINISH: Closing {len(SESSIONS)} sessions.")
            close_all_sessions(SESSIONS)
            report_base = f"tests/scale_up_results_{random.randint(1, 10000)}"
            export_results_to_csv(f"{report_base}.csv", FIELD_NAMES, RESULTS)
            export_results_csv_to_md(f"{report_base}.csv", f"{report_base}.md")

if __name__ == "__main__":
    unittest.main()
18 changes: 15 additions & 3 deletions tests/bootstrap.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
#!/usr/bin/env bash
set -o xtrace

MATRIX_TESTS=${MATRIX_TESTS:-"default"}

cd tests || true

if [ "${CI:-false}" = "false" ]; then
Expand All @@ -14,10 +18,18 @@ else
python3 -m pip install selenium==${BINDING_VERSION} | grep -v 'Requirement already satisfied'
fi

python3 -m pip install docker requests chardet | grep -v 'Requirement already satisfied'
python3 -m pip install -r requirements.txt | grep -v 'Requirement already satisfied'

python3 test.py $1
ret_code=$?
if [ "$1" = "AutoscalingTestsScaleUp" ]; then
python3 -m unittest AutoscalingTests.test_scale_up
ret_code=$?
elif [ "$1" = "AutoScalingTestsScaleChaos" ]; then
python3 -m unittest AutoscalingTests.test_scale_chaos
ret_code=$?
else
python3 test.py $1
ret_code=$?
fi

if [ "${CI:-false}" = "false" ]; then
deactivate
Expand Down
4 changes: 3 additions & 1 deletion tests/charts/ci/DeploymentAutoscaling-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ autoscaling:
maxReplicaCount: 4
pollingInterval: 10
scaledObjectOptions:
cooldownPeriod: 30
cooldownPeriod: ${AUTOSCALING_COOLDOWN_PERIOD}
terminationGracePeriodSeconds: 360

# Configuration for chrome nodes
Expand Down Expand Up @@ -47,6 +47,8 @@ chromeNode:
value: "1080"
- name: TZ
value: "Asia/Saigon"
- name: SE_NODE_SESSION_TIMEOUT
value: "3600"
readinessProbe:
enabled: &readinessProbe true
livenessProbe:
Expand Down
2 changes: 2 additions & 0 deletions tests/charts/ci/JobAutoscaling-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ chromeNode:
value: "1080"
- name: TZ
value: "Asia/Saigon"
- name: SE_NODE_SESSION_TIMEOUT
value: "3600"
readinessProbe:
enabled: &readinessProbe false
livenessProbe:
Expand Down
2 changes: 1 addition & 1 deletion tests/charts/ci/base-recorder-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}"

videoRecorder:
enabled: true
enabled: ${ENABLE_VIDEO_RECORDER}
extraVolumes:
# - name: videos
# persistentVolumeClaim:
Expand Down
Loading

0 comments on commit d1ac580

Please sign in to comment.