From 785194d2655c4836a17f4db0e05c992626f74bae Mon Sep 17 00:00:00 2001 From: Kirill Rysin Date: Thu, 16 Jan 2025 19:07:42 +0300 Subject: [PATCH 1/2] add features and fixed bugs --- ydb/tests/stability/library/__main__.py | 95 ++++++++++++++++++------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/ydb/tests/stability/library/__main__.py b/ydb/tests/stability/library/__main__.py index d17b91d47cab..9b0a43e5fa4c 100644 --- a/ydb/tests/stability/library/__main__.py +++ b/ydb/tests/stability/library/__main__.py @@ -10,7 +10,7 @@ from library.python import resource -logging.getLogger().setLevel(logging.DEBUG) +logging.getLogger().setLevel(logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(sys.stderr)) from ydb.tests.library.harness.kikimr_cluster import ExternalKiKiMRCluster # noqa @@ -60,43 +60,63 @@ def _unpack_resource(self, name): return path_to_unpack def perform_checks(self): + safety_violations = safety_warden_factory(self.kikimr_cluster, self.ssh_username).list_of_safety_violations() liveness_violations = liveness_warden_factory(self.kikimr_cluster, self.ssh_username).list_of_liveness_violations + coredumps_search_results = {} + for node in self.kikimr_cluster.nodes.values(): + result = node.ssh_command('find /coredumps/ -type f | wc -l', raise_on_error=False) + coredumps_search_results[node.host.split(':')[0]] = int(result.decode('utf-8')) - count = 0 - report = [] - - print("SAFETY WARDEN (total: {})".format(len(safety_violations))) + print("SAFETY WARDEN:") for i, violation in enumerate(safety_violations): print("[{}]".format(i)) print(violation) print() - - print("LIVENESS WARDEN (total: {})".format(len(liveness_violations))) + + print("LIVENESS WARDEN:") for i, violation in enumerate(liveness_violations): print("[{}]".format(i)) print(violation) - print() - return count, "\n".join(report) + + print("SAFETY WARDEN (total: {})".format(len(safety_violations))) + print("LIVENESS WARDEN (total: 
{})".format(len(liveness_violations))) + print("COREDUMPS:") + for node in coredumps_search_results: + print(f' {node}: {coredumps_search_results[node]}') def start_nemesis(self): for node in self.kikimr_cluster.nodes.values(): node.ssh_command("sudo service nemesis restart", raise_on_error=True) + def stop_workloads(self): + for node in self.kikimr_cluster.nodes.values(): + node.ssh_command( + 'sudo pkill screen', + raise_on_error=True + ) def stop_nemesis(self): for node in self.kikimr_cluster.nodes.values(): node.ssh_command("sudo service nemesis stop", raise_on_error=False) + def cleanup(self, mode = 'all'): + self.stop_nemesis() + for node in self.kikimr_cluster.nodes.values(): + if mode in ['all', 'dumps']: + node.ssh_command('sudo rm -rf /coredumps/*', raise_on_error=False) + if mode in ['all', 'logs']: + node.ssh_command('sudo rm -rf /Berkanavt/kikimr_31003/logs/*', raise_on_error=False) + node.ssh_command('sudo rm -rf /Berkanavt/kikimr/logs/*', raise_on_error=False) + node.ssh_command('sudo rm -rf /Berkanavt/nemesis/log/*', raise_on_error=False) + if mode == 'all': + node.ssh_command('sudo pkill screen', raise_on_error=False) + node.ssh_command('sudo rm -rf /Berkanavt/kikimr/bin/*', raise_on_error=False) + def deploy_ydb(self): - self._stop_nemesis() + self.cleanup() self.kikimr_cluster.start() - # cleanup nemesis logs - for node in self.kikimr_cluster.nodes.values(): - node.ssh_command('sudo rm -rf /Berkanavt/nemesis/logs/*', raise_on_error=False) - node.ssh_command('sudo pkill screen', raise_on_error=False) - with open(self._unpack_resource("tbl_profile.txt")) as f: self.kikimr_cluster.client.console_request(f.read()) @@ -122,15 +142,17 @@ def deploy_tools(self): for node in self.kikimr_cluster.nodes.values(): node.ssh_command(["sudo", "mkdir", "-p", STRESS_BINARIES_DEPLOY_PATH], raise_on_error=False) for artifact in self.artifacts: - node.copy_file_or_dir( - artifact, - os.path.join( + node_artifact_path = os.path.join( STRESS_BINARIES_DEPLOY_PATH, 
os.path.basename( artifact ) ) + node.copy_file_or_dir( + artifact, + node_artifact_path ) + node.ssh_command(f"sudo chmod 777 {node_artifact_path}", raise_on_error=False) def path_type(path): @@ -170,14 +192,18 @@ def parse_args(): type=str, nargs="+", choices=[ + "cleanup", + "cleanup_logs", + "cleanup_dumps", "deploy_ydb", "deploy_tools", "start_nemesis", "stop_nemesis", + "start_all_workloads", "start_workload_simple_queue_row", "start_workload_simple_queue_column", "start_workload_olap_workload", - "stop_workload", + "stop_workloads", "perform_checks", ], help="actions to execute", @@ -197,8 +223,28 @@ def main(): for action in args.actions: if action == "deploy_ydb": stability_cluster.deploy_ydb() + if action == "cleanup": + stability_cluster.cleanup() + if action == "cleanup_logs": + stability_cluster.cleanup('logs') + if action == "cleanup_dups": + stability_cluster.cleanup('dumps') if action == "deploy_tools": stability_cluster.deploy_tools() + if action == "start_all_workloads": + for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()): + node.ssh_command( + 'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/simple_queue --database /Root/db1 --mode row; done"', + raise_on_error=True + ) + node.ssh_command( + 'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/simple_queue --database /Root/db1 --mode column; done"', + raise_on_error=True + ) + node.ssh_command( + 'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/olap_workload --database /Root/db1; done"', + raise_on_error=True + ) if action == "start_workload_simple_queue_row": for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()): node.ssh_command( @@ -217,12 +263,8 @@ def main(): 'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/olap_workload --database /Root/db1; done"', raise_on_error=True ) - if action == "stop_workload": - for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()): - 
node.ssh_command( - 'sudo pkill screen', - raise_on_error=True - ) + if action == "stop_workloads": + stability_cluster.stop_workloads() if action == "stop_nemesis": stability_cluster.stop_nemesis() @@ -231,8 +273,7 @@ def main(): stability_cluster.start_nemesis() if action == "perform_checks": - count, report = stability_cluster.perform_checks() - print(report) + stability_cluster.perform_checks() if __name__ == "__main__": From 9c8e6cc9b445cd84c9900226f7ee2765670d1c10 Mon Sep 17 00:00:00 2001 From: Kirill Rysin Date: Fri, 17 Jan 2025 17:33:07 +0300 Subject: [PATCH 2/2] howto stability test --- ydb/tests/stability/library/howto.md | 114 +++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 ydb/tests/stability/library/howto.md diff --git a/ydb/tests/stability/library/howto.md b/ydb/tests/stability/library/howto.md new file mode 100644 index 000000000000..3772b3b48471 --- /dev/null +++ b/ydb/tests/stability/library/howto.md @@ -0,0 +1,114 @@ +# How to test stability +1) build ydbd (not stripped) + ``` + ./ya make --build=profile -DCFLAGS=-fno-omit-frame-pointer --thinlto ydb/apps/ydbd + ``` +2) build library + ``` + ./ya make /ydb/tests/stability/library + ``` +3) deploy ydb to test specific build version + ``` + cd /ydb/tests/stability/library; ./library deploy_ydb --cluster_path=cluster.yaml --ydbd_path=ydb/apps/ydbd/ydbd + ``` +4) deploy tools + ``` + ./library deploy_tools --cluster_path=cluster.yaml --ydbd_path=ydb/apps/ydbd/ydbd + ``` +5) start workload: + - `start_all_workloads` - starts all the workloads listed below + - `start_workload_simple_queue_row` - create + - `start_workload_simple_queue_column` + - `start_workload_olap_workload` + + ``` + ./library start_all_workloads --cluster_path=cluster.yaml --ydbd_path=ydb/apps/ydbd/ydbd + ``` + to stop the workloads, use the command `stop_workloads` - it stops all workloads + + to check whether a workload is running on a node host + - workload simple_queue row - ``ps -aux | grep "/Berkanavt/nemesis/bin/simple" | 
grep row | grep -v grep + `` + - workload simple_queue column - ``ps -aux | grep "/Berkanavt/nemesis/bin/simple" | grep column | grep -v grep + `` + - workload olap_workload - ``ps -aux | grep "/Berkanavt/nemesis/bin/olap_workload" | grep -v grep + `` + +6) start nemesis: + ``` + ./library start_nemesis --cluster_path=cluster.yaml --ydbd_path=ydb/apps/ydbd/ydbd + ``` + to stop, use the command `stop_nemesis` + + +7) Check states + 1) yq to get all node hosts + ``` + sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + ``` + ``yq e '.hosts[].name' cluster.yaml > hosts.txt`` + 2) Get status of nemesis and workloads (ad-hoc) + ``` + parallel-ssh -h hosts.txt -i ' + if systemctl is-active --quiet nemesis; then + echo "nemesis: Active" + else + echo "nemesis: Down" + fi + if ps aux | grep "/Berkanavt/nemesis/bin/olap_workload" | grep -v grep > /dev/null; then + echo "olap_workload: Running" + else + echo "olap_workload: Stopped" + fi + if ps aux | grep "/Berkanavt/nemesis/bin/simple" | grep column | grep -v grep > /dev/null; then + echo "simple_queue_column: Running" + else + echo "simple_queue_column: Stopped" + fi + if ps aux | grep "/Berkanavt/nemesis/bin/simple" | grep row | grep -v grep > /dev/null; then + echo "simple_queue_row: Running" + else + echo "simple_queue_row: Stopped" + fi + ' + ``` +8) check cluster stability + 1) ``perform_checks`` - returns a summary of errors and coredumps for the cluster: + + ``` + SAFETY WARDEN (total: 8) + LIVENESS WARDEN (total: 0) + COREDUMPS: + ydb-sas-testing-0000.search.yandex.net: 1 + ydb-sas-testing-0001.search.yandex.net: 0 + ydb-sas-testing-0002.search.yandex.net: 1 + ydb-sas-testing-0003.search.yandex.net: 1 + ydb-sas-testing-0004.search.yandex.net: 0 + ydb-sas-testing-0005.search.yandex.net: 1 + ydb-sas-testing-0006.search.yandex.net: 2 + ydb-sas-testing-0007.search.yandex.net: 0 + ``` + to run: + ``` + ./library perform_checks 
--cluster_path=cluster.yaml --ydbd_path=ydb/apps/ydbd/ydbd + ``` + 2) get cluster traces (ad-hoc) + ``` + '' > combined_traces.txt; parallel-ssh -h hosts.txt -i " + zgrep -E 'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception' /Berkanavt/kikimr_31003/logs/kikimr.start.* -A 30 | + awk ' + { + split(\$0, parts, \":\") + curr_file = parts[1] + + if (curr_file != prev_file) { + if (prev_file != \"\") + print \"\n\n\n---\n\n\n\" + prev_file = curr_file + } + print + }' | sed '/--/a\\n\n' + " >> combined_traces.txt + ``` +9) create an issue on GitHub about the new traces \ No newline at end of file