Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stability test library: add features and fixed bugs #13452

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 68 additions & 27 deletions ydb/tests/stability/library/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from library.python import resource

logging.getLogger().setLevel(logging.DEBUG)
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(sys.stderr))

from ydb.tests.library.harness.kikimr_cluster import ExternalKiKiMRCluster # noqa
Expand Down Expand Up @@ -60,43 +60,63 @@ def _unpack_resource(self, name):
return path_to_unpack

def perform_checks(self):

safety_violations = safety_warden_factory(self.kikimr_cluster, self.ssh_username).list_of_safety_violations()
liveness_violations = liveness_warden_factory(self.kikimr_cluster, self.ssh_username).list_of_liveness_violations
coredumps_search_results = {}
for node in self.kikimr_cluster.nodes.values():
result = node.ssh_command('find /coredumps/ -type f | wc -l', raise_on_error=False)
coredumps_search_results[node.host.split(':')[0]] = int(result.decode('utf-8'))

count = 0
report = []

print("SAFETY WARDEN (total: {})".format(len(safety_violations)))
print("SAFETY WARDEN:")
for i, violation in enumerate(safety_violations):
print("[{}]".format(i))
print(violation)
print()

print("LIVENESS WARDEN (total: {})".format(len(liveness_violations)))
print("LIVENESS WARDEN:")
for i, violation in enumerate(liveness_violations):
print("[{}]".format(i))
print(violation)

print()
return count, "\n".join(report)

print("SAFETY WARDEN (total: {})".format(len(safety_violations)))
print("LIVENESS WARDEN (total: {})".format(len(liveness_violations)))
print("COREDUMPS:")
for node in coredumps_search_results:
print(f' {node}: {coredumps_search_results[node]}')

def start_nemesis(self):
    """Restart the nemesis service on every cluster node.

    Raises on the first node where the restart command fails.
    """
    restart_cmd = "sudo service nemesis restart"
    for cluster_node in self.kikimr_cluster.nodes.values():
        cluster_node.ssh_command(restart_cmd, raise_on_error=True)

def stop_workloads(self):
    """Kill all workload `screen` sessions on every cluster node.

    Raises on the first node where the kill command fails.
    """
    kill_cmd = 'sudo pkill screen'
    for cluster_node in self.kikimr_cluster.nodes.values():
        cluster_node.ssh_command(kill_cmd, raise_on_error=True)
def stop_nemesis(self):
    """Stop the nemesis service on every cluster node (best effort).

    Failures are ignored so a node without a running nemesis does not
    abort the operation.
    """
    stop_cmd = "sudo service nemesis stop"
    for cluster_node in self.kikimr_cluster.nodes.values():
        cluster_node.ssh_command(stop_cmd, raise_on_error=False)

def cleanup(self, mode='all'):
    """Stop nemesis and wipe state on every cluster node.

    Args:
        mode: What to remove, best effort (errors ignored):
            'all'   - coredumps, logs, workload screens and kikimr binaries;
            'dumps' - coredumps only;
            'logs'  - log directories only.
            Any other value only stops nemesis.
    """
    self.stop_nemesis()
    for node in self.kikimr_cluster.nodes.values():
        if mode in ['all', 'dumps']:
            node.ssh_command('sudo rm -rf /coredumps/*', raise_on_error=False)
        if mode in ['all', 'logs']:
            node.ssh_command('sudo rm -rf /Berkanavt/kikimr_31003/logs/*', raise_on_error=False)
            node.ssh_command('sudo rm -rf /Berkanavt/kikimr/logs/*', raise_on_error=False)
            # NOTE(review): deploy_ydb cleans /Berkanavt/nemesis/logs/* (plural
            # "logs") while this uses /Berkanavt/nemesis/log/* — confirm which
            # path actually exists on the nodes.
            node.ssh_command('sudo rm -rf /Berkanavt/nemesis/log/*', raise_on_error=False)
        if mode == 'all':
            node.ssh_command('sudo pkill screen', raise_on_error=False)
            node.ssh_command('sudo rm -rf /Berkanavt/kikimr/bin/*', raise_on_error=False)

def deploy_ydb(self):
        self.stop_nemesis()
self.cleanup()
self.kikimr_cluster.start()

# cleanup nemesis logs
for node in self.kikimr_cluster.nodes.values():
node.ssh_command('sudo rm -rf /Berkanavt/nemesis/logs/*', raise_on_error=False)
node.ssh_command('sudo pkill screen', raise_on_error=False)

with open(self._unpack_resource("tbl_profile.txt")) as f:
self.kikimr_cluster.client.console_request(f.read())

Expand All @@ -122,15 +142,17 @@ def deploy_tools(self):
for node in self.kikimr_cluster.nodes.values():
node.ssh_command(["sudo", "mkdir", "-p", STRESS_BINARIES_DEPLOY_PATH], raise_on_error=False)
for artifact in self.artifacts:
node.copy_file_or_dir(
artifact,
os.path.join(
node_artifact_path = os.path.join(
STRESS_BINARIES_DEPLOY_PATH,
os.path.basename(
artifact
)
)
node.copy_file_or_dir(
artifact,
node_artifact_path
)
node.ssh_command(f"sudo chmod 777 {node_artifact_path}", raise_on_error=False)


def path_type(path):
Expand Down Expand Up @@ -170,14 +192,18 @@ def parse_args():
type=str,
nargs="+",
choices=[
"cleanup",
"cleanup_logs",
"cleanup_dumps",
"deploy_ydb",
"deploy_tools",
"start_nemesis",
"stop_nemesis",
"start_all_workloads",
"start_workload_simple_queue_row",
"start_workload_simple_queue_column",
"start_workload_olap_workload",
"stop_workload",
"stop_workloads",
"perform_checks",
],
help="actions to execute",
Expand All @@ -197,8 +223,28 @@ def main():
for action in args.actions:
if action == "deploy_ydb":
stability_cluster.deploy_ydb()
if action == "cleanup":
stability_cluster.cleanup()
if action == "cleanup_logs":
stability_cluster.cleanup('logs')
        if action == "cleanup_dumps":
stability_cluster.cleanup('dumps')
if action == "deploy_tools":
stability_cluster.deploy_tools()
if action == "start_all_workloads":
for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()):
node.ssh_command(
'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/simple_queue --database /Root/db1 --mode row; done"',
raise_on_error=True
)
node.ssh_command(
'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/simple_queue --database /Root/db1 --mode column; done"',
raise_on_error=True
)
node.ssh_command(
'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/olap_workload --database /Root/db1; done"',
raise_on_error=True
)
if action == "start_workload_simple_queue_row":
for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()):
node.ssh_command(
Expand All @@ -217,12 +263,8 @@ def main():
'screen -d -m bash -c "while true; do /Berkanavt/nemesis/bin/olap_workload --database /Root/db1; done"',
raise_on_error=True
)
if action == "stop_workload":
for node_id, node in enumerate(stability_cluster.kikimr_cluster.nodes.values()):
node.ssh_command(
'sudo pkill screen',
raise_on_error=True
)
if action == "stop_workloads":
stability_cluster.stop_workloads()

if action == "stop_nemesis":
stability_cluster.stop_nemesis()
Expand All @@ -231,8 +273,7 @@ def main():
stability_cluster.start_nemesis()

if action == "perform_checks":
count, report = stability_cluster.perform_checks()
print(report)
stability_cluster.perform_checks()


if __name__ == "__main__":
Expand Down
114 changes: 114 additions & 0 deletions ydb/tests/stability/library/howto.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# How to test stability
1) build ydbd (not stripped)
```
./ya make --build=profile -DCFLAGS=-fno-omit-frame-pointer --thinlto ydb/apps/ydbd
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А откуда это взято? Я бы просто ya make -r ydb/apps/ydbd предложил собирать

```
2) build library
```
./ya make /ydb/tests/stability/library
Copy link
Collaborator

@maximyurchuk maximyurchuk Jan 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тоже предложил бы собирать как ya make -r

```
3) deploy ydb to test specific build version
```
cd /ydb/tests/stability/library; ./library deploy_ydb --cluster_path=<path/to/>cluster.yaml --ydbd_path=<path/to/>ydb/apps/ydbd/ydbd
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/ydb/tests/stability/library -- первый слеш лишний

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

<path/to/>ydb/apps/ydbd/ydbd -- вот эта конструкция выглядит на мой вкус странно, предлагаю хотя бы так
repo_root/ydb/apps/ydbd/ydbd
или
<repo_root>/ydb/apps/ydbd/ydbd

Смущают слеши в треугольных скобках

```
4) deploy tools
```
./library deploy_tools --cluster_path=<path/to/>cluster.yaml --ydbd_path=<path/to/>ydb/apps/ydbd/ydbd
```
5) start workload:
- `start_all_workloads` - starts all workloads listed below
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

worloads -> workloads

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bellow -> below

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Предлагаю убрать слово it возможно

- `start_workload_simple_queue_row` - starts the simple_queue workload in row mode
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

- create -- что то неподчищенное как будто

- `start_workload_simple_queue_column`
- `start_workload_olap_workload`

```
./library start_all_workloads --cluster_path=<path/to/>cluster.yaml --ydbd_path=<path/to/>ydb/apps/ydbd/ydbd
```
To stop all workloads, use the command `stop_workloads`.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

- stops all worloads как будто лишнее, предложение же с этого началось


To check whether a workload is running on a node host:
- workload simple_queue row - ``ps -aux | grep "/Berkanavt/nemesis/bin/simple" | grep row | grep -v grep
``
- workload simple_queue column - ``ps -aux | grep "/Berkanavt/nemesis/bin/simple" | grep column | grep -v grep
``
- workload olap_workload - ``ps -aux | grep "/Berkanavt/nemesis/bin/olap_workload" | grep -v grep
``

6) start nemesis:
```
./library start_nemesis --cluster_path=<path/to/>cluster.yaml --ydbd_path=<path/to/>ydb/apps/ydbd/ydbd
```
to stop, use the command `stop_nemesis`


7) Check states
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

У тебя выше пункты с маленькой буквы, здесь с большой начались

Предлагаю везде с большой сделать

1) yq to get all node hosts
```
sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Какая то стремноватая тема качать бинарь с гитхаба и руками его ставить в систему

Я предлагаю утилиту доработать чтобы она это в питоне делала

sudo chmod +x /usr/local/bin/yq
```
``yq e '.hosts[].name' <path/to/>cluster.yaml> > hosts.txt``
2) Get status of nemesis and workloads (ad-hoc)
```
parallel-ssh -h hosts.txt -i '
if systemctl is-active --quiet nemesis; then
echo "nemesis: Active"
else
echo "nemesis: Down"
fi
if ps aux | grep "/Berkanavt/nemesis/bin/olap_workload" | grep -v grep > /dev/null; then
echo "olap_workload: Running"
else
echo "olap_workload: Stopped"
fi
if ps aux | grep "/Berkanavt/nemesis/bin/simple" | grep column | grep -v grep > /dev/null; then
echo "simple_queue_column: Running"
else
echo "simple_queue_column: Stopped"
fi
if ps aux | grep "/Berkanavt/nemesis/bin/simple" | grep row | grep -v grep > /dev/null; then
    echo "simple_queue_row: Running"
else
    echo "simple_queue_row: Stopped"
fi
'
```
8) check cluster stability
1) ``perform_checks`` - returns a summary of errors and coredumps for the cluster:

```
SAFETY WARDEN (total: 8)
LIVENESS WARDEN (total: 0)
COREDUMPS:
ydb-sas-testing-0000.search.yandex.net: 1
ydb-sas-testing-0001.search.yandex.net: 0
ydb-sas-testing-0002.search.yandex.net: 1
ydb-sas-testing-0003.search.yandex.net: 1
ydb-sas-testing-0004.search.yandex.net: 0
ydb-sas-testing-0005.search.yandex.net: 1
ydb-sas-testing-0006.search.yandex.net: 2
ydb-sas-testing-0007.search.yandex.net: 0
```
to run:
```
./library perform_checks --cluster_path=<path/to/>cluster.yaml --ydbd_path=<path/to/>ydb/apps/ydbd/ydbd
```
2) get cluster traces (ad-hoc)
```
'' > combined_traces.txt; parallel-ssh -h hosts.txt -i "
zgrep -E 'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception' /Berkanavt/kikimr_31003/logs/kikimr.start.* -A 30 |
awk '
{
split(\$0, parts, \":\")
curr_file = parts[1]

if (curr_file != prev_file) {
if (prev_file != \"\")
print \"\n\n\n---\n\n\n\"
prev_file = curr_file
}
print
}' | sed '/--/a\\n\n'
" >> combined_traces.txt
```
9) create issue in github about new traces
Loading