#!/bin/bash
MUST_PATH=$1
# PLOT=$2
STAMP=$(date +%Y-%m-%d_%H-%M-%S)
#REPORT_FOLDER="$HOME/ETCD-SUMMARY_$STAMP"
ORIG_PATH=$(pwd)
OUTPUT_PATH=$ORIG_PATH/DATA
mkdir -p $OUTPUT_PATH
NODES=()
MASTER=()
INFRA=()
WORKER=()
OCS=()
ETCD=()
#mkdir -p $REPORT_FOLDER
#echo "created $REPORT_FOLDER"
echo -e ""
# TERMINAL COLORS -----------------------------------------------------------------
NONE='\033[00m'
RED='\033[01;31m'
GREEN='\033[01;32m'
YELLOW='\033[01;33m'
BLACK='\033[30m'
BLUE='\033[34m'
VIOLET='\033[35m'
CYAN='\033[36m'
GREY='\033[37m'
# MAIN --------------------------
cd $MUST_PATH
cd $(echo */)
# ls
# CLUSTER VERSION ---------------
[ -e "cluster-scoped-resources/config.openshift.io/clusterversions.yaml" ] && OCP_VERSION=$(cat cluster-scoped-resources/config.openshift.io/clusterversions.yaml |grep "Cluster version is"| grep -Po "(\d+\.)+\d+") || echo -e "no clusterversion.yaml found."
if [ -z "$OCP_VERSION" ]; then
echo -e "Cluster version is EMPTY!"
echo -e "IMPORTANT: cluster version file might be missing or corrupted due to ongoing upgrade (moving between versions)."
else
echo -e "Cluster version is $OCP_VERSION"
fi
#supported version check
if [[ "$OCP_VERSION" == *"4.10"* || "$OCP_VERSION" == *"4.9"* ]];
then
echo -e "${RED}[WARNING] UNSUPPORTED OLD VERSION!!! ${NONE}"
# else
# echo -e " ${RED}[WARNING]${NONE} Found $OVERLOAD overloaded messages while there should be zero of them."
fi
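# Optional sketch (assumption: 'yq' v4 is available and the must-gather stores the
# ClusterVersion objects as a list with .items): the version could also be read
# structurally instead of grepping the rendered history message, e.g.:
#   yq '.items[0].status.desired.version' cluster-scoped-resources/config.openshift.io/clusterversions.yaml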
echo -e ""
echo -e ""
# LIST NODES --------------------
cd cluster-scoped-resources/core/nodes
NODES_NUMBER=$(ls|wc -l)
echo -e "There are $NODES_NUMBER nodes in cluster"
# STORAGE
cd ../persistentvolumes
[ -d "../persistentvolumes" ] && PVCS=$(ls) && PV_NUMBER=$(ls|wc -l) && echo -e "There are $PV_NUMBER PVs in cluster" || echo -e "${RED}No PV files found. MISSING.${NONE}"
# echo "" > $OUTPUT_PATH/pvcs
# for i in $PVCS; do
# echo $(cat $i |grep "storageClassName"|grep -v "f:storageClassName") >> $OUTPUT_PATH/pvcs
# done
# echo -e "Class:"
# cat $OUTPUT_PATH/pvcs |sort -u|uniq
# NODES
cd ../nodes
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/master:')" ] && MASTER+=("${filename::-5}") && NODES+=("$filename [master]") || true
done
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/infra:')" ] && INFRA+=("${filename::-5}") && NODES+=("$filename [infra]") || true
done
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/worker:')" ] && WORKER+=("${filename::-5}") && NODES+=("$filename [worker]") || true
done
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep -w 'cluster.ocs.openshift.io/openshift-storage')" ] && OCS+=("${filename::-5}") || true
done
echo -e ""
echo -e "${GREEN}- NODES --------------------${NONE}"
echo -e ""
# MASTERS / CONTROL PLANES
echo -e "${#MASTER[@]} masters"
# check if there's no more than supported number of masters (which is 3)
if (( ${#MASTER[@]} > 3 )); then
echo -e " ${RED}[WARNING] only 3 masters are supported, you have ${#MASTER[@]}.${NONE}"
fi
# check if any master is missing
if (( ${#MASTER[@]} < 3 )); then
echo -e " [WARNING] you have only ${#MASTER[@]} masters. Investigate SOSreport from missing one!"
fi
echo -e ""
echo -e "Minimum 4 vCPU (good for small or development cluster where stability doesn't matter much)."
echo -e "Minimum 16 GB RAM (for medium and large clusters)."
echo -e ""
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/master:')" ] && echo -e "- $filename" && cat $filename |grep cpu|grep -v "f:cpu"|grep -v "m" || true
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/master:')" ] && cat $filename |grep memory|grep -v "f:memory"|grep -v 'message' |grep -v 'k:'| head -n 1 || true
#[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/master:')" ] && cat $filename |grep type|| true
done
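# Optional sketch (assumption: 'yq' v4 is available): node capacity can also be read
# from the well-known .status.capacity fields instead of the grep chains above, e.g.:
#   yq '.status.capacity.cpu + " cpu / " + .status.capacity.memory' <node>.yaml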
echo -e ""
# INFRA NODES
echo -e "${#INFRA[@]} infra nodes"
# check for infra nodes and suggest consideration
if (( ${#INFRA[@]} < 1 )); then
echo -e " ${RED}[WARNING]${NONE} no INFRA nodes or not properly tagged with node-role.kubernetes.io/infra=\"\"."
echo -e " Condsider adding infra nodes to offload masters."
fi
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/infra:')" ] && echo -e "- $filename" && cat $filename |grep cpu|grep -v "f:cpu"|grep -v "m" || true
[ ! -z "$(cat $filename |grep node-role|grep -w 'node-role.kubernetes.io/infra:')" ] && cat $filename |grep memory|grep -v "f:memory"|grep -v 'message'|grep -v 'k:'| head -n 1 || true
done
echo -e ""
# WORKERS
echo -e "${#WORKER[@]} workers"
echo -e ""
echo -e "${#OCS[@]} OCS storage nodes"
# check for OCS storage nodes
# if (( ${#OCS[@]} < 1 )); then
# echo -e " ${RED}[WARNING]${NONE} no INFRA nodes or not properly tagged with node-role.kubernetes.io/infra=\"\"."
# fi
for filename in *.yaml; do
[ -e "$filename" ] || continue
[ ! -z "$(cat $filename |grep -w 'openshift-storage:')" ] && echo -e "- $filename" && cat $filename |grep cpu|grep -v "f:cpu"|grep -v "m" || true
[ ! -z "$(cat $filename |grep -w 'openshift-storage:')" ] && cat $filename |grep memory|grep -v "f:memory"|grep -v 'message'|grep -v 'k:'| head -n 1 || true
done
echo -e ""
echo -e "${GREEN}- NETWORKING --------------${NONE}"
echo -e ""
# Clusternetwork.yaml only exists for openshift-sdn
# OVN-K doesn't use it
# Better get it from the 'network' object at either 'config.openshift.io' or 'operator.openshift.io' api groups
#networkType
cd ../../config.openshift.io
cat networks.yaml|grep 'networkType' |uniq
cat networks.yaml|grep 'cidr' |uniq
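# Optional sketch (assumption: 'yq' v4 is available and networks.yaml is a list with
# .items): the same values can be read from the Network config object directly, e.g.:
#   yq '.items[].spec.networkType, .items[].spec.clusterNetwork[].cidr, .items[].spec.serviceNetwork[]' networks.yaml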
# [ -d "../persistentvolumes" ] && PVCS=$(ls) && PV_NUMBER=$(ls|wc -l) && echo -e "There are $PV_NUMBER PVs in cluster" || echo -e "${RED}No PV files found. MISSING.${NONE}"
# [ -d "../persistentvolumes" ] && PVCS=$(ls) && PV_NUMBER=$(ls|wc -l) && echo -e "There are $PV_NUMBER PVs in cluster" || echo -e "${RED}No PV files found. MISSING.${NONE}"
# cd ../../../cluster-scoped-resources/network.openshift.io/clusternetworks/
# cat default.yaml |grep CIDR
# cat default.yaml |grep plugin
# cat default.yaml | grep serviceNetwork
echo -e ""
echo -e "${GREEN}- openshift-ingress router pods:${NONE}"
echo -e ""
cd $MUST_PATH
cd $(echo */)
cd namespaces/openshift-ingress/pods
for router in $(ls); do
echo -e "" > $OUTPUT_PATH/$router.log
WATCH=$(cat $router/router/router/logs/current.log |grep 'Unexpected watch close'|wc -l)
RERR=$(cat $router/router/router/logs/current.log |grep 'error on the server'|wc -l)
DEAD=$(cat $router/router/router/logs/current.log |grep 'context deadline exceeded'|wc -l)
CLTIME=$(cat $router/router/router/logs/current.log |grep 'timeout'|wc -l)
PRC=$(cat $router/router/router/logs/current.log |grep 'process'|wc -l)
CLK=$(cat $router/router/router/logs/current.log |grep 'clock'|wc -l)
BFR=$(cat $router/router/router/logs/current.log |grep 'buffer'|wc -l)
# cat $router/router/router/logs/current.log |grep 'process'
echo -e ""
echo -e "${GREEN}[$router]:${NONE}"
echo -e ""
if [[ "$WATCH" -eq 0 ]];
then
echo -e " no 'Unexpected watch close' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $WATCH 'Unexpected watch close' messages."
fi
if [[ "$RERR" -eq 0 ]];
then
echo -e " no 'error on the server' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $RERR 'error on the server' messages."
fi
if [[ "$DEAD" -eq 0 ]];
then
echo -e " no 'context deadline exceeded' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $DEAD 'context deadline exceeded' messages."
fi
if [[ "$CLTIME" -eq 0 ]];
then
echo -e " no 'ClientTimeout' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $CLTIME 'ClientTimeout' messages."
fi
if [[ "$PRC" -eq 0 ]];
then
echo -e " no 'Failed to open XYZ for getting process status' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $PRC 'Failed to open XYZ for getting process status' messages."
fi
if [[ "$CLK" -eq 0 ]];
then
echo -e " no 'clock' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $CLK 'clock' messages."
fi
if [[ "$BFR" -eq 0 ]];
then
echo -e " no 'buffer' message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} we found $BFR 'buffer' messages."
fi
done
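# Optional sketch: the per-pattern greps above reread the same router log several times;
# a single awk pass over the log (same patterns, purely illustrative) would count them in one read, e.g.:
#   awk '/Unexpected watch close/{w++} /error on the server/{e++} /context deadline exceeded/{d++} END{printf "watch=%d server-errors=%d deadline=%d\n", w+0, e+0, d+0}' "$router/router/router/logs/current.log"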
echo -e ""
echo -e "TIP: Additionaly check sosreports for dropped packets and RX/TX errors."
echo -e ""
echo -e ""
# omc get clusterversion
# omc get co | grep -v -e "True.*False.*False"
# omc get nodes | grep -v -e " Ready "
# omc get mcp | grep -v -e "True.*False.*False"
# omc get pods -A -o wide | grep -v -e "Running" -e "Completed"
# omc get machinehealthcheck -n openshift-machine-api
# omc get csv -A | grep -v -e Succeeded
# omc get events -A | grep -v -e " Normal "
# for i in $(omc -n openshift-etcd get pods -l app=etcd -o name); do echo "-- $i"; omc -n openshift-etcd logs $i -c etcd 2>&1 | awk -v min=999 '/took too long/ {t++} /context deadline exceeded/ {b++} /finished scheduled compaction/ {gsub("\"",""); sub("ms}",""); split($0,a,":"); if (a[12]<min) min=a[12]; if (a[12]>max) max=a[12]; avg+=a[12]; c++} END{printf "took too long: %d\ndeadline exceeded: %d\n",t,b; printf "compaction times:\n min: %d\n max: %d\n avg:%d\n",min,max,avg/c}'; done
# router_check() {
# echo -e ""
# echo -e "[ROUTER openshift-ingress check]"
# echo -e ""
# [ -d "$ORIG_PATH/namespaces/openshift-ingress/pods" ] && echo "found directory" || return
# i=0
# cd $ORIG_PATH/namespaces/openshift-ingress/pods
# for router in $(ls); do
# echo "processing $router"
# echo -e "" > $OUTPUT_PATH/$router.log
# cat $router/router/router/logs/current.log |grep 'Unexpected watch close'|cut -d ' ' -f1| \
# xargs -I {} echo -e "{} Unexpected watch close [$router] !!!" | while read -r line; do echo -e "$line" >> $OUTPUT_PATH/$router.log; done
# cat $router/router/router/logs/current.log |grep 'error on the server'|cut -d ' ' -f1| \
# xargs -I {} echo -e "{} error on the server [$router]" | while read -r line; do echo -e "$line" >> $OUTPUT_PATH/$router.log; done
# # cat $router/router/router/logs/current.log |grep 'process'|cut -d ' ' -f1| \
# # xargs -I {} echo -e "{} LEADER changed [$router] !" | while read -r line; do echo -e "$line" >> $OUTPUT_PATH/$router.log; done
# # cat $router/router/router/logs/current.log |grep 'clock'|cut -d ' ' -f1| \
# # xargs -I {} echo -e "{} NTP clock difference [$router] !!" | while read -r line; do echo -e "$line" >> $OUTPUT_PATH/$router.log; done
# # cat $router/router/router/logs/current.log |grep 'buffer'|cut -d ' ' -f1| \
# # xargs -I {} echo -e "{} BUFF [$router] !!" | while read -r line; do echo -e "$line" >> $OUTPUT_PATH/$router.log; done
# #increment color
# i=$((${i}+1))
# done
# i=0
# cat $OUTPUT_PATH/router*.log > $OUTPUT_PATH/output_router_logs.log
# sort -t: -k2 -k3 $OUTPUT_PATH/output_router_logs.log > $OUTPUT_PATH/sorted.tmp
# cat $OUTPUT_PATH/sorted.tmp > $OUTPUT_PATH/output_router_logs.log
# }
# ETCD ---------------------------
cd $MUST_PATH
cd $(echo */)
echo -e ""
echo -e "${GREEN}- ETCD --------------------${NONE}"
echo -e ""
cd namespaces/openshift-etcd/pods
for dirs in $(ls |grep -v guard|grep -v installer|grep -v quorum|grep -v pruner); do
[ -e "$dirs" ] || continue
ETCD+=("$dirs")
#echo -e "adding $dirs"
done
# check that there are exactly 3 etcd members (the supported number)
if (( ${#ETCD[@]} > 3 )); then
echo -e " [WARNING] only 3 etcd members are supported, you have ${#ETCD[@]}."
elif (( ${#ETCD[@]} < 3 )); then
echo -e " [WARNING] you have only ${#ETCD[@]} etcd members. Investigate logs from the missing one!"
else
echo -e " found 3 etcd members - GOOD"
fi
# echo -e "${#ETCD[@]} etcd members"
for member in "${ETCD[@]}"; do
echo -e "\n${GREEN}-[$member] ---${NONE}\n"
# echo -e ""
OVERLOAD=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|wc -l)
OVERLOADN=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep network|wc -l)
RAPOVERN=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep network |tail -n 1 |grep "remote-peer-active\":false")
OVERLOADC=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep disk|wc -l)
LAST=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|tail -1 |cut -d ':' -f1|cut -c 1-10)
LOGEND=$(cat $member/etcd/etcd/logs/current.log|tail -1 |cut -d ':' -f1|cut -c 1-10)
CLOCK=$(cat $member/etcd/etcd/logs/current.log|grep 'clock difference'|wc -l)
LASTNTP=$(cat $member/etcd/etcd/logs/current.log|grep 'clock difference'|tail -1)
LONGDRIFT=$(cat $member/etcd/etcd/logs/current.log|grep 'clock-drift'|wc -l)
LASTLONGDRIFT=$(cat $member/etcd/etcd/logs/current.log|grep 'clock-drift'|tail -1)
TOOK=$(cat $member/etcd/etcd/logs/current.log|grep 'apply request took too long'|wc -l)
HEART=$(cat $member/etcd/etcd/logs/current.log|grep 'failed to send out heartbeat on time'|wc -l)
SPACE=$(cat $member/etcd/etcd/logs/current.log|grep 'database space exceeded'|wc -l)
LEADER=$(cat $member/etcd/etcd/logs/current.log|grep 'leader changed'|wc -l)
OVRL=0
NTP=0
HR=0
TK=0
SP=0
LED=0
# overloaded
if [[ "$OVERLOAD" -eq 0 ]];
then
echo -e " no overloaded message - ${GREEN}OK!${NONE}"
else
echo -e " ${RED}[WARNING]${NONE} Found $OVERLOAD overloaded messages while there should be zero of them."
echo -e ""
if [[ -n "$RAPOVERN" ]]; then
echo -e " - $OVERLOADN x OVERLOADED NETWORK in $member"
echo -e " (high network or remote storage latency, the peer is not responding, missing the availability to connect to another member)"
else
echo -e " - $OVERLOADN x OVERLOADED NETWORK in $member"
echo -e " (high network or remote storage latency, the peer is responding, but too slow or only occasionally)"
fi
echo -e ""
echo -e " - $OVERLOADC x OVERLOADED DISK/CPU in $member (slow storage or lack of CPU on masters)"
echo -e ""
echo -e " Log ends on $LOGEND"
if [ "$LAST" = "$LOGEND" ]; then
echo -e " Warnings last seen on $LAST. ${RED}TODAY!${NONE}"
TOD=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep disk|grep $LAST |wc -l)
echo -e " Today seen $TOD times."
YESTER=$(date -d "$LOGEND - 24 hours" +%Y-%m-%d)
YEST=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep $YESTER|wc -l)
YESDISK=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep disk|grep $YESTER|wc -l)
YESNET=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep network|grep $YESTER|wc -l)
YESNETRAP=$(cat $member/etcd/etcd/logs/current.log|grep 'overload'|grep network|grep "remote-peer-active\":false"|grep $YESTER|wc -l)
echo -e " Yesterday seen $YEST times."
echo -e " ${YESDISK}x slow disk"
echo -e " ${YESNET}x high network or remote storage latency, the peer is not responding"
echo -e " ${YESNETRAP}x high network or remote storage latency, the peer is responding, but too slow"
echo -e ""
else
echo -e " Warnings last seen on $LAST. ${GREEN}NOT TODAY!${NONE}"
fi
echo -e ""
echo -e " SOLUTION: Review ETCD and CPU metrics as this could be caused by CPU bottleneck or slow disk (or combination of both)."
echo -e " In case of SAN, issue might be network latency rather than storage itself."
echo -e " TIP: collect and investigate metrics as in https://access.redhat.com/solutions/5489721 "
echo -e ""
fi
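# Optional sketch (assumption: run on the affected master itself, not against the
# must-gather; requires 'fio'): the commonly used etcd disk check from the upstream
# docs measures fdatasync latency, e.g.:
#   fio --rw=write --ioengine=sync --fdatasync=1 --directory=test-data --size=22m --bs=2300 --name=etcd-disk-check
# The 99th percentile of fdatasync should ideally stay under ~10ms for etcd.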
# took too long
if [ "$TOOK" != "0" ]; then
echo -e " ${RED}[WARNING]${NONE} we found $TOOK 'apply request took too long' messages. (You should be concerned only with several thousands of messages)"
echo -e " $SUMMARY"
TK=$(($TK+$TOOK))
echo -e ""
else
echo -e " no 'apply request took too long' messages"
echo -e ""
fi
# compaction
echo -e " [ETCD compaction]\n"
echo -e " To avoid running out of space for writes to the keyspace, the etcd keyspace history must be compacted. (OCP by default auto-defragment)"
echo -e " Compaction should be below 200ms on small cluster, below 500ms on medium cluster and below 800ms on large cluster."
echo -e " IMPORTANT: if compaction vary too much (difference 200ms+) it could mean masters are using shared storage or network storage with bad latency."
echo -e ""
cat $member/etcd/etcd/logs/current.log|grep compaction| tail -8 > $OUTPUT_PATH/$member-compat.data
echo -e " last compaction:\n"
cat $OUTPUT_PATH/$member-compat.data| while read line
do
CHECK=$(echo $line|cut -d ':' -f12| rev | cut -c9- | rev|cut -c2- |grep -E '[0-9]')
[[ ! -z "$(echo $CHECK |grep -E '[0-9]s')" ]] && echo -e "${RED} $CHECK <---- TOO HIGH!${NONE}" || echo " $CHECK"
done
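# Optional sketch: the cut chain above depends on fixed field positions; an
# alternative (assumption: the compaction message carries a "took":"<duration>"
# field, as current etcd JSON logs do) is to pull the value directly, e.g.:
#   grep -o '"took":"[^"]*"' $OUTPUT_PATH/$member-compat.data | cut -d'"' -f4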
echo -e ""
# ntp
echo -e " [NTP]"
if [ "$CLOCK" != "0" ]; then
echo -e "${RED}[WARNING]${NONE} we found $CLOCK ntp clock difference messages in $1"
NTP=$(($NTP+$CLOCK))
echo -e " Last occurrence:"
echo -e " $LASTNTP"| cut -d " " -f1
echo -e " Log ends at "
echo -e " $LOGENDNTP"| cut -d " " -f1
echo -e ""
echo -e " Long drift: $LONGDRIFT"
echo -e " Last long drift:"
echo -e " $LASTLONGDRIFT"
echo -e ""
echo -e " SOLUTION: When clocks are out of sync with each other they are causing I/O timeouts and the liveness probe is failing which makes the ETCD pod to restart frequently. Check if Chrony is enabled, running, and in sync with:"
echo -e " - chronyc sources"
echo -e " - chronyc tracking"
echo -e ""
else
echo -e " no NTP related warnings found - ${GREEN}OK!${NONE}"
fi
# heartbeat
echo -e ""
echo -e " [HEARTBEAT]"
if [ "$HEART" != "0" ]; then
echo -e " ${RED}[WARNING]${NONE} we found $HEART failed to send out heartbeat on time messages. Usually this issue is caused by a slow disk."
HR=$(($HR+$HEART))
echo -e ""
echo -e " NOTE:"
echo -e " etcd has a 100ms tolerance for requests, which doesn't leave much time for latency."
echo -e " Requests between members that take longer than 100ms will receive these errors, which if frequent enough"
echo -e " can cause instability in the cluster and frequent leader re-elections."
else
echo -e " no 'failed to send out heartbeat on time' messages found - ${GREEN}OK!${NONE}"
fi
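# Optional sketch: a per-day breakdown of the heartbeat warnings (same date prefix
# already used above for $LOGEND) can help spot whether the issue is ongoing, e.g.:
#   grep 'failed to send out heartbeat on time' $member/etcd/etcd/logs/current.log | cut -c 1-10 | sort | uniq -c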
# space
echo -e ""
echo -e " [DB SPACE]"
if [ "$SPACE" != "0" ]; then
echo -e " ${RED}[WARNING]${NONE} we found $SPACE 'database space exceeded'"
SP=$(($SP+$SPACE))
echo -e ""
echo -e "SOLUTION: Defragment and clean up ETCD, remove unused secrets or deployments."
echo -e ""
else
echo -e " no 'database space exceeded' messages found - ${GREEN}OK!${NONE}"
fi
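# Optional sketch (assumption: run against the live cluster, not the must-gather):
# database size and defragmentation can be checked from an etcd pod, e.g.:
#   oc -n openshift-etcd rsh <etcd-pod>
#   etcdctl endpoint status -w table   # shows DB size, leader and raft index
#   etcdctl defrag                     # defragment the local member (follow the official procedure first)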
# leader changes
echo -e ""
echo -e " [LEADER CHANGES]"
if [ "$LEADER" != "0" ]; then
echo -e " ${RED}[WARNING]${NONE} we found $LEADER 'leader changed'"
LED=$(($LED+$LEADER))
echo -e ""
echo -e " NOTE:"
echo -e " When a leader fails, the etcd cluster automatically elects a new leader. The election does not happen instantly once the leader fails."
echo -e " During the leader election the cluster cannot process any writes. Write requests sent during the election are queued for processing"
echo -e " until a new leader is elected. Writes already sent to the old leader but not yet committed may be lost."
else
echo -e " no 'leader changed' messages found - ${GREEN}OK!${NONE}"
fi
done
echo -e ""
echo -e ""
echo -e "[API CONSUMERS kube-apiserver on masters]"
echo -e ""
cd $MUST_PATH
cd $(echo */)
[ -d "audit_logs/kube-apiserver/" ] && echo -e "Audit logs found. Processing." || echo -e "${RED}No audit logs found. MISSING.${NONE}" && exit 0
cd audit_logs/kube-apiserver/
AUDIT_LOGS=$(ls *.gz|grep audit)
node=""
for i in $AUDIT_LOGS; do
#echo -e "[ extracting $i ]"
gzip -d $i
done;
AUDIT_LOGS=$(ls *.log)
for i in $AUDIT_LOGS; do
echo -e "[ processing $i ]"
if [[ $i == *".log"* ]]; then
cat $i |jq '.user.username' -r > $OUTPUT_PATH/$(echo $i|cut -d ' ' -f2)_2sort.log
sort $OUTPUT_PATH/$(echo $i|cut -d ' ' -f2)_2sort.log | uniq -c | sort -bgr| head -10
echo -e ""
else
node=$i
continue
fi
done;
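# Optional sketch (assumption: standard Kubernetes audit event schema with .verb and
# .objectRef.resource): the busiest verb/resource pairs can be summarised similarly, e.g.:
#   jq -r '[.verb, .objectRef.resource // "-"] | @tsv' $i | sort | uniq -c | sort -bgr | head -10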
# etcd_took_too_long() {
# TOOKS_MS=()
# MS=$(cat $1/etcd/etcd/logs/current.log|grep 'apply request took too long'|tail -1)
# echo $MS
# TOOK=$(cat $1/etcd/etcd/logs/current.log|grep 'apply request took too long'|wc -l)
# SUMMARY=$(cat $1/etcd/etcd/logs/current.log |awk -v min=999 '/apply request took too long/ {t++} /context deadline exceeded/ {b++} /finished scheduled compaction/ {gsub("\"",""); sub("ms}",""); split($0,a,":"); if (a[12]<min) min=a[12]; if (a[12]>max) max=a[12]; avg+=a[12]; c++} END{printf "took too long: %d\ndeadline exceeded: %d\n",t,b; printf "compaction times:\n min: %d\n max: %d\n avg:%d\n",min,max,avg/c}'
# )
# # if [ "$PLOT" = true ]; then
# # for lines in $(cat $1/etcd/etcd/logs/current.log||grep "apply request took too long"|grep -ohE "took\":\"[0-9]+(.[0-9]+)ms"|cut -c8-);
# # do
# # TOOKS_MS+=("$lines");
# # if [ "$lines" != "}" ]; then
# # echo $lines >> $REPORT_FOLDER/$1-long.data
# # fi
# # done
# # fi
# # if [ "$PLOT" = true ]; then
# # gnuplot_render $1 "${#TOOKS_MS[@]}" "took too long messages" "Sample number" "Took (ms)" "tooktoolong_graph" "$REPORT_FOLDER/$1-long.data"
# # fi
# if [ "$TOOK" != "0" ]; then
# echo -e "${RED}[WARNING]${NONE} we found $TOOK 'apply request took too long' messages in $1"
# echo -e "$SUMMARY"
# TK=$(($TK+$TOOK))
# echo -e ""
# fi
# }
# help_etcd_objects() {
# echo -e ""
# echo -e "- Number of objects ---"
# echo -e ""
# echo -e "List number of objects in ETCD:"
# echo -e ""
# echo -e "$ oc project openshift-etcd"
# echo -e "oc get pods"
# echo -e "oc rsh etcd-ip-10-0-150-204.eu-central-1.compute.internal"
# echo -e "> etcdctl get / --prefix --keys-only | sed '/^$/d' | cut -d/ -f3 | sort | uniq -c | sort -rn"
# echo -e ""
# echo -e "[HINT] Any number of CRDs (secrets, deployments, etc..) above 8k could cause performance issues on storage with not enough IOPS."
# echo -e ""
# echo -e "List secrets per namespace:"
# echo -e ""
# echo -e "> oc get secrets -A --no-headers | awk '{ns[\$1]++}END{for (i in ns) print i,ns[i]}'"
# echo -e ""
# echo -e "[HINT] Any namespace with 20+ secrets should be cleaned up (unless there's specific customer need for so many secrets)."
# echo -e ""
# }
# help_etcd_troubleshoot() {
# echo -e ""
# echo -e "- Generic troubleshooting ---"
# echo -e ""
# echo -e "More details about troubleshooting ETCD can be found at https://access.redhat.com/articles/6271341"
# }
# help_etcd_metrics() {
# echo -e ""
# echo -e "- ETCD metrics ---"
# echo -e ""
# echo -e "How to collect ETCD metrics. https://access.redhat.com/solutions/5489721"
# }
# help_etcd_networking() {
# echo -e ""
# echo -e "- ETCD networking troubleshooting ---"
# echo -e ""
# echo -e "From masters check if there are no dropped packets or RX/TX errors on main NIC."
# echo -e "> ip -s link show"
# echo -e ""
# echo -e "but also check latency against API (expected value is 2-5ms, 0.002-0.005 in output)"
# echo -e "> curl -k https://api.<OCP URL>.com -w \"%{time_connect}\""
# echo -e "Any higher latency could mean network bottleneck."
# }
# help_etcd_objects
# etcd_ntp() {
# CLOCK=$(cat $1/etcd/etcd/logs/current.log|grep 'clock difference'|wc -l)
# LASTNTP=$(cat $1/etcd/etcd/logs/current.log|grep 'clock difference'|tail -1)
# LONGDRIFT=$(cat $1/etcd/etcd/logs/current.log|grep 'clock-drift'|wc -l)
# LASTLONGDRIFT=$(cat $1/etcd/etcd/logs/current.log|grep 'clock-drift'|tail -1)
# LOGENDNTP=$(cat $1/etcd/etcd/logs/current.log|tail -1)
# if [ "$CLOCK" != "0" ]; then
# echo -e "${RED}[WARNING]${NONE} we found $CLOCK ntp clock difference messages in $1"
# NTP=$(($NTP+$CLOCK))
# echo -e "Last occurrence:"
# echo -e "$LASTNTP"| cut -d " " -f1
# echo -e "Log ends at "
# echo -e "$LOGENDNTP"| cut -d " " -f1
# echo -e ""
# echo -e "Long drift: $LONGDRIFT"
# echo -e "Last long drift:"
# echo -e $LASTLONGDRIFT
# fi
# }
# COMPACTION
# echo -e ""
# echo -e "[COMPACTION]"
# echo -e "should be ideally below 100ms (and below 10ms on fast SSD/NVMe) on small clusters, 300-500 on medium or large and no more than 800-900ms on very large clusters."
# echo -e ""
# for member in "${ETCD[@]}"; do
# etcd_compaction $member
# done
# MAIN FUNCS
overload_solution() {
: # no-op placeholder (an empty function body is a bash syntax error)
}
audit_logs() {
cd $MUST_PATH
cd $(echo */)
cd audit_logs/kube-apiserver/
echo -e ""
echo -e "[API CONSUMERS kube-apiserver on masters]"
echo -e ""
AUDIT_LOGS=$(ls *.gz|grep audit)
node=""
for i in $AUDIT_LOGS; do
#echo -e "[ extracting $i ]"
gzip -d $i
done;
AUDIT_LOGS=$(ls *.log)
for i in $AUDIT_LOGS; do
echo -e "[ processing $i ]"
if [[ $i == *".log"* ]]; then
cat $i |jq '.user.username' -r > $OUTPUT_PATH/$(echo $i|cut -d ' ' -f2)_2sort.log
sort $OUTPUT_PATH/$(echo $i|cut -d ' ' -f2)_2sort.log | uniq -c | sort -bgr| head -10
echo -e ""
else
node=$i
continue
fi
done;
}
# timed out waiting for read index response (local node might have slow network)
echo -e ""
echo -e "ADDITIONAL HELP:"
# help_etcd_troubleshoot
# help_etcd_metrics
# help_etcd_networking
# help_etcd_objects