diff --git a/scripts/generate_dump b/scripts/generate_dump index 00b71eb76e..e7e378c218 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -106,6 +106,7 @@ save_bcmcmd() { local filename=$2 local filepath="${LOGDIR}/$filename" local do_gzip=${3:-false} + local tarpath="${BASE}/dump/$filename" local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m" local cmd=$(escape_quotes "$cmd") if [ ! -d $LOGDIR ]; then @@ -140,9 +141,12 @@ save_bcmcmd() { fi if $do_gzip; then gzip ${filepath} 2>/dev/null + tarpath="${tarpath}.gz" filepath="${filepath}.gz" fi - + ($TAR $V -rhf $TARFILE -C $DUMPDIR "$tarpath" \ + || abort "${EXT_TAR_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \ + && $RM $V -rf "$filepath" end_t=$(date +%s%3N) echo "[ save_bcmcmd:$cmd ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO } @@ -176,7 +180,7 @@ save_bcmcmd_all_ns() { } ############################################################################### -# Runs a comamnd and saves its output to the file. +# Runs a command and saves its output to the incrementally built tar. # Command gets timedout if it runs for more than TIMEOUT_MIN minutes. # Globals: # LOGDIR @@ -204,6 +208,7 @@ save_cmd() { local filename=$2 local filepath="${LOGDIR}/$filename" local do_gzip=${3:-false} + local tarpath="${BASE}/dump/$filename" local timeout_cmd="timeout --foreground ${TIMEOUT_MIN}m" local cleanup_method=${4:-dummy_cleanup_method} local redirect='&>' @@ -225,6 +230,7 @@ save_cmd() { # as one argument, e.g. 
vtysh -c "COMMAND HERE" needs to have # "COMMAND HERE" bunched together as 1 arg to vtysh -c if $do_gzip; then + tarpath="${tarpath}.gz" filepath="${filepath}.gz" # cleanup_method will run in a sub-shell, need declare it first local cmds="$cleanup_method_declration; $cmd $redirect_eval | $cleanup_method | gzip -c > '${filepath}'" @@ -254,35 +260,13 @@ save_cmd() { fi fi + ($TAR $V -rhf $TARFILE -C $DUMPDIR "$tarpath" \ + || abort "${EXT_TAR_FAILED}" "tar append operation failed. Aborting to prevent data loss.") \ + && $RM $V -rf "$filepath" end_t=$(date +%s%3N) echo "[ save_cmd:$cmd ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO } -############################################################################### -# Save all collected data to tar archive. -# Globals: -# DUMPDIR -# TAR -# TARFILE -# V -# BASE -# Arguments: -# None -# Returns: -# None -############################################################################### -save_to_tar() { - trap 'handle_error $? $LINENO' ERR - local start_t=$(date +%s%3N) - local end_t=0 - - cd $DUMPDIR - $TAR $V -rhf $TARFILE "$BASE" - - end_t=$(date +%s%3N) - echo "[ save_to_tar ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO -} - ############################################################################### # Dummy cleanup method. # Globals: @@ -423,7 +407,7 @@ get_vtysh_namespace() { ############################################################################### # Runs a vtysh command in all namesapces for a multi ASIC platform, and in # default (host) namespace in single ASIC platforms. Saves its output to the -# file. +# incrementally built tar. # Globals: # None # Arguments: @@ -453,7 +437,7 @@ save_vtysh() { } ############################################################################### -# Runs an ip command and saves its output to the file. +# Runs an ip command and saves its output to the incrementally built tar. 
# Globals: # None # Arguments: @@ -472,7 +456,7 @@ save_ip() { } ############################################################################### -# Runs a bridge command and saves its output to the file. +# Runs a bridge command and saves its output to the incrementally built tar. # Globals: # None # Arguments: @@ -786,8 +770,8 @@ save_proc() { ( [ -e $f ] && $CP $V -r $f $TARDIR/proc ) || echo "$f not found" > $TARDIR/$f fi done - - chmod ugo+rw -R $DUMPDIR/$BASE/proc + $TAR $V -rhf $TARFILE -C $DUMPDIR --mode=+rw $BASE/proc + $RM $V -rf $TARDIR/proc } ############################################################################### @@ -838,7 +822,9 @@ save_proc_stats() { ( $CP $V -r $stats_file $TARDIR/proc_stats ) || echo "$stats_file error" > $TARDIR/$stats_file fi - chmod ugo+rw -R $DUMPDIR/$BASE/proc_stats + $TAR $V -rhf $TARFILE -C $DUMPDIR --mode=+rw $BASE/proc_stats + $RM $V -rf $TARDIR/proc_stats + $RM -rf $stats_file } ############################################################################### @@ -930,13 +916,16 @@ save_file() { local orig_path=$1 local supp_dir=$2 local gz_path="$TARDIR/$supp_dir/$(basename $orig_path)" + local tar_path="${BASE}/$supp_dir/$(basename $orig_path)" local do_gzip=${3:-true} + local do_tar_append=${4:-true} if [ ! -d "$TARDIR/$supp_dir" ]; then $MKDIR $V -p "$TARDIR/$supp_dir" fi if $do_gzip; then gz_path="${gz_path}.gz" + tar_path="${tar_path}.gz" if $NOOP; then echo "gzip -c $orig_path > $gz_path" else @@ -950,6 +939,11 @@ save_file() { fi fi + if $do_tar_append; then + ($TAR $V -rhf $TARFILE -C $DUMPDIR "$tar_path" \ + || abort "${EXT_PROCFS_SAVE_FAILED}" "tar append operation failed. 
Aborting to prevent data loss.") \ + && $RM $V -f "$gz_path" + fi end_t=$(date +%s%3N) echo "[ save_file:$orig_path] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO } @@ -1296,7 +1290,7 @@ collect_barefoot() { done for file in $(find /tmp/bf_logs -type f); do - save_file "${file}" log true + save_file "${file}" log true true done } @@ -1352,12 +1346,16 @@ save_log_files() { # don't gzip already-gzipped log files :) # do not append the individual files to the main tarball if [ -z "${file##*.gz}" ]; then - save_file $file log false + save_file $file log false false else - save_file $file log true + save_file $file log true false fi done + # Append the log folder to the main tarball + ($TAR $V -rhf $TARFILE -C $DUMPDIR ${BASE}/log \ + || abort "${EXT_TAR_FAILED}" "tar append operation failed. Aborting for safety") \ + && $RM $V -rf $TARDIR/log end_t=$(date +%s%3N) echo "[ TAR /var/log Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO @@ -1382,7 +1380,11 @@ save_warmboot_files() { else mkdir -p $TARDIR $CP $V -rf /host/warmboot $TARDIR - chmod ugo+rw -R $DUMPDIR/$BASE/warmboot + + ($TAR $V --warning=no-file-removed -rhf $TARFILE -C $DUMPDIR --mode=+rw \ + $BASE/warmboot \ + || abort "${EXT_TAR_FAILED}" "Tar append operation failed. Aborting for safety.") \ + && $RM $V -rf $TARDIR fi end_t=$(date +%s%3N) echo "[ Warm-boot Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO @@ -1544,121 +1546,103 @@ main() { /proc/pagetypeinfo /proc/partitions /proc/sched_debug /proc/slabinfo \ /proc/softirqs /proc/stat /proc/swaps /proc/sysvipc /proc/timer_list \ /proc/uptime /proc/version /proc/vmallocinfo /proc/vmstat \ - /proc/zoneinfo & - save_proc_stats & + /proc/zoneinfo \ + || abort "${EXT_PROCFS_SAVE_FAILED}" "Proc saving operation failed. Aborting for safety." 
+ save_proc_stats end_t=$(date +%s%3N) echo "[ Capture Proc State ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO - wait # Save all the processes within each docker - save_cmd "show services" services.summary & + save_cmd "show services" services.summary # Save reboot cause information - save_cmd "show reboot-cause" reboot.cause & - wait + save_cmd "show reboot-cause" reboot.cause local asic="$(/usr/local/bin/sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type)" # 1st counter snapshot early. Need 2 snapshots to make sense of counters trend. save_counter_snapshot $asic 1 - save_cmd "systemd-analyze blame" "systemd.analyze.blame" & - save_cmd "systemd-analyze dump" "systemd.analyze.dump" & - save_cmd "systemd-analyze plot" "systemd.analyze.plot.svg" & - wait - - save_platform_info & - save_cmd "show vlan brief" "vlan.summary" & - save_cmd "show version" "version" & - save_cmd "show platform summary" "platform.summary" & - wait - - save_cmd "cat /host/machine.conf" "machine.conf" & - save_cmd "cat /boot/config-$(uname -r)" "boot.conf" & - save_cmd "docker stats --no-stream" "docker.stats" & - wait - - save_cmd "sensors" "sensors" & - save_cmd "lspci -vvv -xx" "lspci" & - save_cmd "lsusb -v" "lsusb" & - save_cmd "sysctl -a" "sysctl" & - wait - - save_ip_info & - save_bridge_info & - wait - - save_frr_info & - - save_bgp_info & - save_evpn_info & - wait - - save_cmd "show interface status -d all" "interface.status" & - save_cmd "show interface transceiver presence" "interface.xcvrs.presence" & - save_cmd "show interface transceiver eeprom --dom" "interface.xcvrs.eeprom" & - save_cmd "show ip interface -d all" "ip.interface" & - wait - - save_cmd "lldpctl" "lldpctl" & + save_cmd "systemd-analyze blame" "systemd.analyze.blame" + save_cmd "systemd-analyze dump" "systemd.analyze.dump" + save_cmd "systemd-analyze plot" "systemd.analyze.plot.svg" + + save_platform_info + + save_cmd "show vlan brief" "vlan.summary" + save_cmd "show version" "version" + save_cmd 
"show platform summary" "platform.summary" + save_cmd "cat /host/machine.conf" "machine.conf" + save_cmd "cat /boot/config-$(uname -r)" "boot.conf" + save_cmd "docker stats --no-stream" "docker.stats" + + save_cmd "sensors" "sensors" + save_cmd "lspci -vvv -xx" "lspci" + save_cmd "lsusb -v" "lsusb" + save_cmd "sysctl -a" "sysctl" + + save_ip_info + save_bridge_info + + save_frr_info + save_bgp_info + save_evpn_info + + save_cmd "show interface status -d all" "interface.status" + save_cmd "show interface transceiver presence" "interface.xcvrs.presence" + save_cmd "show interface transceiver eeprom --dom" "interface.xcvrs.eeprom" + save_cmd "show ip interface -d all" "ip.interface" + + save_cmd "lldpctl" "lldpctl" if [[ ( "$NUM_ASICS" > 1 ) ]]; then for (( i=0; i<$NUM_ASICS; i++ )) do - save_cmd "docker exec lldp$i lldpcli show statistics" "lldp$i.statistics" & - save_cmd "docker logs bgp$i" "docker.bgp$i.log" & - save_cmd "docker logs swss$i" "docker.swss$i.log" & + save_cmd "docker exec lldp$i lldpcli show statistics" "lldp$i.statistics" + save_cmd "docker logs bgp$i" "docker.bgp$i.log" + save_cmd "docker logs swss$i" "docker.swss$i.log" done else - save_cmd "docker exec lldp lldpcli show statistics" "lldp.statistics" & - save_cmd "docker logs bgp" "docker.bgp.log" & - save_cmd "docker logs swss" "docker.swss.log" & + save_cmd "docker exec lldp lldpcli show statistics" "lldp.statistics" + save_cmd "docker logs bgp" "docker.bgp.log" + save_cmd "docker logs swss" "docker.swss.log" fi - wait - - save_cmd "ps aux" "ps.aux" & - save_cmd "top -b -n 1" "top" & - save_cmd "free" "free" & - wait - save_cmd "vmstat 1 5" "vmstat" & - save_cmd "vmstat -m" "vmstat.m" & - save_cmd "vmstat -s" "vmstat.s" & - wait - save_cmd "mount" "mount" & - save_cmd "df" "df" & - save_cmd "dmesg" "dmesg" & - wait - - save_nat_info & - save_bfd_info & - wait - save_redis_info & + + save_cmd "ps aux" "ps.aux" + save_cmd "top -b -n 1" "top" + save_cmd "free" "free" + save_cmd "vmstat 1 5" 
"vmstat" + save_cmd "vmstat -m" "vmstat.m" + save_cmd "vmstat -s" "vmstat.s" + save_cmd "mount" "mount" + save_cmd "df" "df" + save_cmd "dmesg" "dmesg" + + save_nat_info + save_bfd_info + save_redis_info if $DEBUG_DUMP then - save_dump_state_all_ns & + save_dump_state_all_ns fi - wait - save_cmd "docker ps -a" "docker.ps" & - save_cmd "docker top pmon" "docker.pmon" & + save_cmd "docker ps -a" "docker.ps" + save_cmd "docker top pmon" "docker.pmon" if [[ -d ${PLUGINS_DIR} ]]; then local -r dump_plugins="$(find ${PLUGINS_DIR} -type f -executable)" for plugin in $dump_plugins; do # save stdout output of plugin and gzip it - save_cmd "$plugin" "$(basename $plugin)" true & + save_cmd "$plugin" "$(basename $plugin)" true done fi - wait - save_cmd "dpkg -l" "dpkg" & - save_cmd "who -a" "who" & - save_cmd "swapon -s" "swapon" & - wait - save_cmd "hdparm -i /dev/sda" "hdparm" & - save_cmd "ps -AwwL -o user,pid,lwp,ppid,nlwp,pcpu,pri,nice,vsize,rss,tty,stat,wchan:12,start,bsdtime,command" "ps.extended" & + save_cmd "dpkg -l" "dpkg" + save_cmd "who -a" "who" + save_cmd "swapon -s" "swapon" + save_cmd "hdparm -i /dev/sda" "hdparm" + save_cmd "ps -AwwL -o user,pid,lwp,ppid,nlwp,pcpu,pri,nice,vsize,rss,tty,stat,wchan:12,start,bsdtime,command" "ps.extended" - save_saidump & - wait + save_saidump if [ "$asic" = "barefoot" ]; then collect_barefoot @@ -1682,7 +1666,6 @@ main() { $RM $V -rf $TARDIR $MKDIR $V -p $TARDIR $MKDIR $V -p $LOGDIR - # Copying the /etc files to a directory and then tar it $CP -r /etc $TARDIR/etc rm_list=$(find -L $TARDIR/etc -maxdepth 5 -type l) @@ -1694,22 +1677,34 @@ main() { # Remove secret from /etc files before tar remove_secret_from_etc_files $TARDIR - # Remove unecessary files - $RM $V -rf $TARDIR/etc/alternatives $TARDIR/etc/passwd* \ - $TARDIR/etc/shadow* $TARDIR/etc/group* $TARDIR/etc/gshadow* \ - $TARDIR/etc/ssh* $TARDIR/etc/mlnx $TARDIR/etc/mft \ - $TARDIR/etc/ssl/certs/ $TARDIR/etc/ssl/private/* - rm_list=$(find -L $TARDIR -type f \( -iname 
\*.cer -o -iname \*.crt -o \ - -iname \*.pem -o -iname \*.key -o -iname \*snmpd.conf\* -o -iname \*get_creds\* \)) - if [ ! -z "$rm_list" ] - then - rm $rm_list - fi + start_t=$(date +%s%3N) + ($TAR $V --warning=no-file-removed -rhf $TARFILE -C $DUMPDIR --mode=+rw \ + --exclude="etc/alternatives" \ + --exclude="*/etc/passwd*" \ + --exclude="*/etc/shadow*" \ + --exclude="*/etc/group*" \ + --exclude="*/etc/gshadow*" \ + --exclude="*/etc/ssh*" \ + --exclude="*get_creds*" \ + --exclude="*snmpd.conf*" \ + --exclude="*/etc/mlnx" \ + --exclude="*/etc/mft" \ + --exclude="*/etc/sonic/*.cer" \ + --exclude="*/etc/sonic/*.crt" \ + --exclude="*/etc/sonic/*.pem" \ + --exclude="*/etc/sonic/*.key" \ + --exclude="*/etc/ssl/*.pem" \ + --exclude="*/etc/ssl/certs/*" \ + --exclude="*/etc/ssl/private/*" \ + $BASE/etc \ + || abort "${EXT_TAR_FAILED}" "Tar append operation failed. Aborting for safety.") \ + && $RM $V -rf $TARDIR + end_t=$(date +%s%3N) + echo "[ TAR /etc Files ] : $(($end_t-$start_t)) msec" >> $TECHSUPPORT_TIME_INFO - save_log_files & - save_crash_files & - save_warmboot_files & - wait + save_log_files + save_crash_files + save_warmboot_files if [[ "$asic" = "mellanox" ]]; then collect_mellanox_dfw_dumps @@ -1725,8 +1720,6 @@ finalize() { # Save techsupport timing profile info save_file $TECHSUPPORT_TIME_INFO log false - save_to_tar - if $DO_COMPRESS; then RC=0 $GZIP $V $TARFILE || RC=$?