Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug/stoneship #82

Merged
merged 2 commits into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions scripts/admin-node-scripts/collectBIOSLogsFrozenBPNodes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash

# Copyright 2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
# Version 2.0 (the "License"); you may not use this file except
# in compliance with the License.
#
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# set -e
# set -o xtrace

# Compute nodes whose logs are collected by this script.
computeNodes=(
    x9000c3s0b1
    x9000c3s2b0
    x9000c3s2b1
    x9000c3s3b1
    x9000c3s4b0
    x9000c3s4b1
    x9000c3s5b0
    x9000c3s5b1
    x9000c3s6b1
)

# Local directory that receives one <node>.log file per compute node.
logDir=/root/ajf/frozen-bp-logs

# Copy /var/log/n0/current from every node, announcing each node first.
for node in "${computeNodes[@]}"; do
    printf "Compute node %s\n" "$node"
    scp "${node}:/var/log/n0/current" "${logDir}/${node}.log"
done
printf "\n"


Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,24 @@ do
rsync lvm.sh "$nC":

rabbitVGS=$(ssh "$nC" 'vgs | grep rabbit')
if [ -z "${rabbitVGS[@]}" ];
if [ -z "${rabbitVGS[*]}" ];
then
# Retrieve the list of namespaces from each KIOXIA or 'SAMSUNG MZ3LO1T9HCJR' drive seen on the compute node and count this number of namespaces. We expect only 1
nameSpaceCount=$(ssh "$nC" 'for DRIVE in $(ls -v /dev/nvme* | grep -E "nvme[[:digit:]]+n[[:digit:]]+$"); do if [ "$(nvme id-ctrl ${DRIVE} | grep -e KIOXIA -e 'SAMSUNG MZ3LO1T9HCJR')" != "" ]; then nvme id-ns $DRIVE | grep "NVME"; fi; done | uniq | wc -l')
nameSpaceCount=$(ssh "$nC" 'for DRIVE in $(ls -v /dev/nvme* | grep -E "nvme[[:digit:]]+n[[:digit:]]+$"); do if [ "$(nvme id-ctrl ${DRIVE} | grep -e KIOXIA -e MZ3LO1T9HCJR)" != "" ]; then nvme id-ns $DRIVE | grep "NVME"; fi; done | uniq | wc -l')
if ((nameSpaceCount > 1)); then
printf "Too many namespaces(%d), please examine your setup\n" "$nameSpaceCount"
exit 1
fi

# Pull the namespace list from each KIOXIA or 'SAMSUNG MZ3LO1T9HCJR' drive, we know there is only 1 now.
nameSpaceStr=$(ssh "$nC" 'for DRIVE in $(ls -v /dev/nvme* | grep -E "nvme[[:digit:]]+n[[:digit:]]+$"); do if [ "$(nvme id-ctrl ${DRIVE} | grep -e KIOXIA -e 'SAMSUNG MZ3LO1T9HCJR')" != "" ]; then nvme id-ns $DRIVE | grep "NVME"; fi; done | uniq')
nameSpaceID=$(echo $nameSpaceStr | sed 's|:||g' | awk '{print $4}')
nameSpaceStr=$(ssh "$nC" 'for DRIVE in $(ls -v /dev/nvme* | grep -E "nvme[[:digit:]]+n[[:digit:]]+$"); do if [ "$(nvme id-ctrl ${DRIVE} | grep -e KIOXIA -e MZ3LO1T9HCJR)" != "" ]; then nvme id-ns $DRIVE | grep "NVME"; fi; done | uniq')
nameSpaceID=$(echo "$nameSpaceStr" | sed 's|:||g' | awk '{print $4}')

# Create an LVM volume from the namespaces present
# shellcheck disable=SC2029
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a vscode thing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I use shellcheck to lint my scripts. This comment tells shellcheck to avoid giving me the SC2029 warning for the line below.

ssh "$nC" "./lvm.sh create rabbit $nameSpaceID"
else
printf "%s\n", "${rabbitVGS[*]}"
fi
done
printf "\n"


Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,41 @@
# set -e
# set -o xtrace

rabbitPXName="x1000c[0-7]j7b0n0"
rabbitSXName="x1000c[0-7]j4b0"
shopt -s expand_aliases

# usage: print the help text for this script on stdout.
# The only expansion inside the text is the script's own invocation name.
usage() {
    local script=$0

    cat <<USAGE_TEXT
Powercycle the specified Rabbits.
Use 'pdsh' style specifiers to specify multiple Rabbit nodes
See: https://linux.die.net/man/1/pdsh for details

Usage: ${script} [-h] [-t] [RABBIT-P-X-NAMES] [RABBIT-S-X-NAMES]

X-NAMES:
# Texas TDS systems
x9000c[0-7]rbt7b0n0 Chassis 0..7, Rabbit P (rbt7), board 0, node 0
x9000c[0-7]rbt4b0 Chassis 0..7, Rabbit S (rbt4), board 0

# Stoneship TDS systems
x1000c[0-7]j7b0n0 Chassis 0..7, Rabbit P (j7), board 0, node 0
x1000c[0-7]j4b0 Chassis 0..7, Rabbit S (j4), board 0

Arguments:
-h display this help
-t time each command

Examples:
# Texas TDS
./powercycle.sh -t x9000c[1,3]rbt7b0n0 x9000c[1,3]rbt4b0 # c[1] - tx-peter, c[3] - tx-bugs

# Stoneship TDS
./powercycle.sh -t x1000c[0-7]j7b0n0 x1000c[0-7]j4b0 # c[0-7] all Rabbits
USAGE_TEXT
}


rabbitPXName="${1:-x9000c3rbt7b0n0}"
rabbitSXName="${2:-x9000c3rbt4b0}"

paxControl() {
local op=$1
Expand Down
64 changes: 64 additions & 0 deletions scripts/admin-node-scripts/updateDriveFirmware.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright 2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
# Version 2.0 (the "License"); you may not use this file except
# in compliance with the License.
#
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# set -e
# set -o xtrace

# usage: print the help text for this script on stdout.
# The only expansion inside the text is the script's own invocation name.
usage() {
    local script=$0

    cat <<USAGE_TEXT
Query drive firmware version for all drives in a Rabbit. Update drives that are out-of-date.

Assumes that the following are installed on the Rabbit:
- /root/nnf-ec
- /root/tools/nvme.sh
- /root/KIOXIA/<firmware-file>

Usage: ${script} [-h] [RABBIT-XNAME] [EXPECTED-FIRMWARE] [FIRMWARE-FILENAME]

Arguments:
-h display this help

USAGE_TEXT
}

# Positional arguments (see usage):
#   $1 - xname of the Rabbit to query/update; used as the ssh target
#   $2 - firmware revision the drives are expected to report
#   $3 - firmware image handed to fw-download; only needed when an update is required

# Honor the documented -h flag before treating $1 as a Rabbit xname.
if [ "$1" == "-h" ]; then
    usage
    exit 0
fi

# Without a Rabbit and an expected revision the ssh calls below cannot work, and
# an empty expected revision could compare equal to an empty query result and
# falsely report "Firmware up to date".
if [ -z "$1" ] || [ -z "$2" ]; then
    usage >&2
    exit 1
fi

rabbit=$1
expectedFirmware=$2
firmwareFile=$3

# Run nnf-ec to initialize PAX chips and drives
ssh "$rabbit" ./nnf-ec -initializeAndExit > /dev/null 2>&1

# Retrieve a list of unique firmware levels.
# 'uniq' only collapses adjacent duplicates, so if the drives do not all report
# the same revision $firmware holds several lines and the comparison below
# fails, which triggers the update path.
firmware=$(ssh "$rabbit" "tools/nvme.sh cmd id-ctrl | grep -e \"^fr \" | uniq")
firmware=$(echo "$firmware" | awk '{print $3}')
echo "$firmware"

if [ "$firmware" == "$expectedFirmware" ]; then
    printf "Firmware up to date\n"
else
    # An update is required, so the firmware image must have been supplied.
    if [ -z "$firmwareFile" ]; then
        printf "Firmware mismatch, but no firmware file was specified\n" >&2
        exit 1
    fi

    printf "Firmware mismatch, downloading %s %s\n" "$expectedFirmware" "$firmwareFile"

    # Download the image and commit it to each of firmware slots 1..3.
    for (( slot=1; slot <= 3; ++slot ));
    do
        # shellcheck disable=SC2029
        ssh "$rabbit" "tools/nvme.sh cmd fw-download --fw=$firmwareFile --xfer=256"
        # The commit action must be passed as the --action option (it was
        # previously misspelled '==action=1' and therefore never applied).
        # NOTE(review): per the nvme-cli fw-commit documentation, action 1
        # replaces the slot image and activates it at the next reset - confirm
        # that switchtec-nvme behaves the same.
        # shellcheck disable=SC2029
        ssh "$rabbit" "tools/nvme.sh cmd fw-activate --slot=$slot --action=1"
    done
fi
printf "\n"
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,5 @@ do
ssh "$rabbit" "cd /usr/src/switchtec-user && ./configure && make install && cp /usr/src/switchtec-nvme-cli/switchtec-nvme /usr/sbin"

# Launch nnf-ec to initialize PAX's, drives and compute node endpoints
# type CTRL-C when you see "Starting HTTP Server {"address": ":8080"}"
# to proceed to the next Rabbit.
ssh "$rabbit" "./nnf-ec --initializeAndExit"
done
67 changes: 52 additions & 15 deletions tools/nvme.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,25 +65,61 @@ Drive Firmware upgrade:
EOF
}

# execute <fn<path>> <args...>
execute() {
local FUNCTION=$1 ARGS=( "${@:2}" )
# executeOnSwitch <fn<path>> <switch> <args...>
#
# Calls the shell function <fn> once for every PDFID that getPDFIDs reports for
# <switch>. Each call receives "<pdfid>@<switch>" as its first argument,
# followed by <args...>. Exits with status 1 if <fn> is not a shell function.
executeOnSwitch() {
    local FUNCTION=$1
    local SWITCH=$2
    local ARGS=( "${@:3}" )
    local kind pdfid

    # shellcheck disable=SC2086
    kind=$(type -t $FUNCTION)
    if [[ "$kind" != "function" ]]; then
        echo "$1 is not a function."
        exit 1
    fi

    # One array element per line of getPDFIDs output.
    mapfile -t PDFIDS < <(getPDFIDs "$SWITCH")
    for pdfid in "${PDFIDS[@]}"; do
        "$FUNCTION" "${pdfid}@${SWITCH}" "${ARGS[@]}"
    done
}

# execute <fn<path>> <args...>
#
# Runs executeOnSwitch for <fn> on /dev/switchtec0 and then on /dev/switchtec1,
# one switch after the other, passing <args...> through unchanged.
execute() {
    local FUNCTION=$1
    local ARGS=( "${@:2}" )
    local idx

    SWITCHES=("/dev/switchtec0" "/dev/switchtec1")
    for idx in "${!SWITCHES[@]}"; do
        SWITCH=${SWITCHES[idx]}
        executeOnSwitch "$FUNCTION" "$SWITCH" "${ARGS[@]}"
    done
}

# executeParallel <fn<path>> <args...>
#
# Runs executeOnSwitch for <fn> on /dev/switchtec0 and /dev/switchtec1 at the
# same time, one background job per switch. Each job's stdout and stderr are
# captured in a private temporary directory and printed, in switch order, once
# both jobs have finished, so the output of the two switches is not interleaved.
#
# When <fn> is "cmd", a progress line naming the sub-command (the first of
# <args...>) is printed for each switch before its job is started.
#
# Returns 1 if the temporary directory cannot be created.
executeParallel() {
    local FUNCTION=$1 ARGS=( "${@:2}" )
    local resultDir functionName

    # Capture results in a fresh private directory rather than in predictably
    # named files in the current directory: this works from a read-only or
    # shared working directory, and cleanup cannot remove unrelated files.
    resultDir=$(mktemp -d) || {
        echo "Unable to create a temporary directory for results" >&2
        return 1
    }

    SWITCHES=("/dev/switchtec0" "/dev/switchtec1")
    for SWITCH in "${SWITCHES[@]}";
    do
        # To see the output as commands run, pipe the job through 'tee' instead
        # of redirecting to a file. That produces a mixed output that is
        # difficult to read, but provides feedback that something is happening.

        functionName="$FUNCTION"
        if [ "$FUNCTION" == "cmd" ];
        then
            functionName="$2"
            printf "Executing %s for each drive on %s\n" "$functionName" "$SWITCH"
        fi
        executeOnSwitch "$FUNCTION" "$SWITCH" "${ARGS[@]}" > "$resultDir/$(basename "$SWITCH")" 2>&1 &
    done
    # Block until every background job has finished.
    wait

    for SWITCH in "${SWITCHES[@]}";
    do
        cat "$resultDir/$(basename "$SWITCH")"
    done

    rm -rf -- "$resultDir"
}

alias TIME=""
Expand Down Expand Up @@ -121,46 +157,46 @@ case $1 in
echo "Creating Namespaces on $DRIVE with size ${SIZE}"
TIME switchtec-nvme create-ns "$DRIVE" --nsze="$SECTORS" --ncap="$SECTORS" --block-size=4096 --nmic=1
}
execute create_ns "${2:-0}"
executeParallel create_ns "${2:-0}"
;;
attach)
function attach_ns() {
local DRIVE=$1 NAMESPACE=$2 CONTROLLER=$3
echo "Attaching Namespace $NAMESPACE on $DRIVE to Controller $CONTROLLER"
TIME switchtec-nvme attach-ns "$DRIVE" --namespace-id="$NAMESPACE" --controllers="$CONTROLLER"
}
execute attach_ns "${2:-1}" "${3:-3}"
executeParallel attach_ns "${2:-1}" "${3:-3}"
;;
delete)
function delete_ns() {
local DRIVE=$1 NAMESPACE=$2
echo "Deleting Namespaces $NAMESPACE on $DRIVE"
TIME switchtec-nvme delete-ns "$DRIVE" --namespace-id="$NAMESPACE"
}
execute delete_ns "${2:-1}"
executeParallel delete_ns "${2:-1}"
;;
detach)
function detach_ns() {
local DRIVE=$1 NAMESPACE=$2 CONTROLLER=$3
echo "Detaching Namespace $NAMESPACE on $DRIVE from Controller $CONTROLLER"
TIME switchtec-nvme detach-ns "$DRIVE" --namespace-id="$NAMESPACE" --controllers="$CONTROLLER"
}
execute detach_ns "${2:-1}" "${3:-3}"
executeParallel detach_ns "${2:-1}" "${3:-3}"
;;
list)
function list_ns() {
local DRIVE=$1
echo "Namespaces on $DRIVE"
TIME switchtec-nvme list-ns "$DRIVE" --all
}
execute list_ns
executeParallel list_ns
;;
list-pdfid)
function list_pfid() {
local DRIVE=$1
echo "$DRIVE"
}
execute list_pfid
executeParallel list_pfid
;;

cmd)
Expand All @@ -169,7 +205,8 @@ case $1 in
echo "Execute on $DRIVE $CMD" "${ARGS[@]}"
TIME switchtec-nvme "$CMD" "$DRIVE" "${ARGS[@]}"
}
execute cmd "${@:2}"
# execute cmd "${@:2}"
executeParallel cmd "${@:2}"
;;
*)
usage
Expand Down