From ba00a21e54db283c9fd745c5c7e3dcb68bb8256f Mon Sep 17 00:00:00 2001 From: Dusty Mabe Date: Thu, 16 Apr 2020 17:02:55 -0400 Subject: [PATCH] overlay: 15coreos-copy-firstboot-network: handle race condition Add a short retry loop to handle a race condition where After=dev-disk-by\x2dlabel-boot.device doesn't always seem to be sufficient. --- .../coreos-copy-firstboot-network.sh | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/overlay.d/05core/usr/lib/dracut/modules.d/15coreos-copy-firstboot-network/coreos-copy-firstboot-network.sh b/overlay.d/05core/usr/lib/dracut/modules.d/15coreos-copy-firstboot-network/coreos-copy-firstboot-network.sh index 32af9d7487..38f35454bc 100755 --- a/overlay.d/05core/usr/lib/dracut/modules.d/15coreos-copy-firstboot-network/coreos-copy-firstboot-network.sh +++ b/overlay.d/05core/usr/lib/dracut/modules.d/15coreos-copy-firstboot-network/coreos-copy-firstboot-network.sh @@ -4,6 +4,8 @@ set -euo pipefail # For a description of how this is used see coreos-copy-firstboot-network.service bootmnt=/mnt/boot_partition +mkdir -p ${bootmnt} +bootdev=/dev/disk/by-label/boot firstboot_network_dir_basename="coreos-firstboot-network" initramfs_firstboot_network_dir="${bootmnt}/${firstboot_network_dir_basename}" initramfs_network_dir="/run/NetworkManager/system-connections/" @@ -11,10 +13,40 @@ realroot_firstboot_network_dir="/boot/${firstboot_network_dir_basename}" # Mount /boot. Note that we mount /boot but we don't unmount boot because we # are run in a systemd unit with MountFlags=slave so it is unmounted for us. 
-mkdir -p ${bootmnt} -# mount as read-only since we don't strictly need write access and we may be +# Mount as read-only since we don't strictly need write access and we may be # running alongside other code that also has it mounted ro -mount -o ro /dev/disk/by-label/boot ${bootmnt} +mountboot() { + # Wait for up to 5 seconds for the boot device to be available + # The After=...*boot.device in the systemd unit should be enough + # but there appears to be some race in the kernel where the link under + # /dev/disk/by-label exists but mount is not able to use the device yet. + # We saw errors like this in CI: + # + # [ 4.045181] systemd[1]: Found device /dev/disk/by-label/boot. + # [ OK ] Found device /dev/disk/by-label/boot + # [ 4.051500] systemd[1]: Starting Copy CoreOS Firstboot Networking Config... + # Starting Copy CoreOS Firstboot Networking Config + # [ 4.060573] vda: vda1 vda2 vda3 vda4 + # [ 4.063296] coreos-copy-firstboot-network[479]: mount: /mnt/boot_partition: special device /dev/disk/by-label/boot does not exist. + # + mounted=0 + for x in {1..5}; do + if mount -o ro ${bootdev} ${bootmnt}; then + echo "info: ${bootdev} successfully mounted." + mounted=1 + break + else + echo "info: retrying ${bootdev} mount in 1 second..." + sleep 1 + fi + done + if [ "${mounted}" == "0" ]; then + echo "error: ${bootdev} mount did not succeed" 1>&2 + return 1 + fi +} + +mountboot || exit 1 if [ -n "$(ls -A ${initramfs_firstboot_network_dir} 2>/dev/null)" ]; then # Clear out any files that may have already been generated from