exec growpart
exec sgdisk --info # read-only
exec sgdisk --pretend # read-only
exec sgdisk --backup # read-only copy
# modification of disk starts
exec sgdisk --move-second-header --delete=PART --new=PART --typecode --partition-guid --change-name
# now that sgdisk has *closed* the filehandle on the disk, systemd-udevd will
# get an inotify signal and trigger udevd to run udev scripts on the disk.
# this includes the *removal* of symlinks due to the --delete portion of sgdisk call
# and following the removal, the --new will trigger the add run on the rules which would
# recreate the symlinks.
# update kernel partition sizes; this is an ioctl so it does not trigger a udev event
exec partx --update
# the kernel has the new partition sizes, and udev scripts/events are all queued (and possibly in flight)
exit growpart
cloud-init invokes get_size() operation which:
# this is where the race occurs if the symlink created by udev is *not* present
os.open(/dev/disk/by-id/fancy-symlink-with-partuuid-points-to-sdb1)
So, you're suggesting that somehow _not all_ of the uevents triggered by the sgdisk command in growpart *wouldn't* have been queued before we call udevadm settle?
If some other events are happening how is cloud-init to know such that it can take action to "handle this race" more robustly?
Lastly, if there is a *race* in the symlink creation/removal/delay in uevent propagation, why is that a userspace — let alone a cloud-init — issue? This isn't universally reproducible; rather, it's pretty narrow circumstances between certain kernels and udevs, all the while the growpart/cloud-init code remains the same.
The sequence is:
exec growpart
exec sgdisk --info # read-only
exec sgdisk --pretend # read-only
exec sgdisk --backup # read-only copy
# modification of disk starts
exec sgdisk --move-second-header --delete=PART --new=PART --typecode --partition-guid --change-name
# now that sgdisk has *closed* the filehandle on the disk, systemd-udevd will
# get an inotify signal and trigger udevd to run udev scripts on the disk.
# this includes the *removal* of symlinks due to the --delete portion of sgdisk call
# and following the removal, the --new will trigger the add run on the rules which would
# recreate the symlinks.
# update kernel partition sizes; this is an ioctl so it does not trigger a udev event
exec partx --update
# the kernel has the new partition sizes, and udev scripts/events are all queued (and possibly in flight)
exit growpart
cloud-init invokes get_size() operation which:
# this is where the race occurs if the symlink created by udev is *not* present
os.open(/dev/disk/by-id/fancy-symlink-with-partuuid-points-to-sdb1)
Dan had put a udevadm settle in this spot like so
def get_size(filename):
    util.subp(['udevadm', 'settle'])
    os.open(....)
So, you're suggesting that somehow _not all_ of the uevents triggered by the sgdisk command in growpart *wouldn't* have been queued before we call udevadm settle?
If some other events are happening how is cloud-init to know such that it can take action to "handle this race" more robustly?
Lastly, if there is a *race* in the symlink creation/removal/delay in uevent propagation, why is that a userspace — let alone a cloud-init — issue? This isn't universally reproducible; rather, it's pretty narrow circumstances between certain kernels and udevs, all the while the growpart/cloud-init code remains the same.