From 5ba2f8225b55b02a8f57b881c3a920ac57bd766c Mon Sep 17 00:00:00 2001
From: Peter Schuurman
Date: Wed, 7 Feb 2024 16:11:03 -0800
Subject: [PATCH] Add support for a multi-zone volumeHandle

---
 cmd/gce-pd-csi-driver/main.go                 |  19 +-
 pkg/common/constants.go                       |   8 +
 pkg/common/utils.go                           |  12 ++
 pkg/gce-cloud-provider/compute/cloud-disk.go  |  13 ++
 .../compute/cloud-disk_test.go                |  45 +++++
 pkg/gce-pd-csi-driver/controller.go           | 170 +++++++++++++++---
 pkg/gce-pd-csi-driver/controller_test.go      |   9 +-
 pkg/gce-pd-csi-driver/gce-pd-driver.go        |  17 +-
 pkg/gce-pd-csi-driver/gce-pd-driver_test.go   |   3 +-
 test/e2e/tests/multi_zone_e2e_test.go         | 106 ++++++++++-
 test/e2e/tests/resize_e2e_test.go             |   6 +-
 test/e2e/tests/single_zone_e2e_test.go        |  63 ++++++-
 test/e2e/utils/utils.go                       |   2 +-
 test/remote/client-wrappers.go                |  12 +-
 test/sanity/sanity_test.go                    |   3 +-
 15 files changed, 432 insertions(+), 56 deletions(-)

diff --git a/cmd/gce-pd-csi-driver/main.go b/cmd/gce-pd-csi-driver/main.go
index 8a122ee64..cc6b8b2ba 100644
--- a/cmd/gce-pd-csi-driver/main.go
+++ b/cmd/gce-pd-csi-driver/main.go
@@ -70,12 +70,16 @@ var (
 	maxConcurrentFormatAndMount = flag.Int("max-concurrent-format-and-mount", 1, "If set then format and mount operations are serialized on each node. This is stronger than max-concurrent-format as it includes fsck and other mount operations")
 	formatAndMountTimeout       = flag.Duration("format-and-mount-timeout", 1*time.Minute, "The maximum duration of a format and mount operation before another such operation will be started. Used only if --serialize-format-and-mount")
 	fallbackRequisiteZonesFlag  = flag.String("fallback-requisite-zones", "", "Comma separated list of requisite zones that will be used if there are not sufficient zones present in requisite topologies when provisioning a disk")
+	enableStoragePoolsFlag      = flag.Bool("enable-storage-pools", false, "If set to true, the CSI Driver will allow volumes to be provisioned in Storage Pools")
+
+	multiZoneVolumeHandleDiskTypesFlag = flag.String("multi-zone-volume-handle-disk-types", "", "Comma separated list of allowed disk types that can use the multi-zone volumeHandle. Used only if --multi-zone-volume-handle-enable")
+	multiZoneVolumeHandleEnableFlag    = flag.Bool("multi-zone-volume-handle-enable", false, "If set to true, the multi-zone volumeHandle feature will be enabled")
 
-	enableStoragePoolsFlag = flag.Bool("enable-storage-pools", false, "If set to true, the CSI Driver will allow volumes to be provisioned in Storage Pools")
 	computeEnvironment        gce.Environment = gce.EnvironmentProduction
 	computeEndpoint           *url.URL
-	version                   string
 	allowedComputeEnvironment = []gce.Environment{gce.EnvironmentStaging, gce.EnvironmentProduction}
+
+	version string
 )
 
 const (
@@ -156,9 +160,16 @@ func handle() {
 	// Initialize identity server
 	identityServer := driver.NewIdentityServer(gceDriver)
 
-	// Initilaize requisite zones
+	// Initialize requisite zones
 	fallbackRequisiteZones := strings.Split(*fallbackRequisiteZonesFlag, ",")
 
+	// Initialize multi-zone disk types
+	multiZoneVolumeHandleDiskTypes := strings.Split(*multiZoneVolumeHandleDiskTypesFlag, ",")
+	multiZoneVolumeHandleConfig := driver.MultiZoneVolumeHandleConfig{
+		Enable:    *multiZoneVolumeHandleEnableFlag,
+		DiskTypes: multiZoneVolumeHandleDiskTypes,
+	}
+
 	// Initialize requirements for the controller service
 	var controllerServer *driver.GCEControllerServer
 	if *runControllerService {
@@ -168,7 +179,7 @@ func handle() {
 		}
 		initialBackoffDuration := time.Duration(*errorBackoffInitialDurationMs) * time.Millisecond
 		maxBackoffDuration := time.Duration(*errorBackoffMaxDurationMs) * time.Millisecond
-		controllerServer = driver.NewControllerServer(gceDriver, cloudProvider, initialBackoffDuration, maxBackoffDuration, fallbackRequisiteZones, *enableStoragePoolsFlag)
+		controllerServer = driver.NewControllerServer(gceDriver, cloudProvider, initialBackoffDuration, maxBackoffDuration, fallbackRequisiteZones, *enableStoragePoolsFlag, multiZoneVolumeHandleConfig)
 	} else if *cloudConfigFilePath != "" {
 		klog.Warningf("controller service is disabled but cloud config given - it has no effect")
 	}
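A note on the flag wiring above: strings.Split always returns at least one element, so with --multi-zone-volume-handle-disk-types unset, DiskTypes ends up as a one-element slice holding the empty string rather than an empty slice. That default is still safe, since no real disk type equals "". The sketch below illustrates the behavior; the trimming parser in it is hypothetical, not something the patch adds.

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

func main() {
	// Splitting the empty flag value yields [""], not an empty slice:
	diskTypes := strings.Split("", ",")
	fmt.Println(len(diskTypes), diskTypes) // 1 [""]

	// Safe in practice: no real disk type matches the empty string.
	fmt.Println(slices.Contains(diskTypes, "pd-standard")) // false

	// A hypothetical guard, if a truly empty list were preferred:
	parse := func(s string) []string {
		if strings.TrimSpace(s) == "" {
			return nil
		}
		return strings.Split(s, ",")
	}
	fmt.Println(len(parse(""))) // 0
}
```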
diff --git a/pkg/common/constants.go b/pkg/common/constants.go
index 9ca91f40c..9851b11d1 100644
--- a/pkg/common/constants.go
+++ b/pkg/common/constants.go
@@ -24,4 +24,12 @@ const (
 	VolumeAttributePartition = "partition"
 
 	UnspecifiedValue = "UNSPECIFIED"
+
+	// Keyword indicating a 'multi-zone' volumeHandle. Replaces the {zone} segment of the
+	// volumeHandle, e.g. projects/{project}/zones/multi-zone/disks/{name} vs.
+	// projects/{project}/zones/{zone}/disks/{name}
+	MultiZoneValue = "multi-zone"
+
+	// Label that is set on a disk when it is used by a 'multi-zone' volumeHandle
+	MultiZoneLabel = "goog-gke-multi-zone"
 )
diff --git a/pkg/common/utils.go b/pkg/common/utils.go
index b004f9d6c..36e628e5d 100644
--- a/pkg/common/utils.go
+++ b/pkg/common/utils.go
@@ -478,3 +478,15 @@ func UnorderedSlicesEqual(slice1 []string, slice2 []string) bool {
 	}
 	return true
 }
+
+func VolumeIdAsMultiZone(volumeId string) (string, error) {
+	splitId := strings.Split(volumeId, "/")
+	if len(splitId) != volIDTotalElements {
+		return "", fmt.Errorf("failed to get id components. Expected projects/{project}/zones/{zone}/disks/{name}. Got: %s", volumeId)
+	}
+	if splitId[volIDToplogyKey] != "zones" {
+		return "", fmt.Errorf("expected id to be zonal. Got: %s", volumeId)
+	}
+	splitId[volIDToplogyValue] = MultiZoneValue
+	return strings.Join(splitId, "/"), nil
+}
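The helper above rewrites only the zone segment of a zonal volumeHandle and rejects anything else, including regional handles (whose topology key is "regions"). A small illustrative test of the intended behavior; this is a sketch against the helper as written, not a test the patch adds:

```go
package common

import "testing"

// Sketch of expected VolumeIdAsMultiZone behavior.
func TestVolumeIdAsMultiZoneSketch(t *testing.T) {
	got, err := VolumeIdAsMultiZone("projects/my-project/zones/us-central1-a/disks/my-disk")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	want := "projects/my-project/zones/multi-zone/disks/my-disk"
	if got != want {
		t.Errorf("got %q, want %q", got, want)
	}

	// Regional handles use "regions" in the topology-key position and are rejected.
	if _, err := VolumeIdAsMultiZone("projects/my-project/regions/us-central1/disks/my-disk"); err == nil {
		t.Errorf("expected error for regional volume handle")
	}
}
```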
diff --git a/pkg/gce-cloud-provider/compute/cloud-disk.go b/pkg/gce-cloud-provider/compute/cloud-disk.go
index 5649a952f..b704c1cf4 100644
--- a/pkg/gce-cloud-provider/compute/cloud-disk.go
+++ b/pkg/gce-cloud-provider/compute/cloud-disk.go
@@ -290,3 +290,16 @@ func (d *CloudDisk) GetEnableStoragePools() bool {
 		return false
 	}
 }
+
+func (d *CloudDisk) GetLabels() map[string]string {
+	switch {
+	case d.disk != nil:
+		return d.disk.Labels
+	case d.betaDisk != nil:
+		return d.betaDisk.Labels
+	case d.alphaDisk != nil:
+		return d.alphaDisk.Labels
+	default:
+		return nil
+	}
+}
diff --git a/pkg/gce-cloud-provider/compute/cloud-disk_test.go b/pkg/gce-cloud-provider/compute/cloud-disk_test.go
index bdf8c45ff..0efa1062b 100644
--- a/pkg/gce-cloud-provider/compute/cloud-disk_test.go
+++ b/pkg/gce-cloud-provider/compute/cloud-disk_test.go
@@ -20,6 +20,7 @@ package gcecloudprovider
 import (
 	"testing"
 
+	"github.com/google/go-cmp/cmp"
 	computealpha "google.golang.org/api/compute/v0.alpha"
 	computebeta "google.golang.org/api/compute/v0.beta"
 	computev1 "google.golang.org/api/compute/v1"
@@ -129,3 +130,47 @@ func TestGetEnableStoragePools(t *testing.T) {
 		}
 	}
 }
+
+func TestGetLabels(t *testing.T) {
+	testCases := []struct {
+		name       string
+		cloudDisk  *CloudDisk
+		wantLabels map[string]string
+	}{
+		{
+			name: "v1 disk labels",
+			cloudDisk: &CloudDisk{
+				disk: &computev1.Disk{
+					Labels: map[string]string{"foo": "v1", "goog-gke-multi-zone": "true"},
+				},
+			},
+			wantLabels: map[string]string{"foo": "v1", "goog-gke-multi-zone": "true"},
+		},
+		{
+			name: "beta disk labels",
+			cloudDisk: &CloudDisk{
+				betaDisk: &computebeta.Disk{
+					Labels: map[string]string{"bar": "beta", "goog-gke-multi-zone": "true"},
+				},
+			},
+			wantLabels: map[string]string{"bar": "beta", "goog-gke-multi-zone": "true"},
+		},
+		{
+			name: "alpha disk labels",
+			cloudDisk: &CloudDisk{
+				alphaDisk: &computealpha.Disk{
+					Labels: map[string]string{"baz": "alpha", "goog-gke-multi-zone": "true"},
+				},
+			},
+			wantLabels: map[string]string{"baz": "alpha", "goog-gke-multi-zone": "true"},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Logf("Running test: %v", tc.name)
+		gotLabels := tc.cloudDisk.GetLabels()
+		if diff := cmp.Diff(tc.wantLabels, gotLabels); diff != "" {
+			t.Errorf("GetLabels() returned unexpected difference (-want +got):\n%s", diff)
+		}
+	}
+}
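GetLabels follows the same version-fallback pattern as the other CloudDisk accessors (v1, then beta, then alpha) and returns nil when no underlying disk is populated. A caller-side sketch of the label check this enables; the helper name here is made up, and the real check lives inline in validateMultiZoneDisk in the controller.go hunks below:

```go
package gcecloudprovider

// hasMultiZoneLabel is an illustrative helper, not part of the patch.
// GetLabels may return nil, and indexing a nil map is safe in Go, so the
// lookup naturally reads as "no labels present".
func hasMultiZoneLabel(d *CloudDisk) bool {
	_, ok := d.GetLabels()["goog-gke-multi-zone"]
	return ok
}
```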
diff --git a/pkg/gce-pd-csi-driver/controller.go b/pkg/gce-pd-csi-driver/controller.go
index fb4745015..486b400d7 100644
--- a/pkg/gce-pd-csi-driver/controller.go
+++ b/pkg/gce-pd-csi-driver/controller.go
@@ -47,8 +47,8 @@ type GCEControllerServer struct {
 	CloudProvider gce.GCECompute
 	Metrics       metrics.MetricsManager
 
-	disks []*compute.Disk
-	seen  map[string]int
+	volumeEntries     []*csi.ListVolumesResponse_Entry
+	volumeEntriesSeen map[string]int
 
 	snapshots      []*csi.ListSnapshotsResponse_Entry
 	snapshotTokens map[string]int
@@ -103,6 +103,25 @@ type GCEControllerServer struct {
 
 	// If set to true, the CSI Driver will allow volumes to be provisioned in Storage Pools.
 	enableStoragePools bool
+
+	multiZoneVolumeHandleConfig MultiZoneVolumeHandleConfig
+}
+
+type MultiZoneVolumeHandleConfig struct {
+	// A set of supported disk types that are compatible with multi-zone volumeHandles.
+	// The disk type is only validated on ControllerPublish. Other operations that
+	// interact with the volumeHandle (ListVolumes/ControllerUnpublish) don't validate
+	// the disk type. This ensures existing published multi-zone volumes are listed
+	// and unpublished correctly, and allows this list to be ratcheted to be more
+	// restrictive without affecting volumes that are already published.
+	DiskTypes []string
+
+	// If set to true, the CSI driver will enable the multi-zone volumeHandle feature.
+	// If set to false, volumeHandles that contain 'multi-zone' will not be translated
+	// to their respective attachment zone (based on the node), which will result in
+	// an "Unknown zone" error on ControllerPublish/ControllerUnpublish.
+	Enable bool
 }
 
 type csiErrorBackoffId string
@@ -588,6 +607,25 @@ func parseMachineType(machineTypeUrl string) string {
 	return machineType
 }
 
+func convertMultiZoneVolKeyToZoned(volumeKey *meta.Key, instanceZone string) *meta.Key {
+	volumeKey.Zone = instanceZone
+	return volumeKey
+}
+
+func (gceCS *GCEControllerServer) validateMultiZoneDisk(volumeID string, disk *gce.CloudDisk) error {
+	if !slices.Contains(gceCS.multiZoneVolumeHandleConfig.DiskTypes, disk.GetPDType()) {
+		return status.Errorf(codes.InvalidArgument, "Multi-zone volumeID %q points to disk with unsupported disk type %q: %v", volumeID, disk.GetPDType(), disk.GetSelfLink())
+	}
+	if _, ok := disk.GetLabels()[common.MultiZoneLabel]; !ok {
+		return status.Errorf(codes.InvalidArgument, "Multi-zone volumeID %q points to disk that is missing label %q: %v", volumeID, common.MultiZoneLabel, disk.GetSelfLink())
+	}
+	return nil
+}
+
+func isMultiZoneVolKey(volumeKey *meta.Key) bool {
+	return volumeKey.Type() == meta.Zonal && volumeKey.Zone == common.MultiZoneValue
+}
+
 func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Context, req *csi.ControllerPublishVolumeRequest) (*csi.ControllerPublishVolumeResponse, error, *gce.CloudDisk) {
 	project, volKey, pdcsiContext, err := gceCS.validateControllerPublishVolumeRequest(ctx, req)
 	if err != nil {
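The 'multi-zone' sentinel rides in the zone position of an otherwise ordinary zonal key, which is exactly what isMultiZoneVolKey and convertMultiZoneVolKeyToZoned above exploit. A standalone sketch of that round trip, assuming the k8s-cloud-provider meta package the driver already uses:

```go
package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta"
)

const multiZoneValue = "multi-zone" // mirrors common.MultiZoneValue

func main() {
	// A 'multi-zone' volumeHandle parses into an ordinary zonal key whose
	// zone is the sentinel value; that is all isMultiZoneVolKey checks.
	volKey := meta.ZonalKey("my-disk", multiZoneValue)
	fmt.Println(volKey.Type() == meta.Zonal && volKey.Zone == multiZoneValue) // true

	// convertMultiZoneVolKeyToZoned then pins the key to the attaching node's zone.
	volKey.Zone = "us-central1-b"
	fmt.Println(volKey.Zone) // us-central1-b
}
```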
@@ -603,6 +641,21 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
 		PublishContext: nil,
 	}
 
+	instanceZone, instanceName, err := common.NodeIDToZoneAndName(nodeID)
+	if err != nil {
+		return nil, status.Errorf(codes.NotFound, "could not split nodeID: %v", err.Error()), nil
+	}
+
+	volumeIsMultiZone := isMultiZoneVolKey(volKey)
+	if gceCS.multiZoneVolumeHandleConfig.Enable && volumeIsMultiZone {
+		// Only allow read-only attachment for "multi-zone" volumes
+		if !readOnly {
+			return nil, status.Errorf(codes.InvalidArgument, "'multi-zone' volume only supports 'readOnly': %v", volumeID), nil
+		}
+
+		volKey = convertMultiZoneVolKeyToZoned(volKey, instanceZone)
+	}
+
 	project, volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, project, volKey)
 	if err != nil {
 		if gce.IsGCENotFoundError(err) {
@@ -625,10 +678,6 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
 		}
 		return nil, common.LoggedError("Failed to getDisk: ", err), disk
 	}
-	instanceZone, instanceName, err := common.NodeIDToZoneAndName(nodeID)
-	if err != nil {
-		return nil, status.Errorf(codes.NotFound, "could not split nodeID: %v", err.Error()), disk
-	}
 	instance, err := gceCS.CloudProvider.GetInstanceOrError(ctx, instanceZone, instanceName)
 	if err != nil {
 		if gce.IsGCENotFoundError(err) {
@@ -637,6 +686,12 @@ func (gceCS *GCEControllerServer) executeControllerPublishVolume(ctx context.Con
 		return nil, common.LoggedError("Failed to get instance: ", err), disk
 	}
 
+	if gceCS.multiZoneVolumeHandleConfig.Enable && volumeIsMultiZone {
+		if err := gceCS.validateMultiZoneDisk(volumeID, disk); err != nil {
+			return nil, err, disk
+		}
+	}
+
 	readWrite := "READ_WRITE"
 	if readOnly {
 		readWrite = "READ_ONLY"
@@ -739,6 +794,16 @@ func (gceCS *GCEControllerServer) executeControllerUnpublishVolume(ctx context.C
 	volumeID := req.GetVolumeId()
 	nodeID := req.GetNodeId()
 
+	instanceZone, instanceName, err := common.NodeIDToZoneAndName(nodeID)
+	if err != nil {
+		return nil, status.Errorf(codes.InvalidArgument, "could not split nodeID: %v", err.Error()), nil
+	}
+
+	if gceCS.multiZoneVolumeHandleConfig.Enable && isMultiZoneVolKey(volKey) {
+		volKey = convertMultiZoneVolKeyToZoned(volKey, instanceZone)
+	}
+
 	project, volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, project, volKey)
 	if err != nil {
 		if gce.IsGCENotFoundError(err) {
@@ -756,10 +821,6 @@ func (gceCS *GCEControllerServer) executeControllerUnpublishVolume(ctx context.C
 	}
 	defer gceCS.volumeLocks.Release(lockingVolumeID)
 
 	diskToUnpublish, _ := gceCS.CloudProvider.GetDisk(ctx, project, volKey, gce.GCEAPIVersionV1)
-	instanceZone, instanceName, err := common.NodeIDToZoneAndName(nodeID)
-	if err != nil {
-		return nil, status.Errorf(codes.InvalidArgument, "could not split nodeID: %v", err.Error()), diskToUnpublish
-	}
 	instance, err := gceCS.CloudProvider.GetInstanceOrError(ctx, instanceZone, instanceName)
 	if err != nil {
 		if gce.IsGCENotFoundError(err) {
@@ -810,6 +871,7 @@ func (gceCS *GCEControllerServer) ValidateVolumeCapabilities(ctx context.Context
 	if err != nil {
 		return nil, status.Errorf(codes.InvalidArgument, "Volume ID is invalid: %v", err.Error())
 	}
+
 	project, volKey, err = gceCS.CloudProvider.RepairUnderspecifiedVolumeKey(ctx, project, volKey)
 	if err != nil {
 		if gce.IsGCENotFoundError(err) {
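Condensed, the publish-time decision implemented above is: only rewrite the sentinel zone when the feature is enabled, and only accept read-only attachments. A minimal sketch of that gate, with illustrative names that are not the driver's:

```go
package main

import (
	"errors"
	"fmt"
)

// resolvePublishZone mirrors the gate in executeControllerPublishVolume:
// a 'multi-zone' handle is only attachable read-only, and it resolves to
// the zone of the instance being attached to.
func resolvePublishZone(featureEnabled, volumeIsMultiZone, readOnly bool, instanceZone, keyZone string) (string, error) {
	if featureEnabled && volumeIsMultiZone {
		if !readOnly {
			return "", errors.New("'multi-zone' volume only supports 'readOnly'")
		}
		return instanceZone, nil // convertMultiZoneVolKeyToZoned
	}
	return keyZone, nil
}

func main() {
	zone, err := resolvePublishZone(true, true, true, "us-central1-b", "multi-zone")
	fmt.Println(zone, err) // us-central1-b <nil>

	_, err = resolvePublishZone(true, true, false, "us-central1-b", "multi-zone")
	fmt.Println(err) // 'multi-zone' volume only supports 'readOnly'
}
```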
@@ -879,8 +941,9 @@ func (gceCS *GCEControllerServer) ListVolumes(ctx context.Context, req *csi.List
 			"ListVolumes got max entries request %v. GCE only supports values >0", req.MaxEntries)
 	}
 
-	offset := 0
+	offsetLow := 0
 	var ok bool
+	var volumeEntries []*csi.ListVolumesResponse_Entry
 	if req.StartingToken == "" {
 		diskList, _, err := gceCS.CloudProvider.ListDisks(ctx)
 		if err != nil {
@@ -889,10 +952,14 @@ func (gceCS *GCEControllerServer) ListVolumes(ctx context.Context, req *csi.List
 			}
 			return nil, common.LoggedError("Failed to list disk: ", err)
 		}
-		gceCS.disks = diskList
-		gceCS.seen = map[string]int{}
+		volumeEntries = gceCS.disksToVolumeEntries(diskList)
+	}
+
+	if req.StartingToken == "" {
+		gceCS.volumeEntries = volumeEntries
+		gceCS.volumeEntriesSeen = map[string]int{}
 	} else {
-		offset, ok = gceCS.seen[req.StartingToken]
+		offsetLow, ok = gceCS.volumeEntriesSeen[req.StartingToken]
 		if !ok {
 			return nil, status.Errorf(codes.Aborted, "ListVolumes error with invalid startingToken: %s", req.StartingToken)
 		}
@@ -903,9 +970,50 @@ func (gceCS *GCEControllerServer) ListVolumes(ctx context.Context, req *csi.List
 		maxEntries = maxListVolumesResponseEntries
 	}
 
+	nextToken := ""
+	offsetHigh := offsetLow + maxEntries
+	if offsetHigh < len(gceCS.volumeEntries) {
+		nextToken = string(uuid.NewUUID())
+		gceCS.volumeEntriesSeen[nextToken] = offsetHigh
+	} else {
+		offsetHigh = len(gceCS.volumeEntries)
+	}
+
+	return &csi.ListVolumesResponse{
+		Entries:   gceCS.volumeEntries[offsetLow:offsetHigh],
+		NextToken: nextToken,
+	}, nil
+}
+
+// isMultiZoneDisk returns the 'multi-zone' volumeId of a disk if it carries
+// the multi-zone label, otherwise an empty string. The second return value
+// indicates whether the disk is a 'multi-zone' disk.
+func isMultiZoneDisk(diskRsrc string, diskLabels map[string]string) (string, bool) {
+	isMultiZoneDisk := false
+	for l := range diskLabels {
+		if l == common.MultiZoneLabel {
+			isMultiZoneDisk = true
+		}
+	}
+	if !isMultiZoneDisk {
+		return "", false
+	}
+
+	multiZoneVolumeId, err := common.VolumeIdAsMultiZone(diskRsrc)
+	if err != nil {
+		klog.Warningf("Error converting multi-zone volume handle for disk %s, skipped: %v", diskRsrc, err)
+		return "", false
+	}
+	return multiZoneVolumeId, true
+}
+
+// disksToVolumeEntries converts a list of disks to a list of CSI ListVolumesResponse
+// entries. It appends 'multi-zone' volumeHandles at the end; these are volumeHandles
+// that map to multiple disks in different zones.
+func (gceCS *GCEControllerServer) disksToVolumeEntries(disks []*compute.Disk) []*csi.ListVolumesResponse_Entry {
+	multiZoneNodesByVolumeId := map[string][]string{}
 	entries := []*csi.ListVolumesResponse_Entry{}
-	for i := 0; i+offset < len(gceCS.disks) && i < maxEntries; i++ {
-		d := gceCS.disks[i+offset]
+	for _, d := range disks {
 		diskRsrc, err := getResourceId(d.SelfLink)
 		if err != nil {
 			klog.Warningf("Bad ListVolumes disk resource %s, skipped: %v (%+v)", d.SelfLink, err, d)
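The reworked pagination caches the fully built entry list (zonal plus multi-zone entries) once per fresh ListVolumes call and hands out tokens that map to offsets into that cache. A reduced sketch of the same token scheme over a plain string slice; the driver uses a UUID token where a counter-based token stands in here:

```go
package main

import "fmt"

// pager mimics the ListVolumes pagination above.
type pager struct {
	entries []string
	seen    map[string]int
}

func (p *pager) page(startingToken string, maxEntries int) ([]string, string, error) {
	offsetLow := 0
	if startingToken != "" {
		off, ok := p.seen[startingToken]
		if !ok {
			return nil, "", fmt.Errorf("invalid startingToken: %s", startingToken)
		}
		offsetLow = off
	}
	nextToken := ""
	offsetHigh := offsetLow + maxEntries
	if offsetHigh < len(p.entries) {
		nextToken = fmt.Sprintf("token-%d", offsetHigh) // driver: string(uuid.NewUUID())
		p.seen[nextToken] = offsetHigh
	} else {
		offsetHigh = len(p.entries)
	}
	return p.entries[offsetLow:offsetHigh], nextToken, nil
}

func main() {
	p := &pager{entries: []string{"a", "b", "c", "d", "e"}, seen: map[string]int{}}
	page1, tok, _ := p.page("", 2)
	fmt.Println(page1, tok) // [a b] token-2
	page2, tok, _ := p.page(tok, 2)
	fmt.Println(page2, tok) // [c d] token-4
	page3, tok, _ := p.page(tok, 2)
	fmt.Println(page3, tok) // [e], empty token: no further pages
}
```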
@@ -920,6 +1028,16 @@ func (gceCS *GCEControllerServer) ListVolumes(ctx context.Context, req *csi.List
 				users = append(users, rsrc)
 			}
 		}
+
+		if gceCS.multiZoneVolumeHandleConfig.Enable {
+			if multiZoneVolumeId, isMultiZone := isMultiZoneDisk(diskRsrc, d.Labels); isMultiZone {
+				_, ok := multiZoneNodesByVolumeId[multiZoneVolumeId]
+				if !ok {
+					multiZoneNodesByVolumeId[multiZoneVolumeId] = []string{}
+				}
+				multiZoneNodesByVolumeId[multiZoneVolumeId] = append(multiZoneNodesByVolumeId[multiZoneVolumeId], users...)
+			}
+		}
 		entries = append(entries, &csi.ListVolumesResponse_Entry{
 			Volume: &csi.Volume{
 				VolumeId: diskRsrc,
@@ -929,17 +1047,17 @@ func (gceCS *GCEControllerServer) ListVolumes(ctx context.Context, req *csi.List
 			},
 		})
 	}
-
-	nextToken := ""
-	if len(entries)+offset < len(gceCS.disks) {
-		nextToken = string(uuid.NewUUID())
-		gceCS.seen[nextToken] = len(entries) + offset
+	for volumeId, nodeIds := range multiZoneNodesByVolumeId {
+		entries = append(entries, &csi.ListVolumesResponse_Entry{
+			Volume: &csi.Volume{
+				VolumeId: volumeId,
+			},
+			Status: &csi.ListVolumesResponse_VolumeStatus{
+				PublishedNodeIds: nodeIds,
+			},
+		})
 	}
-
-	return &csi.ListVolumesResponse{
-		Entries:   entries,
-		NextToken: nextToken,
-	}, nil
+	return entries
 }
 
 func (gceCS *GCEControllerServer) GetCapacity(ctx context.Context, req *csi.GetCapacityRequest) (*csi.GetCapacityResponse, error) {
diff --git a/pkg/gce-pd-csi-driver/controller_test.go b/pkg/gce-pd-csi-driver/controller_test.go
index c9096990c..598d66c80 100644
--- a/pkg/gce-pd-csi-driver/controller_test.go
+++ b/pkg/gce-pd-csi-driver/controller_test.go
@@ -67,6 +67,7 @@ var (
 	testVolumeID           = fmt.Sprintf("projects/%s/zones/%s/disks/%s", project, zone, name)
 	underspecifiedVolumeID = fmt.Sprintf("projects/UNSPECIFIED/zones/UNSPECIFIED/disks/%s", name)
+	multiZoneVolumeID      = fmt.Sprintf("projects/%s/zones/multi-zone/disks/%s", project, name)
 
 	region, _      = common.GetRegionFromZones([]string{zone})
 	testRegionalID = fmt.Sprintf("projects/%s/regions/%s/disks/%s", project, region, name)
@@ -3781,10 +3782,10 @@ func backoffDriver(t *testing.T, config *backoffDriverConfig) *GCEDriver {
 	driver := GetGCEDriver()
 	driver.cs = &GCEControllerServer{
-		Driver:       driver,
-		seen:         map[string]int{},
-		volumeLocks:  common.NewVolumeLocks(),
-		errorBackoff: newFakeCSIErrorBackoff(config.clock),
+		Driver:            driver,
+		volumeEntriesSeen: map[string]int{},
+		volumeLocks:       common.NewVolumeLocks(),
+		errorBackoff:      newFakeCSIErrorBackoff(config.clock),
 	}
 
 	driver.cs.CloudProvider = fcp
diff --git a/pkg/gce-pd-csi-driver/gce-pd-driver.go b/pkg/gce-pd-csi-driver/gce-pd-driver.go
index 07db696d9..cb9c790f0 100644
--- a/pkg/gce-pd-csi-driver/gce-pd-driver.go
+++ b/pkg/gce-pd-csi-driver/gce-pd-driver.go
@@ -152,15 +152,16 @@ func NewNodeServer(gceDriver *GCEDriver, mounter *mount.SafeFormatAndMount, devi
 	}
 }
 
-func NewControllerServer(gceDriver *GCEDriver, cloudProvider gce.GCECompute, errorBackoffInitialDuration, errorBackoffMaxDuration time.Duration, fallbackRequisiteZones []string, enableStoragePools bool) *GCEControllerServer {
+func NewControllerServer(gceDriver *GCEDriver, cloudProvider gce.GCECompute, errorBackoffInitialDuration, errorBackoffMaxDuration time.Duration, fallbackRequisiteZones []string, enableStoragePools bool, multiZoneVolumeHandleConfig MultiZoneVolumeHandleConfig) *GCEControllerServer {
 	return &GCEControllerServer{
-		Driver:                 gceDriver,
-		CloudProvider:          cloudProvider,
-		seen:                   map[string]int{},
-		volumeLocks:            common.NewVolumeLocks(),
-		errorBackoff:           newCsiErrorBackoff(errorBackoffInitialDuration, errorBackoffMaxDuration),
-		fallbackRequisiteZones: fallbackRequisiteZones,
-		enableStoragePools:     enableStoragePools,
+		Driver:                      gceDriver,
+		CloudProvider:               cloudProvider,
+		volumeEntriesSeen:           map[string]int{},
+		volumeLocks:                 common.NewVolumeLocks(),
+		errorBackoff:                newCsiErrorBackoff(errorBackoffInitialDuration, errorBackoffMaxDuration),
+		fallbackRequisiteZones:      fallbackRequisiteZones,
+		enableStoragePools:          enableStoragePools,
+		multiZoneVolumeHandleConfig: multiZoneVolumeHandleConfig,
 	}
 }
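disksToVolumeEntries produces one synthetic entry per multi-zone volumeHandle, whose PublishedNodeIds is the union of the attachments of every identically named, labeled disk across zones. A toy version of that aggregation, with the zone-rewrite inlined (simplified, no validation):

```go
package main

import (
	"fmt"
	"strings"
)

// asMultiZone replaces the zone segment with the sentinel, like
// common.VolumeIdAsMultiZone but without the validation.
func asMultiZone(volumeId string) string {
	parts := strings.Split(volumeId, "/")
	parts[3] = "multi-zone" // projects/{p}/zones/{zone}/disks/{name}
	return strings.Join(parts, "/")
}

func main() {
	// diskRsrc -> nodes the disk is attached to
	attachments := map[string][]string{
		"projects/p/zones/us-central1-a/disks/data": {"node-a"},
		"projects/p/zones/us-central1-b/disks/data": {"node-b"},
	}
	multiZone := map[string][]string{}
	for rsrc, users := range attachments {
		id := asMultiZone(rsrc)
		multiZone[id] = append(multiZone[id], users...)
	}
	// One aggregated entry: projects/p/zones/multi-zone/disks/data -> [node-a node-b]
	fmt.Println(multiZone)
}
```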
diff --git a/pkg/gce-pd-csi-driver/gce-pd-driver_test.go b/pkg/gce-pd-csi-driver/gce-pd-driver_test.go
index 881e5ae8e..4c340ab17 100644
--- a/pkg/gce-pd-csi-driver/gce-pd-driver_test.go
+++ b/pkg/gce-pd-csi-driver/gce-pd-driver_test.go
@@ -48,8 +48,9 @@ func initGCEDriverWithCloudProvider(t *testing.T, cloudProvider gce.GCECompute)
 	errorBackoffMaxDuration := 5 * time.Minute
 	fallbackRequisiteZones := []string{}
 	enableStoragePools := false
+	multiZoneVolumeHandleConfig := MultiZoneVolumeHandleConfig{}
 
-	controllerServer := NewControllerServer(gceDriver, cloudProvider, errorBackoffInitialDuration, errorBackoffMaxDuration, fallbackRequisiteZones, enableStoragePools)
+	controllerServer := NewControllerServer(gceDriver, cloudProvider, errorBackoffInitialDuration, errorBackoffMaxDuration, fallbackRequisiteZones, enableStoragePools, multiZoneVolumeHandleConfig)
 	err := gceDriver.SetupGCEDriver(driver, vendorVersion, nil, nil, controllerServer, nil)
 	if err != nil {
 		t.Fatalf("Failed to setup GCE Driver: %v", err)
diff --git a/test/e2e/tests/multi_zone_e2e_test.go b/test/e2e/tests/multi_zone_e2e_test.go
index bb54356c4..7e49ea21d 100644
--- a/test/e2e/tests/multi_zone_e2e_test.go
+++ b/test/e2e/tests/multi_zone_e2e_test.go
@@ -22,6 +22,7 @@ import (
 	csi "github.com/container-storage-interface/spec/lib/go/csi"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	compute "google.golang.org/api/compute/v1"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/klog/v2"
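The e2e test that follows tags both zonal disks with the multi-zone label through the GCE API. Label updates there are fingerprint-guarded, so the flow is read, write with the current fingerprint, then wait on the zonal operation. A condensed sketch of that read-modify-write pattern, assuming an authenticated *compute.Service:

```go
package diskutil

import (
	compute "google.golang.org/api/compute/v1"
)

// labelDisk tags a zonal disk, mirroring the test flow below. Sketch only:
// construction of svc is elided.
func labelDisk(svc *compute.Service, project, zone, name string, labels map[string]string) error {
	disk, err := svc.Disks.Get(project, zone, name).Do()
	if err != nil {
		return err
	}
	op, err := svc.Disks.SetLabels(project, zone, name, &compute.ZoneSetLabelsRequest{
		LabelFingerprint: disk.LabelFingerprint, // stale fingerprints are rejected by the API
		Labels:           labels,
	}).Do()
	if err != nil {
		return err
	}
	_, err = svc.ZoneOperations.Wait(project, zone, op.Name).Do()
	return err
}
```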
@@ -65,6 +66,99 @@ var _ = Describe("GCE PD CSI Driver Multi-Zone", func() {
 
 	})
 
+	It("Should attach ROX 'multi-zone' PV instances to two separate VMs", func() {
+		// Create new driver and client
+
+		Expect(testContexts).NotTo(BeEmpty())
+
+		zoneToContext := map[string]*remote.TestContext{}
+		zones := []string{}
+
+		for _, tc := range testContexts {
+			_, z, _ := tc.Instance.GetIdentity()
+			// Zone hasn't been seen before
+			if _, ok := zoneToContext[z]; !ok {
+				zoneToContext[z] = tc
+				zones = append(zones, z)
+			}
+			if len(zoneToContext) == 2 {
+				break
+			}
+		}
+
+		Expect(len(zoneToContext)).To(Equal(2), "Must have instances in 2 zones")
+
+		controllerContext := zoneToContext[zones[0]]
+		controllerClient := controllerContext.Client
+		controllerInstance := controllerContext.Instance
+
+		p, _, _ := controllerInstance.GetIdentity()
+
+		// Create Disk
+		volName := testNamePrefix + string(uuid.NewUUID())
+		_, volID0 := createAndValidateZonalDisk(controllerClient, p, zones[0], standardDiskType, volName)
+		_, volID1 := createAndValidateZonalDisk(controllerClient, p, zones[1], standardDiskType, volName)
+
+		labelsMap := map[string]string{
+			common.MultiZoneLabel: "true",
+		}
+		disk1, err := computeService.Disks.Get(p, zones[0], volName).Do()
+		Expect(err).To(BeNil(), "Could not get disk")
+		disk1Op, err := computeService.Disks.SetLabels(p, zones[0], volName, &compute.ZoneSetLabelsRequest{
+			LabelFingerprint: disk1.LabelFingerprint,
+			Labels:           labelsMap,
+		}).Do()
+		Expect(err).To(BeNil(), "Could not set disk labels")
+		_, err = computeService.ZoneOperations.Wait(p, zones[0], disk1Op.Name).Do()
+		Expect(err).To(BeNil(), "Could not wait for label set operation")
+
+		disk2, err := computeService.Disks.Get(p, zones[1], volName).Do()
+		Expect(err).To(BeNil(), "Could not get disk")
+		disk2Op, err := computeService.Disks.SetLabels(p, zones[1], volName, &compute.ZoneSetLabelsRequest{
+			LabelFingerprint: disk2.LabelFingerprint,
+			Labels:           labelsMap,
+		}).Do()
+		Expect(err).To(BeNil(), "Could not set disk labels")
+		_, err = computeService.ZoneOperations.Wait(p, zones[1], disk2Op.Name).Do()
+		Expect(err).To(BeNil(), "Could not wait for label set operation")
+
+		defer deleteDisk(controllerClient, p, zones[0], volID0, volName)
+		defer deleteDisk(controllerClient, p, zones[1], volID1, volName)
+
+		// Attach Disk
+		volID := fmt.Sprintf("projects/%s/zones/multi-zone/disks/%s", p, volName)
+
+		// Attach the 'multi-zone' volume to the instances in both zones.
+		tc0 := zoneToContext[zones[0]]
+		tc1 := zoneToContext[zones[1]]
+
+		nodeID0 := tc0.Instance.GetNodeID()
+		nodeID1 := tc1.Instance.GetNodeID()
+
+		err = controllerClient.ControllerPublishVolumeReadOnly(volID, nodeID0)
+		Expect(err).To(BeNil(), "Failed to attach vol to node 0")
+
+		err = controllerClient.ControllerPublishVolumeReadOnly(volID, nodeID1)
+		Expect(err).To(BeNil(), "Failed to attach vol to node 1")
+
+		// List Volumes
+		volsToNodes, err := controllerClient.ListVolumes()
+		Expect(err).To(BeNil(), "Failed ListVolumes")
+
+		// Verify List Volumes
+		Expect(volsToNodes[volID0]).To(ContainElements(nodeID0), "Couldn't find node in attached nodes for vol")
+		Expect(volsToNodes[volID1]).To(ContainElements(nodeID1), "Couldn't find node in attached nodes for vol")
+		Expect(volsToNodes[volID]).To(ContainElements(nodeID0, nodeID1), "Couldn't find nodes in attached nodes for vol")
+
+		// Detach disk
+		err = controllerClient.ControllerUnpublishVolume(volID, nodeID0)
+		Expect(err).To(BeNil(), "Failed to detach vol from node 0")
+
+		err = controllerClient.ControllerUnpublishVolume(volID, nodeID1)
+		Expect(err).To(BeNil(), "Failed to detach vol from node 1")
+	})
+
 	It("Should successfully run through entire lifecycle of an RePD volume on instances in 2 zones", func() {
 		// Create new driver and client
 
@@ -256,6 +350,16 @@ var _ = Describe("GCE PD CSI Driver Multi-Zone", func() {
 	})
 })
 
+func deleteDisk(controllerClient *remote.CsiClient, p, zone, volID, volName string) {
+	// Delete Disk
+	err := controllerClient.DeleteVolume(volID)
+	Expect(err).To(BeNil(), "DeleteVolume failed")
+
+	// Validate Disk Deleted
+	_, err = computeService.Disks.Get(p, zone, volName).Do()
+	Expect(gce.IsGCEError(err, "notFound")).To(BeTrue(), "Expected disk to not be found")
+}
+
 func testAttachWriteReadDetach(volID string, volName string, instance *remote.InstanceInfo, client *remote.CsiClient, readOnly bool) error {
 	var testFileContents = "test"
 	writeFile := func(a *verifyArgs) error {
@@ -287,7 +391,7 @@ func testAttachWriteReadDetach(volID string, volName string, instance *remote.In
 func testAttachAndMount(volID string, volName string, instance *remote.InstanceInfo, client *remote.CsiClient, useBlock, forceAttach bool) (error, func(), *verifyArgs) {
 	// Attach Disk
-	err := client.ControllerPublishVolume(volID, instance.GetNodeID(), forceAttach)
+	err := client.ControllerPublishVolumeReadWrite(volID, instance.GetNodeID(), forceAttach)
 	if err != nil {
 		return fmt.Errorf("ControllerPublishVolume failed with error for disk %v on node %v: %v", volID, instance.GetNodeID(), err.Error()), nil, nil
 	}
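For reference, the read-only attach the test performs boils down to a ControllerPublishVolume request with Readonly set, issued against the 'multi-zone' handle; the wrapper that builds it lands in test/remote/client-wrappers.go further down. A trimmed sketch of the request construction, where the access mode shown is an assumption for illustration (the real wrapper reuses the client's standard volume capability):

```go
package sketch

import (
	csipb "github.com/container-storage-interface/spec/lib/go/csi"
)

// buildReadOnlyPublishRequest sketches the request sent for a 'multi-zone' attach.
func buildReadOnlyPublishRequest(volID, nodeID string) *csipb.ControllerPublishVolumeRequest {
	return &csipb.ControllerPublishVolumeRequest{
		VolumeId: volID, // e.g. projects/{project}/zones/multi-zone/disks/{name}
		NodeId:   nodeID,
		VolumeCapability: &csipb.VolumeCapability{
			AccessType: &csipb.VolumeCapability_Mount{Mount: &csipb.VolumeCapability_MountVolume{}},
			AccessMode: &csipb.VolumeCapability_AccessMode{Mode: csipb.VolumeCapability_AccessMode_MULTI_NODE_READER_ONLY},
		},
		Readonly: true, // the controller's multi-zone gate requires this
	}
}
```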
diff --git a/test/e2e/tests/resize_e2e_test.go b/test/e2e/tests/resize_e2e_test.go
index 2b04047d4..7c130f5c1 100644
--- a/test/e2e/tests/resize_e2e_test.go
+++ b/test/e2e/tests/resize_e2e_test.go
@@ -67,7 +67,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 		}()
 
 		// Attach Disk
-		err = client.ControllerPublishVolume(volume.VolumeId, instance.GetNodeID(), false /* forceAttach */)
+		err = client.ControllerPublishVolumeReadWrite(volume.VolumeId, instance.GetNodeID(), false /* forceAttach */)
 		Expect(err).To(BeNil(), "Controller publish volume failed")
 
 		defer func() {
@@ -189,7 +189,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 		Expect(cloudDisk.SizeGb).To(Equal(newSizeGb))
 
 		// Attach and mount again
-		err = client.ControllerPublishVolume(volume.VolumeId, instance.GetNodeID(), false /* forceAttach */)
+		err = client.ControllerPublishVolumeReadWrite(volume.VolumeId, instance.GetNodeID(), false /* forceAttach */)
 		Expect(err).To(BeNil(), "Controller publish volume failed")
 
 		defer func() {
@@ -281,7 +281,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 		}()
 
 		// Attach Disk
-		err = client.ControllerPublishVolume(volume.VolumeId, instance.GetNodeID(), false /* forceAttach */)
+		err = client.ControllerPublishVolumeReadWrite(volume.VolumeId, instance.GetNodeID(), false /* forceAttach */)
 		Expect(err).To(BeNil(), "Controller publish volume failed")
 
 		defer func() {
diff --git a/test/e2e/tests/single_zone_e2e_test.go b/test/e2e/tests/single_zone_e2e_test.go
index 58911cd34..71587270b 100644
--- a/test/e2e/tests/single_zone_e2e_test.go
+++ b/test/e2e/tests/single_zone_e2e_test.go
@@ -118,7 +118,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 		}()
 
 		// Attach Disk
-		err := client.ControllerPublishVolume(volID, instance.GetNodeID(), false /* forceAttach */)
+		err := client.ControllerPublishVolumeReadWrite(volID, instance.GetNodeID(), false /* forceAttach */)
 		Expect(err).To(BeNil(), "ControllerPublishVolume failed with error for disk %v on node %v: %v", volID, instance.GetNodeID())
 
 		defer func() {
@@ -190,7 +190,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 		}()
 
 		// Attach Disk
-		err := client.ControllerPublishVolume(volID, instance.GetNodeID(), false /* forceAttach */)
+		err := client.ControllerPublishVolumeReadWrite(volID, instance.GetNodeID(), false /* forceAttach */)
 		Expect(err).To(BeNil(), "ControllerPublishVolume failed with error for disk %v on node %v: %v", volID, instance.GetNodeID())
 
 		defer func() {
@@ -330,7 +330,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 			}()
 
 			// Attach Disk
-			err := client.ControllerPublishVolume(underSpecifiedID, instance.GetNodeID(), false /* forceAttach */)
+			err := client.ControllerPublishVolumeReadWrite(underSpecifiedID, instance.GetNodeID(), false /* forceAttach */)
 			Expect(err).To(BeNil(), "ControllerPublishVolume failed")
 		},
 		Entry("on pd-standard", standardDiskType),
@@ -699,7 +699,7 @@ var _ = Describe("GCE PD CSI Driver", func() {
 		defer deleteVolumeOrError(client, secondVolID)
 
 		// Attach volID to current instance
-		err := client.ControllerPublishVolume(volID, nodeID, false /* forceAttach */)
+		err := client.ControllerPublishVolumeReadWrite(volID, nodeID, false /* forceAttach */)
 		Expect(err).To(BeNil(), "Failed ControllerPublishVolume")
 		defer client.ControllerUnpublishVolume(volID, nodeID)
 
@@ -1304,6 +1304,55 @@ var _ = Describe("GCE PD CSI Driver", func() {
 
 		Expect(err).To(BeNil(), "no error expected when passed valid compute url")
 	})
+
+	type multiZoneTestConfig struct {
+		diskType          string
+		readOnly          bool
+		hasMultiZoneLabel bool
+		wantErrSubstring  string
+	}
+
+	DescribeTable("Unsupported 'multi-zone' PV ControllerPublish attempts",
+		func(cfg multiZoneTestConfig) {
+			Expect(testContexts).ToNot(BeEmpty())
+			testContext := getRandomTestContext()
+
+			controllerInstance := testContext.Instance
+			controllerClient := testContext.Client
+
+			p, z, _ := controllerInstance.GetIdentity()
+
+			volName := testNamePrefix + string(uuid.NewUUID())
+			_, diskVolumeId := createAndValidateZonalDisk(controllerClient, p, z, cfg.diskType, volName)
+			defer deleteDisk(controllerClient, p, z, diskVolumeId, volName)
+
+			if cfg.hasMultiZoneLabel {
+				labelsMap := map[string]string{
+					common.MultiZoneLabel: "true",
+				}
+				disk, err := computeService.Disks.Get(p, z, volName).Do()
+				Expect(err).To(BeNil(), "Could not get disk")
+				diskOp, err := computeService.Disks.SetLabels(p, z, volName, &compute.ZoneSetLabelsRequest{
+					LabelFingerprint: disk.LabelFingerprint,
+					Labels:           labelsMap,
+				}).Do()
+				Expect(err).To(BeNil(), "Could not set disk labels")
+				_, err = computeService.ZoneOperations.Wait(p, z, diskOp.Name).Do()
+				Expect(err).To(BeNil(), "Could not wait for label set operation")
+			}
+
+			// Attach Disk
+			volID := fmt.Sprintf("projects/%s/zones/multi-zone/disks/%s", p, volName)
+			nodeID := testContext.Instance.GetNodeID()
+
+			err := controllerClient.ControllerPublishVolume(volID, nodeID, false /* forceAttach */, cfg.readOnly)
+			Expect(err).ToNot(BeNil(), "Unexpected success attaching disk")
+			Expect(err.Error()).To(ContainSubstring(cfg.wantErrSubstring), "Expected error message to contain substring")
+		},
+		Entry("with unsupported read-write mode", multiZoneTestConfig{diskType: standardDiskType, readOnly: false, hasMultiZoneLabel: true, wantErrSubstring: "'multi-zone' volume only supports 'readOnly'"}),
+		Entry("with missing multi-zone label", multiZoneTestConfig{diskType: standardDiskType, readOnly: true, hasMultiZoneLabel: false, wantErrSubstring: "points to disk that is missing label \"goog-gke-multi-zone\""}),
+		Entry("with unsupported disk-type pd-extreme", multiZoneTestConfig{diskType: extremeDiskType, readOnly: true, hasMultiZoneLabel: true, wantErrSubstring: "points to disk with unsupported disk type"}),
+	)
 })
 
 func equalWithinEpsilon(a, b, epsiolon int64) bool {
@@ -1314,9 +1363,13 @@ func equalWithinEpsilon(a, b, epsiolon int64) bool {
 }
 
 func createAndValidateUniqueZonalDisk(client *remote.CsiClient, project, zone string, diskType string) (string, string) {
+	volName := testNamePrefix + string(uuid.NewUUID())
+	return createAndValidateZonalDisk(client, project, zone, diskType, volName)
+}
+
+func createAndValidateZonalDisk(client *remote.CsiClient, project, zone string, diskType string, volName string) (string, string) {
 	// Create Disk
 	disk := typeToDisk[diskType]
-	volName := testNamePrefix + string(uuid.NewUUID())
 
 	diskSize := defaultSizeGb
 	switch diskType {
diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go
index 97364bed4..0f92e685b 100644
--- a/test/e2e/utils/utils.go
+++ b/test/e2e/utils/utils.go
@@ -63,7 +63,7 @@ func GCEClientAndDriverSetup(instance *remote.InstanceInfo, computeEndpoint stri
 	workspace := remote.NewWorkspaceDir("gce-pd-e2e-")
 	// Log at V(6) as the compute API calls are emitted at that level and it's
 	// useful to see what's happening when debugging tests.
-	driverRunCmd := fmt.Sprintf("sh -c '/usr/bin/nohup %s/gce-pd-csi-driver -v=6 --endpoint=%s %s 2> %s/prog.out < /dev/null > /dev/null &'",
+	driverRunCmd := fmt.Sprintf("sh -c '/usr/bin/nohup %s/gce-pd-csi-driver -v=6 --endpoint=%s --multi-zone-volume-handle-enable --multi-zone-volume-handle-disk-types=pd-standard %s 2> %s/prog.out < /dev/null > /dev/null &'",
 		workspace, endpoint, strings.Join(extra_flags, " "), workspace)
 
 	config := &remote.ClientConfig{
diff --git a/test/remote/client-wrappers.go b/test/remote/client-wrappers.go
index 7c86e657d..671cbb1df 100644
--- a/test/remote/client-wrappers.go
+++ b/test/remote/client-wrappers.go
@@ -134,12 +134,20 @@ func (c *CsiClient) DeleteVolume(volId string) error {
 	return err
 }
 
-func (c *CsiClient) ControllerPublishVolume(volId, nodeId string, forceAttach bool) error {
+func (c *CsiClient) ControllerPublishVolumeReadOnly(volId, nodeId string) error {
+	return c.ControllerPublishVolume(volId, nodeId, false /* forceAttach */, true /* readOnly */)
+}
+
+func (c *CsiClient) ControllerPublishVolumeReadWrite(volId, nodeId string, forceAttach bool) error {
+	return c.ControllerPublishVolume(volId, nodeId, forceAttach, false /* readOnly */)
+}
+
+func (c *CsiClient) ControllerPublishVolume(volId, nodeId string, forceAttach bool, readOnly bool) error {
 	cpreq := &csipb.ControllerPublishVolumeRequest{
 		VolumeId:         volId,
 		NodeId:           nodeId,
 		VolumeCapability: stdVolCap,
-		Readonly:         false,
+		Readonly:         readOnly,
 	}
 	if forceAttach {
 		cpreq.VolumeContext = map[string]string{
diff --git a/test/sanity/sanity_test.go b/test/sanity/sanity_test.go
index b0d883152..c093125dc 100644
--- a/test/sanity/sanity_test.go
+++ b/test/sanity/sanity_test.go
@@ -63,13 +63,14 @@ func TestSanity(t *testing.T) {
 	fallbackRequisiteZones := []string{}
 	enableStoragePools := false
+	multiZoneVolumeHandleConfig := driver.MultiZoneVolumeHandleConfig{}
 
 	mounter := mountmanager.NewFakeSafeMounter()
 	deviceUtils := deviceutils.NewFakeDeviceUtils(true)
 
 	//Initialize GCE Driver
 	identityServer := driver.NewIdentityServer(gceDriver)
-	controllerServer := driver.NewControllerServer(gceDriver, cloudProvider, 0, 5*time.Minute, fallbackRequisiteZones, enableStoragePools)
+	controllerServer := driver.NewControllerServer(gceDriver, cloudProvider, 0, 5*time.Minute, fallbackRequisiteZones, enableStoragePools, multiZoneVolumeHandleConfig)
 	nodeServer := driver.NewNodeServer(gceDriver, mounter, deviceUtils, metadataservice.NewFakeService(), mountmanager.NewFakeStatter(mounter))
 	err = gceDriver.SetupGCEDriver(driverName, vendorVersion, extraLabels, identityServer, controllerServer, nodeServer)
 	if err != nil {
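With the wrapper split above, call sites now state intent explicitly; the old single-method form defaulted to read-write, and all pre-existing callers were mechanically moved to the read-write variant. Usage side by side, a sketch whose import path is assumed from the repository's module name:

```go
package sketch

import (
	remote "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/test/remote"
)

// attachBothWays contrasts the two wrappers added by this patch.
func attachBothWays(client *remote.CsiClient, volID, nodeID string) error {
	// Read-write attach (the old ControllerPublishVolume default behavior):
	if err := client.ControllerPublishVolumeReadWrite(volID, nodeID, false /* forceAttach */); err != nil {
		return err
	}
	// Read-only attach, as 'multi-zone' volumeHandles require:
	return client.ControllerPublishVolumeReadOnly(volID, nodeID)
}
```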