Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

change GetDisk error reporting to temporary in CreateVolume codepath #1558

Merged
merged 1 commit into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions pkg/gce-cloud-provider/compute/gce-compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -560,7 +560,9 @@ func (cloud *CloudProvider) insertRegionalDisk(
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return err
// failed to GetDisk, however the Disk may already exist
// the error code should be non-Final
return status.Error(codes.Unavailable, err.Error())
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -572,16 +574,19 @@ func (cloud *CloudProvider) insertRegionalDisk(
klog.Warningf("GCE PD %s already exists, reusing", volKey.Name)
return nil
}
return status.Error(codes.Internal, fmt.Sprintf("unknown Insert disk error: %v", err.Error()))
// if the error code is considered "final", RegionDisks.Insert might not be retried
return fmt.Errorf("unknown Insert Regional disk error: %w", err)
leiyiz marked this conversation as resolved.
Show resolved Hide resolved
}
klog.V(5).Infof("InsertDisk operation %s for disk %s", opName, diskToCreate.Name)

err = cloud.waitForRegionalOp(ctx, project, opName, volKey.Region)
// failed to wait for Op to finish, however, the Op possibly is still running as expected
// the error code returned should be non-final
if err != nil {
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return err
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -593,7 +598,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
return nil
}
return fmt.Errorf("unknown Insert disk operation error: %w", err)
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
}
return nil
}
Expand Down Expand Up @@ -695,7 +700,9 @@ func (cloud *CloudProvider) insertZonalDisk(
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return err
leiyiz marked this conversation as resolved.
Show resolved Hide resolved
// failed to GetDisk, however the Disk may already exist
// the error code should be non-Final
return status.Error(codes.Unavailable, err.Error())
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -707,17 +714,20 @@ func (cloud *CloudProvider) insertZonalDisk(
klog.Warningf("GCE PD %s already exists, reusing", volKey.Name)
return nil
}
// if the error code is considered "final", Disks.Insert might not be retried
return fmt.Errorf("unknown Insert disk error: %w", err)
}
klog.V(5).Infof("InsertDisk operation %s for disk %s", opName, diskToCreate.Name)

err = cloud.waitForZonalOp(ctx, project, opName, volKey.Zone)

if err != nil {
// failed to wait for Op to finish, however, the Op possibly is still running as expected
// the error code returned should be non-final
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return err
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -729,7 +739,7 @@ func (cloud *CloudProvider) insertZonalDisk(
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
return nil
}
return fmt.Errorf("unknown Insert disk operation error: %w", err)
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
}
return nil
}
Expand Down
17 changes: 10 additions & 7 deletions pkg/gce-pd-csi-driver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,8 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
existingDisk, err := gceCS.CloudProvider.GetDisk(ctx, gceCS.CloudProvider.GetDefaultProject(), volKey, gceAPIVersion)
if err != nil {
if !gce.IsGCEError(err, "notFound") {
return nil, common.LoggedError("CreateVolume, failed to getDisk when validating: ", err)
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
return nil, common.LoggedError("CreateVolume, failed to getDisk when validating: ", status.Error(codes.Unavailable, err.Error()))
}
}
if err == nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I hate that you can't comment on lines outside the PR...)

On 345 there's a !ready check that returns an internal error code --- we should probably turn this into a temporary error?

You know, I think this is going to screw up our slo filtering :-/

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and on line 423

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(offline discussion: our slo filtering will be fine)

Expand All @@ -339,10 +340,10 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre

ready, err := isDiskReady(existingDisk)
if err != nil {
return nil, common.LoggedError("CreateVolume disk "+volKey.String()+" had error checking ready status: ", err)
return nil, status.Errorf(codes.Aborted, "CreateVolume disk %q had error checking ready status: %v", volKey.String(), err.Error())
}
if !ready {
return nil, status.Errorf(codes.Internal, "CreateVolume existing disk %v is not ready", volKey)
return nil, status.Errorf(codes.Aborted, "CreateVolume existing disk %v is not ready", volKey)
}

// If there is no validation error, immediately return success
Expand Down Expand Up @@ -417,10 +418,10 @@ func (gceCS *GCEControllerServer) CreateVolume(ctx context.Context, req *csi.Cre
// Verify the source disk is ready.
ready, err := isDiskReady(diskFromSourceVolume)
if err != nil {
return nil, common.LoggedError("CreateVolume disk from source volume "+sourceVolKey.String()+" had error checking ready status: ", err)
return nil, status.Errorf(codes.Aborted, "CreateVolume disk from source volume %q had error checking ready status: %v", sourceVolKey.String(), err.Error())
}
if !ready {
return nil, status.Errorf(codes.Internal, "CreateVolume disk from source volume %v is not ready", sourceVolKey)
return nil, status.Errorf(codes.Aborted, "CreateVolume disk from source volume %v is not ready", sourceVolKey)
}
}
} else { // if VolumeContentSource is nil, validate access mode is not read only
Expand Down Expand Up @@ -1876,9 +1877,10 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name
if multiWriter {
gceAPIVersion = gce.GCEAPIVersionBeta
}
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion)
if err != nil {
return nil, fmt.Errorf("failed to get disk after creating regional disk: %w", err)
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error())
}
return disk, nil
}
Expand All @@ -1898,9 +1900,10 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam
if multiWriter {
gceAPIVersion = gce.GCEAPIVersionBeta
}
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion)
if err != nil {
return nil, err
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error())
}
return disk, nil
}
Expand Down