From 4638c99200cb240a49ecf0875a21858e7e4933fd Mon Sep 17 00:00:00 2001 From: Alexis MacAskill Date: Mon, 8 Apr 2024 16:22:58 +0000 Subject: [PATCH] Record original error code to operation_errors metric for temporary errors --- pkg/common/errors.go | 58 +++++++++++++ pkg/common/utils_test.go | 20 +++++ pkg/gce-cloud-provider/compute/gce-compute.go | 12 +-- pkg/gce-pd-csi-driver/controller.go | 4 +- pkg/metrics/metrics.go | 24 ++++-- pkg/metrics/metrics_test.go | 82 +++++++++++++++++++ 6 files changed, 187 insertions(+), 13 deletions(-) create mode 100644 pkg/common/errors.go diff --git a/pkg/common/errors.go b/pkg/common/errors.go new file mode 100644 index 0000000000..afd83e0822 --- /dev/null +++ b/pkg/common/errors.go @@ -0,0 +1,58 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package common + +import ( + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// TemporaryError wraps an error with a temporary error code. +// It implements the error interface. Do not return TemporaryError +// directly from CSI Spec API calls, as CSI Spec API calls MUST +// return a standard gRPC status. If TemporaryErrors are returned from +// helper functions within a CSI Spec API method, make sure the outer CSI +// Spec API method returns a standard gRPC status. (e.g. LoggedError(tempErr) ) +type TemporaryError struct { + err error + code codes.Code +} + +// Unwrap extracts the original error. +func (te *TemporaryError) Unwrap() error { + return te.err +} + +// GRPCStatus extracts the underlying gRPC Status error. +// This method is necessary to fulfill the grpcstatus interface +// described in https://pkg.go.dev/google.golang.org/grpc/status#FromError. +// FromError is used in CodeForError to get existing error codes from status errors. +func (te *TemporaryError) GRPCStatus() *status.Status { + if te.err == nil { + return status.New(codes.OK, "") + } + return status.New(te.code, te.err.Error()) +} + +func NewTemporaryError(code codes.Code, err error) *TemporaryError { + return &TemporaryError{err: err, code: code} +} + +// Error returns a readable representation of the TemporaryError. +func (te *TemporaryError) Error() string { + return te.err.Error() +} diff --git a/pkg/common/utils_test.go b/pkg/common/utils_test.go index de7d27ba3f..8212f278ec 100644 --- a/pkg/common/utils_test.go +++ b/pkg/common/utils_test.go @@ -1021,6 +1021,26 @@ func TestCodeForError(t *testing.T) { inputErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"), expCode: codes.InvalidArgument, }, + { + name: "TemporaryError that wraps googleapi error", + inputErr: &TemporaryError{code: codes.Unavailable, err: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}}, + expCode: codes.Unavailable, + }, + { + name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error", + inputErr: &TemporaryError{code: codes.Aborted, err: fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})}, + expCode: codes.Aborted, + }, + { + name: "TemporaryError that wraps status error", + inputErr: &TemporaryError{code: codes.Aborted, err: status.Error(codes.Aborted, "aborted error")}, + expCode: codes.Aborted, + }, + { + name: "TemporaryError that wraps context canceled error", + inputErr: &TemporaryError{code: codes.Aborted, err: context.Canceled}, + expCode: codes.Aborted, + }, } for _, tc := range testCases { diff --git a/pkg/gce-cloud-provider/compute/gce-compute.go b/pkg/gce-cloud-provider/compute/gce-compute.go index 7da5de7ecc..05c96ef595 100644 --- a/pkg/gce-cloud-provider/compute/gce-compute.go +++ b/pkg/gce-cloud-provider/compute/gce-compute.go @@ -522,7 +522,7 @@ func (cloud *CloudProvider) insertRegionalDisk( if err != nil { // failed to GetDisk, however the Disk may already exist // the error code should be non-Final - return status.Error(codes.Unavailable, err.Error()) + return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err)) } err = cloud.ValidateExistingDisk(ctx, disk, params, int64(capacityRange.GetRequiredBytes()), @@ -546,7 +546,7 @@ func (cloud *CloudProvider) insertRegionalDisk( if IsGCEError(err, "alreadyExists") { disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion) if err != nil { - return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error()) + return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err)) } err = cloud.ValidateExistingDisk(ctx, disk, params, int64(capacityRange.GetRequiredBytes()), @@ -558,7 +558,7 @@ func (cloud *CloudProvider) insertRegionalDisk( klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name) return nil } - return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error()) + return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err)) } return nil } @@ -653,7 +653,7 @@ func (cloud *CloudProvider) insertZonalDisk( if err != nil { // failed to GetDisk, however the Disk may already exist // the error code should be non-Final - return status.Error(codes.Unavailable, err.Error()) + return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err)) } err = cloud.ValidateExistingDisk(ctx, disk, params, int64(capacityRange.GetRequiredBytes()), @@ -678,7 +678,7 @@ func (cloud *CloudProvider) insertZonalDisk( if IsGCEError(err, "alreadyExists") { disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion) if err != nil { - return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error()) + return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err)) } err = cloud.ValidateExistingDisk(ctx, disk, params, int64(capacityRange.GetRequiredBytes()), @@ -690,7 +690,7 @@ func (cloud *CloudProvider) insertZonalDisk( klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name) return nil } - return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error()) + return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err)) } return nil } diff --git a/pkg/gce-pd-csi-driver/controller.go b/pkg/gce-pd-csi-driver/controller.go index 733694059e..2df9ac2b01 100644 --- a/pkg/gce-pd-csi-driver/controller.go +++ b/pkg/gce-pd-csi-driver/controller.go @@ -2001,7 +2001,7 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name // failed to GetDisk, however the Disk may already be created, the error code should be non-Final disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion) if err != nil { - return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error()) + return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating regional disk: %w", err)) } return disk, nil } @@ -2024,7 +2024,7 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam // failed to GetDisk, however the Disk may already be created, the error code should be non-Final disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion) if err != nil { - return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error()) + return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating zonal disk: %w", err)) } return disk, nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index d37c666f50..8a998f0890 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -17,6 +17,7 @@ limitations under the License. package metrics import ( + "errors" "fmt" "net/http" "os" @@ -97,11 +98,9 @@ func (mm *MetricsManager) RecordOperationErrorMetrics( diskType string, enableConfidentialStorage string, enableStoragePools string) { - err := codes.OK.String() - if operationErr != nil { - err = common.CodeForError(operationErr).String() - } - pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, err, diskType, enableConfidentialStorage, enableStoragePools).Inc() + errCode := errorCodeLabelValue(operationErr) + pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc() + klog.Infof("Recorded PDCSI operation error code: %q", errCode) } func (mm *MetricsManager) EmitGKEComponentVersion() error { @@ -169,3 +168,18 @@ func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) { } return diskType, enableConfidentialStorage, enableStoragePools } + +// errorCodeLabelValue returns the label value for the given operation error. +// This was separated into a helper function for unit testing purposes. +func errorCodeLabelValue(operationErr error) string { + err := codes.OK.String() + if operationErr != nil { + // If the operationErr is a TemporaryError, unwrap the temporary error before passing it to CodeForError. + var tempErr *common.TemporaryError + if errors.As(operationErr, &tempErr) { + operationErr = tempErr.Unwrap() + } + err = common.CodeForError(operationErr).String() + } + return err +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 919cb8388f..62499a5755 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -18,9 +18,19 @@ limitations under the License. package metrics import ( + "context" + "errors" + "fmt" + "net/http" "testing" + "github.com/google/go-cmp/cmp" "google.golang.org/api/compute/v1" + "google.golang.org/api/googleapi" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute" ) @@ -101,3 +111,75 @@ func TestGetMetricParameters(t *testing.T) { } } } + +func TestErrorCodeLabelValue(t *testing.T) { + testCases := []struct { + name string + operationErr error + wantErrorCode string + }{ + { + name: "Not googleapi.Error", + operationErr: errors.New("I am not a googleapi.Error"), + wantErrorCode: "Internal", + }, + { + name: "User error", + operationErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}, + wantErrorCode: "InvalidArgument", + }, + { + name: "googleapi.Error but not a user error", + operationErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"}, + wantErrorCode: "Internal", + }, + { + name: "context canceled error", + operationErr: context.Canceled, + wantErrorCode: "Canceled", + }, + { + name: "context deadline exceeded error", + operationErr: context.DeadlineExceeded, + wantErrorCode: "DeadlineExceeded", + }, + { + name: "status error with Aborted error code", + operationErr: status.Error(codes.Aborted, "aborted error"), + wantErrorCode: "Aborted", + }, + { + name: "user multiattach error", + operationErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"), + wantErrorCode: "InvalidArgument", + }, + { + name: "TemporaryError that wraps googleapi error", + operationErr: common.NewTemporaryError(codes.Unavailable, &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}), + wantErrorCode: "InvalidArgument", + }, + { + name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error", + operationErr: common.NewTemporaryError(codes.Aborted, fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})), + wantErrorCode: "InvalidArgument", + }, + { + name: "TemporaryError that wraps status error", + operationErr: common.NewTemporaryError(codes.Aborted, status.Error(codes.InvalidArgument, "User error with bad request")), + wantErrorCode: "InvalidArgument", + }, + { + name: "TemporaryError that wraps multiattach error", + operationErr: common.NewTemporaryError(codes.Unavailable, fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'")), + wantErrorCode: "InvalidArgument", + }, + } + + for _, tc := range testCases { + t.Logf("Running test: %v", tc.name) + errCode := errorCodeLabelValue(tc.operationErr) + if diff := cmp.Diff(tc.wantErrorCode, errCode); diff != "" { + t.Errorf("%s: -want err, +got err\n%s", tc.name, diff) + } + } +}