Skip to content

Commit

Permalink
Record original error code to operation_errors metric for temporary e…
Browse files Browse the repository at this point in the history
…rrors
  • Loading branch information
amacaskill committed Apr 10, 2024
1 parent cb041f3 commit e0054fd
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 13 deletions.
54 changes: 54 additions & 0 deletions pkg/common/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

// TemporaryError wraps an error that needs to be returned to the CSI Provisioner
// with a temporary Error code. It implements the error interface.
type TemporaryError struct {
err error
code codes.Code
}

// Unwrap extracts the original error.
func (te *TemporaryError) Unwrap() error {
return te.err
}

// GRPCStatus extracts the underlying gRPC Status error.
// This method is necessary to fulfill the interface
// described in https://pkg.go.dev/google.golang.org/grpc/status#FromError.
// FromError is used in CodeForError to get existing error codes from status errors.
func (te *TemporaryError) GRPCStatus() *status.Status {
if te.err == nil {
return status.New(codes.OK, "")
}
return status.New(te.code, te.err.Error())
}

func NewTemporaryError(code codes.Code, err error) *TemporaryError {
return &TemporaryError{err: err, code: code}
}

// Error returns a readable representation of the TemporaryError.
func (te *TemporaryError) Error() string {
return te.err.Error()
}
1 change: 1 addition & 0 deletions pkg/common/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ func existingErrorCode(err error) (codes.Code, error) {

func LoggedError(msg string, err error) error {
klog.Errorf(msg+"%v", err.Error())

return status.Errorf(CodeForError(err), msg+"%v", err.Error())
}

Expand Down
20 changes: 20 additions & 0 deletions pkg/common/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,26 @@ func TestCodeForError(t *testing.T) {
inputErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
expCode: codes.InvalidArgument,
},
{
name: "TemporaryError that wraps googleapi error",
inputErr: &TemporaryError{code: codes.Unavailable, err: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}},
expCode: codes.Unavailable,
},
{
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
inputErr: &TemporaryError{code: codes.Aborted, err: fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})},
expCode: codes.Aborted,
},
{
name: "TemporaryError that wraps status error",
inputErr: &TemporaryError{code: codes.Aborted, err: status.Error(codes.Aborted, "aborted error")},
expCode: codes.Aborted,
},
{
name: "TemporaryError that wraps context canceled error",
inputErr: &TemporaryError{code: codes.Aborted, err: context.Canceled},
expCode: codes.Aborted,
},
}

for _, tc := range testCases {
Expand Down
12 changes: 6 additions & 6 deletions pkg/gce-cloud-provider/compute/gce-compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
if err != nil {
// failed to GetDisk, however the Disk may already exist
// the error code should be non-Final
return status.Error(codes.Unavailable, err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -586,7 +586,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -598,7 +598,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
return nil
}
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
}
return nil
}
Expand Down Expand Up @@ -702,7 +702,7 @@ func (cloud *CloudProvider) insertZonalDisk(
if err != nil {
// failed to GetDisk, however the Disk may already exist
// the error code should be non-Final
return status.Error(codes.Unavailable, err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -727,7 +727,7 @@ func (cloud *CloudProvider) insertZonalDisk(
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -739,7 +739,7 @@ func (cloud *CloudProvider) insertZonalDisk(
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
return nil
}
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
}
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/gce-pd-csi-driver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2001,7 +2001,7 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion)
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error())
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating regional disk: %w", err))
}
return disk, nil
}
Expand All @@ -2024,7 +2024,7 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion)
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error())
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating zonal disk: %w", err))
}
return disk, nil
}
Expand Down
24 changes: 19 additions & 5 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package metrics

import (
"errors"
"fmt"
"net/http"
"os"
Expand Down Expand Up @@ -97,11 +98,9 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
diskType string,
enableConfidentialStorage string,
enableStoragePools string) {
err := codes.OK.String()
if operationErr != nil {
err = common.CodeForError(operationErr).String()
}
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, err, diskType, enableConfidentialStorage, enableStoragePools).Inc()
errCode := errorCodeLabelValue(operationErr)
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc()
klog.Infof("Recorded PDCSI operation error code: %q", errCode)
}

func (mm *MetricsManager) EmitGKEComponentVersion() error {
Expand Down Expand Up @@ -169,3 +168,18 @@ func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) {
}
return diskType, enableConfidentialStorage, enableStoragePools
}

// errorCodeLabelValue returns the label value for the given operation error.
// This was separated into a helper function for unit testing purposes.
func errorCodeLabelValue(operationErr error) string {
err := codes.OK.String()
if operationErr != nil {
// If the operationErr is a TemporaryError, unwrap the temporary error before passing it to CodeForError.
var tempErr *common.TemporaryError
if errors.As(operationErr, &tempErr) {
operationErr = tempErr.Unwrap()
}
err = common.CodeForError(operationErr).String()
}
return err
}
77 changes: 77 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,21 @@ limitations under the License.
package metrics

import (
"context"
"errors"
"fmt"
"net/http"
"testing"

"github.com/google/go-cmp/cmp"
computealpha "google.golang.org/api/compute/v0.alpha"
computebeta "google.golang.org/api/compute/v0.beta"
"google.golang.org/api/compute/v1"
"google.golang.org/api/googleapi"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
)

Expand Down Expand Up @@ -115,3 +125,70 @@ func TestGetMetricParameters(t *testing.T) {
}
}
}

func TestErrorCodeLabelValue(t *testing.T) {
testCases := []struct {
name string
operationErr error
wantErrorCode string
}{
{
name: "Not googleapi.Error",
operationErr: errors.New("I am not a googleapi.Error"),
wantErrorCode: "Internal",
},
{
name: "User error",
operationErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
wantErrorCode: "InvalidArgument",
},
{
name: "googleapi.Error but not a user error",
operationErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
wantErrorCode: "Internal",
},
{
name: "context canceled error",
operationErr: context.Canceled,
wantErrorCode: "Canceled",
},
{
name: "context deadline exceeded error",
operationErr: context.DeadlineExceeded,
wantErrorCode: "DeadlineExceeded",
},
{
name: "status error with Aborted error code",
operationErr: status.Error(codes.Aborted, "aborted error"),
wantErrorCode: "Aborted",
},
{
name: "user multiattach error",
operationErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
wantErrorCode: "InvalidArgument",
},
{
name: "TemporaryError that wraps googleapi error",
operationErr: common.NewTemporaryError(codes.Unavailable, &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}),
wantErrorCode: "InvalidArgument",
},
{
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
operationErr: common.NewTemporaryError(codes.Aborted, fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})),
wantErrorCode: "InvalidArgument",
},
{
name: "TemporaryError that wraps status error",
operationErr: common.NewTemporaryError(codes.Aborted, status.Error(codes.InvalidArgument, "User error with bad request")),
wantErrorCode: "InvalidArgument",
},
}

for _, tc := range testCases {
t.Logf("Running test: %v", tc.name)
errCode := errorCodeLabelValue(tc.operationErr)
if diff := cmp.Diff(tc.wantErrorCode, errCode); diff != "" {
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
}
}
}

0 comments on commit e0054fd

Please sign in to comment.