Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Record original error code to operation_errors metric for temporary errors #1664

Merged
merged 1 commit into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions pkg/common/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package common

import (
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

// TemporaryError wraps an error with a temporary error code.
// It implements the error interface. Do not return TemporaryError
// directly from CSI Spec API calls, as CSI Spec API calls MUST
// return a standard gRPC status. If TemporaryErrors are returned from
// helper functions within a CSI Spec API method, make sure the outer CSI
// Spec API method returns a standard gRPC status. (e.g. LoggedError(tempErr) )
type TemporaryError struct {
err error
code codes.Code
}

// Unwrap extracts the original error.
func (te *TemporaryError) Unwrap() error {
return te.err
}

// GRPCStatus extracts the underlying gRPC Status error.
// This method is necessary to fulfill the grpcstatus interface
// described in https://pkg.go.dev/google.golang.org/grpc/status#FromError.
// FromError is used in CodeForError to get existing error codes from status errors.
func (te *TemporaryError) GRPCStatus() *status.Status {
if te.err == nil {
return status.New(codes.OK, "")
}
return status.New(te.code, te.err.Error())
}

func NewTemporaryError(code codes.Code, err error) *TemporaryError {
return &TemporaryError{err: err, code: code}
}

// Error returns a readable representation of the TemporaryError.
func (te *TemporaryError) Error() string {
return te.err.Error()
}
20 changes: 20 additions & 0 deletions pkg/common/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,26 @@ func TestCodeForError(t *testing.T) {
inputErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
expCode: codes.InvalidArgument,
},
{
name: "TemporaryError that wraps googleapi error",
inputErr: &TemporaryError{code: codes.Unavailable, err: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}},
expCode: codes.Unavailable,
},
{
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
inputErr: &TemporaryError{code: codes.Aborted, err: fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})},
expCode: codes.Aborted,
},
{
name: "TemporaryError that wraps status error",
inputErr: &TemporaryError{code: codes.Aborted, err: status.Error(codes.Aborted, "aborted error")},
expCode: codes.Aborted,
},
{
name: "TemporaryError that wraps context canceled error",
inputErr: &TemporaryError{code: codes.Aborted, err: context.Canceled},
expCode: codes.Aborted,
},
}

for _, tc := range testCases {
Expand Down
12 changes: 6 additions & 6 deletions pkg/gce-cloud-provider/compute/gce-compute.go
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
if err != nil {
// failed to GetDisk, however the Disk may already exist
// the error code should be non-Final
return status.Error(codes.Unavailable, err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -546,7 +546,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -558,7 +558,7 @@ func (cloud *CloudProvider) insertRegionalDisk(
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
return nil
}
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
}
return nil
}
Expand Down Expand Up @@ -653,7 +653,7 @@ func (cloud *CloudProvider) insertZonalDisk(
if err != nil {
// failed to GetDisk, however the Disk may already exist
// the error code should be non-Final
return status.Error(codes.Unavailable, err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -678,7 +678,7 @@ func (cloud *CloudProvider) insertZonalDisk(
if IsGCEError(err, "alreadyExists") {
disk, err := cloud.GetDisk(ctx, project, volKey, gceAPIVersion)
if err != nil {
return status.Errorf(codes.Unavailable, "error when getting disk: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("error when getting disk: %w", err))
}
err = cloud.ValidateExistingDisk(ctx, disk, params,
int64(capacityRange.GetRequiredBytes()),
Expand All @@ -690,7 +690,7 @@ func (cloud *CloudProvider) insertZonalDisk(
klog.Warningf("GCE PD %s already exists after wait, reusing", volKey.Name)
return nil
}
return status.Errorf(codes.Unavailable, "unknown error when polling the operation: %v", err.Error())
return common.NewTemporaryError(codes.Unavailable, fmt.Errorf("unknown error when polling the operation: %w", err))
}
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/gce-pd-csi-driver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2001,7 +2001,7 @@ func createRegionalDisk(ctx context.Context, cloudProvider gce.GCECompute, name
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
disk, err := cloudProvider.GetDisk(ctx, project, meta.RegionalKey(name, region), gceAPIVersion)
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating regional disk: %v", err.Error())
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating regional disk: %w", err))
}
return disk, nil
}
Expand All @@ -2024,7 +2024,7 @@ func createSingleZoneDisk(ctx context.Context, cloudProvider gce.GCECompute, nam
// failed to GetDisk, however the Disk may already be created, the error code should be non-Final
disk, err := cloudProvider.GetDisk(ctx, project, meta.ZonalKey(name, diskZone), gceAPIVersion)
if err != nil {
return nil, status.Errorf(codes.Unavailable, "failed to get disk after creating zonal disk: %v", err.Error())
return nil, common.NewTemporaryError(codes.Unavailable, fmt.Errorf("failed to get disk after creating zonal disk: %w", err))
}
return disk, nil
}
Expand Down
24 changes: 19 additions & 5 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package metrics

import (
"errors"
"fmt"
"net/http"
"os"
Expand Down Expand Up @@ -97,11 +98,9 @@ func (mm *MetricsManager) RecordOperationErrorMetrics(
diskType string,
enableConfidentialStorage string,
enableStoragePools string) {
err := codes.OK.String()
if operationErr != nil {
err = common.CodeForError(operationErr).String()
}
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, err, diskType, enableConfidentialStorage, enableStoragePools).Inc()
errCode := errorCodeLabelValue(operationErr)
pdcsiOperationErrorsMetric.WithLabelValues(pdcsiDriverName, "/csi.v1.Controller/"+operationName, errCode, diskType, enableConfidentialStorage, enableStoragePools).Inc()
klog.Infof("Recorded PDCSI operation error code: %q", errCode)
}

func (mm *MetricsManager) EmitGKEComponentVersion() error {
Expand Down Expand Up @@ -169,3 +168,18 @@ func GetMetricParameters(disk *gce.CloudDisk) (string, string, string) {
}
return diskType, enableConfidentialStorage, enableStoragePools
}

// errorCodeLabelValue returns the label value for the given operation error.
// This was separated into a helper function for unit testing purposes.
func errorCodeLabelValue(operationErr error) string {
err := codes.OK.String()
if operationErr != nil {
// If the operationErr is a TemporaryError, unwrap the temporary error before passing it to CodeForError.
var tempErr *common.TemporaryError
if errors.As(operationErr, &tempErr) {
operationErr = tempErr.Unwrap()
}
err = common.CodeForError(operationErr).String()
}
return err
}
82 changes: 82 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,19 @@ limitations under the License.
package metrics

import (
"context"
"errors"
"fmt"
"net/http"
"testing"

"github.com/google/go-cmp/cmp"
"google.golang.org/api/compute/v1"
"google.golang.org/api/googleapi"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"

"sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common"
gce "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/gce-cloud-provider/compute"
)

Expand Down Expand Up @@ -101,3 +111,75 @@ func TestGetMetricParameters(t *testing.T) {
}
}
}

func TestErrorCodeLabelValue(t *testing.T) {
testCases := []struct {
name string
operationErr error
wantErrorCode string
}{
{
name: "Not googleapi.Error",
operationErr: errors.New("I am not a googleapi.Error"),
wantErrorCode: "Internal",
},
{
name: "User error",
operationErr: &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"},
wantErrorCode: "InvalidArgument",
},
{
name: "googleapi.Error but not a user error",
operationErr: &googleapi.Error{Code: http.StatusInternalServerError, Message: "Internal error"},
wantErrorCode: "Internal",
},
{
name: "context canceled error",
operationErr: context.Canceled,
wantErrorCode: "Canceled",
},
{
name: "context deadline exceeded error",
operationErr: context.DeadlineExceeded,
wantErrorCode: "DeadlineExceeded",
},
{
name: "status error with Aborted error code",
operationErr: status.Error(codes.Aborted, "aborted error"),
wantErrorCode: "Aborted",
},
{
name: "user multiattach error",
operationErr: fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'"),
wantErrorCode: "InvalidArgument",
},
{
name: "TemporaryError that wraps googleapi error",
operationErr: common.NewTemporaryError(codes.Unavailable, &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"}),
wantErrorCode: "InvalidArgument",
},
{
name: "TemporaryError that wraps fmt.Errorf, which wraps googleapi error",
operationErr: common.NewTemporaryError(codes.Aborted, fmt.Errorf("got error: %w", &googleapi.Error{Code: http.StatusBadRequest, Message: "User error with bad request"})),
wantErrorCode: "InvalidArgument",
},
{
name: "TemporaryError that wraps status error",
operationErr: common.NewTemporaryError(codes.Aborted, status.Error(codes.InvalidArgument, "User error with bad request")),
wantErrorCode: "InvalidArgument",
},
amacaskill marked this conversation as resolved.
Show resolved Hide resolved
{
name: "TemporaryError that wraps multiattach error",
operationErr: common.NewTemporaryError(codes.Unavailable, fmt.Errorf("The disk resource 'projects/foo/disk/bar' is already being used by 'projects/foo/instances/1'")),
wantErrorCode: "InvalidArgument",
},
}

for _, tc := range testCases {
t.Logf("Running test: %v", tc.name)
errCode := errorCodeLabelValue(tc.operationErr)
if diff := cmp.Diff(tc.wantErrorCode, errCode); diff != "" {
t.Errorf("%s: -want err, +got err\n%s", tc.name, diff)
}
}
}