Skip to content

Commit

Permalink
Introduce new metrics to track API call duration and Update status.St…
Browse files Browse the repository at this point in the history
…atus to capture underlying cause (#842)

* added histogram metrics to capture invocation duration of provider APIs

changed the name of the API request duration metric

added metric to capture overall duration for driver api calls

reverted back to using service label for provider api metric

added cause to status.Status

added WrapError function

corrected docstrings

corrected leftover docstring

Update pkg/util/provider/metrics/metrics.go

Co-authored-by: Himanshu Sharma <[email protected]>

Apply suggestions from code review

Co-authored-by: Himanshu Sharma <[email protected]>

addressed review comment - stopped populating cause in FromError when err is already a status.Status

* added Driver failed API request counter
  • Loading branch information
unmarshall authored Sep 5, 2023
1 parent b7fe287 commit 61d9417
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 3 deletions.
30 changes: 28 additions & 2 deletions pkg/util/provider/machinecodes/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ type Status struct {
// [google.rpc.Status.details][google.rpc.Status.details] field, or localized
// by the client.
message string
// cause captures the underlying error
cause error
}

// Code returns the status code contained in status.
Expand All @@ -63,7 +65,15 @@ func (s *Status) Message() string {
return s.message
}

// Cause returns the underlying error captured in the status, or nil if no
// cause was recorded.
func (s *Status) Cause() error {
	return s.cause
}

// Unwrap exposes the underlying cause to the standard errors package so that
// errors.Is, errors.As and errors.Unwrap can traverse through a Status to the
// error it wraps. It returns nil when the receiver is nil or when no cause was
// captured.
func (s *Status) Unwrap() error {
	if s == nil {
		// A typed-nil *Status stored in an error interface must not panic
		// while the errors package walks the chain.
		return nil
	}
	return s.cause
}

// Error renders the status as a single-line error string.
//
// WARNING: the layout of this string is an implicit contract. The error code
// must appear inside the first pair of square brackets and the message inside
// the second pair, because FromError recovers both values by parsing this
// text. Do not alter that layout. Per the original note, square brackets that
// appear after the code and message are ignored by the parser.
func (s *Status) Error() string {
	code, msg := s.Code(), s.Message()
	return fmt.Sprintf("machine codes error: code = [%s] message = [%s]", code, msg)
}
Expand All @@ -78,6 +88,15 @@ func Error(c codes.Code, msg string) error {
return New(c, msg)
}

// WrapError builds a *Status carrying the given code and custom message, and
// records cause as the underlying error that led to this status.
func WrapError(c codes.Code, msg string, cause error) *Status {
	wrapped := new(Status)
	wrapped.code = int32(c)
	wrapped.message = msg
	wrapped.cause = cause
	return wrapped
}

// FromError returns a Status representing err if it was produced from this
// package or has a method `GRPCStatus() *Status`. Otherwise, ok is false and a
// Status is returned with codes.Unknown and the original error message.
Expand All @@ -88,10 +107,17 @@ func FromError(err error) (s *Status, ok bool) {

if matches, errInFind := findInString(err.Error()); errInFind == nil {
code := codes.StringToCode(matches[0])
return New(code, matches[1]), true
return &Status{
code: int32(code),
message: matches[1],
}, true
}

return New(codes.Unknown, err.Error()), false
return &Status{
code: int32(codes.Unknown),
message: err.Error(),
cause: err,
}, false
}

// findInString TODO: check whether this logic can be optimized.
Expand Down
32 changes: 31 additions & 1 deletion pkg/util/provider/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ var (
Help: "Number of Failed Cloud Service API requests, partitioned by provider, and service.",
}, []string{"provider", "service"},
)

// APIRequestDuration is a histogram vector of provider API request durations,
// measured in seconds. Observations are partitioned by the "provider" and
// "service" labels.
// No Buckets are configured here, so the Prometheus client's default buckets apply.
// NOTE(review): the original comment states that only successful calls are
// recorded; the call sites are not visible here — confirm.
APIRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: cloudAPISubsystem,
Name: "api_request_duration_seconds",
Help: "Time(in seconds) it takes for a provider API request to complete",
}, []string{"provider", "service"})

// DriverAPIRequestDuration is a histogram vector of the total time, in seconds,
// taken by driver API requests. Observations are partitioned by the "provider"
// and "operation" labels.
// No Buckets are configured here, so the Prometheus client's default buckets apply.
// NOTE(review): the original comment states that only successful calls are
// recorded; the call sites are not visible here — confirm.
DriverAPIRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: cloudAPISubsystem,
Name: "driver_request_duration_seconds",
Help: "Total time (in seconds) taken for a driver API request to complete",
}, []string{"provider", "operation"})

// DriverFailedAPIRequests is a counter vector of failed driver API requests.
// Counts are partitioned by the "provider", "operation" and "error_code" labels,
// so failures can be broken down per driver operation and per error code.
DriverFailedAPIRequests = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: cloudAPISubsystem,
Name: "driver_requests_failed_total",
Help: "Number of failed Driver API requests, partitioned by provider, operation and error code",
}, []string{"provider", "operation", "error_code"})
)

// variables for subsystem: misc
Expand All @@ -103,6 +130,9 @@ func registerMachineSubsystemMetrics() {
// registerCloudAPISubsystemMetrics registers every cloud-API subsystem
// collector (request counters, duration histograms and the driver failure
// counter) with the default Prometheus registry. MustRegister panics if any
// registration fails.
func registerCloudAPISubsystemMetrics() {
	prometheus.MustRegister(
		APIRequestCount,
		APIFailedRequestCount,
		APIRequestDuration,
		DriverAPIRequestDuration,
		DriverFailedAPIRequests,
	)
}

func registerMiscellaneousMetrics() {
Expand All @@ -113,4 +143,4 @@ func init() {
registerMachineSubsystemMetrics()
registerCloudAPISubsystemMetrics()
registerMiscellaneousMetrics()
}
}

0 comments on commit 61d9417

Please sign in to comment.