Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce new metrics to track API call duration and Update status.Status to capture underlying cause #842

Merged
merged 2 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions pkg/util/provider/machinecodes/status/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ type Status struct {
// [google.rpc.Status.details][google.rpc.Status.details] field, or localized
// by the client.
message string
// cause captures the underlying error
cause error
}

// Code returns the status code contained in status.
Expand All @@ -63,7 +65,15 @@ func (s *Status) Message() string {
return s.message
}

// Cause returns the underlying error captured in the status, or nil when no
// cause was recorded.
//
// It is safe to call on a nil *Status: in that case there is no cause to
// report, so nil is returned instead of panicking on the field access.
func (s *Status) Cause() error {
	if s == nil {
		return nil
	}
	return s.cause
}

// Error renders the status as an error string of the fixed form
// "machine codes error: code = [<code>] message = [<message>]".
//
// WARNING: this layout is an unwritten contract for every user of
// status.Status and MUST NOT be changed. The error code is expected inside the
// first pair of square brackets and the error message inside the second pair.
// Any square brackets that appear after those two are ignored when parsing.
func (s *Status) Error() string {
	const layout = "machine codes error: code = [%s] message = [%s]"
	code, message := s.Code(), s.Message()
	return fmt.Sprintf(layout, code, message)
}
Expand All @@ -78,6 +88,15 @@ func Error(c codes.Code, msg string) error {
return New(c, msg)
}

// WrapError builds a status.Status that carries the given code and custom
// error message while retaining cause as the underlying error.
func WrapError(c codes.Code, msg string, cause error) *Status {
	wrapped := new(Status)
	wrapped.code = int32(c)
	wrapped.message = msg
	wrapped.cause = cause
	return wrapped
}

// FromError returns a Status representing err if it was produced from this
// package or has a method `GRPCStatus() *Status`. Otherwise, ok is false and a
// Status is returned with codes.Unknown and the original error message.
Expand All @@ -88,10 +107,17 @@ func FromError(err error) (s *Status, ok bool) {

if matches, errInFind := findInString(err.Error()); errInFind == nil {
code := codes.StringToCode(matches[0])
return New(code, matches[1]), true
return &Status{
code: int32(code),
message: matches[1],
}, true
}

return New(codes.Unknown, err.Error()), false
return &Status{
code: int32(codes.Unknown),
message: err.Error(),
cause: err,
}, false
unmarshall marked this conversation as resolved.
Show resolved Hide resolved
}

// findInString: TODO check whether this logic can be optimized.
Expand Down
32 changes: 31 additions & 1 deletion pkg/util/provider/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ var (
Help: "Number of Failed Cloud Service API requests, partitioned by provider, and service.",
}, []string{"provider", "service"},
)

// APIRequestDuration is a histogram of provider API request durations,
// measured in seconds and labelled by "provider" and "service".
// It is intended to record successful provider API calls; which calls are
// actually observed is decided at the call sites, not here.
// No Buckets are set in the options, so the Prometheus client's default
// histogram buckets (prometheus.DefBuckets) are used.
APIRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: cloudAPISubsystem,
Name: "api_request_duration_seconds",
Help: "Time(in seconds) it takes for a provider API request to complete",
}, []string{"provider", "service"})
himanshu-kun marked this conversation as resolved.
Show resolved Hide resolved

// DriverAPIRequestDuration is a histogram of driver API request durations,
// measured in seconds and labelled by "provider" and "operation".
// It is intended to record successful driver API calls; which calls are
// actually observed is decided at the call sites, not here.
// No Buckets are set in the options, so the Prometheus client's default
// histogram buckets (prometheus.DefBuckets) are used.
DriverAPIRequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: cloudAPISubsystem,
Name: "driver_request_duration_seconds",
Help: "Total time (in seconds) taken for a driver API request to complete",
}, []string{"provider", "operation"})

// DriverFailedAPIRequests is a counter of failed driver API requests.
// Each increment is labelled by "provider", "operation" and "error_code", so
// failures can be broken down along any of those three dimensions.
DriverFailedAPIRequests = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: cloudAPISubsystem,
Name: "driver_requests_failed_total",
Help: "Number of failed Driver API requests, partitioned by provider, operation and error code",
}, []string{"provider", "operation", "error_code"})
)

// variables for subsystem: misc
Expand All @@ -103,6 +130,9 @@ func registerMachineSubsystemMetrics() {
// registerCloudAPISubsystemMetrics registers the request counters, the request
// duration histograms and the driver failure counter with the default
// Prometheus registerer. Collectors are registered in the order listed, and
// MustRegister panics on the first registration that fails.
func registerCloudAPISubsystemMetrics() {
	prometheus.MustRegister(
		APIRequestCount,
		APIFailedRequestCount,
		APIRequestDuration,
		DriverAPIRequestDuration,
		DriverFailedAPIRequests,
	)
}

func registerMiscellaneousMetrics() {
Expand All @@ -113,4 +143,4 @@ func init() {
registerMachineSubsystemMetrics()
registerCloudAPISubsystemMetrics()
registerMiscellaneousMetrics()
}
}