Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Nexus failure_reason metric tag #1671

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions internal/common/metrics/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,19 @@ const (

// Metric tag keys
const (
NamespaceTagName = "namespace"
ClientTagName = "client_name"
PollerTypeTagName = "poller_type"
WorkerTypeTagName = "worker_type"
WorkflowTypeNameTagName = "workflow_type"
ActivityTypeNameTagName = "activity_type"
NexusServiceTagName = "nexus_service"
NexusOperationTagName = "nexus_operation"
TaskQueueTagName = "task_queue"
OperationTagName = "operation"
CauseTagName = "cause"
WorkflowTaskFailureReason = "failure_reason"
RequestFailureCode = "status_code"
NamespaceTagName = "namespace"
ClientTagName = "client_name"
PollerTypeTagName = "poller_type"
WorkerTypeTagName = "worker_type"
WorkflowTypeNameTagName = "workflow_type"
ActivityTypeNameTagName = "activity_type"
NexusServiceTagName = "nexus_service"
NexusOperationTagName = "nexus_operation"
FailureReasonTagName = "failure_reason"
TaskQueueTagName = "task_queue"
OperationTagName = "operation"
CauseTagName = "cause"
RequestFailureCode = "status_code"
)

// Metric tag values
Expand Down
9 changes: 8 additions & 1 deletion internal/common/metrics/tags.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ func NexusTags(service, operation, taskQueueName string) map[string]string {
}
}

// NexusTaskFailureTags returns a set of tags for Nexus Operation failures.
func NexusTaskFailureTags(reason string) map[string]string {
return map[string]string{
FailureReasonTagName: reason,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to document, somewhere, all of the possible enumerated values here? Some spec or something for SDKs to follow. I want to make sure it's a fixed enumeration, that all SDKs share the same values, and that we are OK with the casing of the values.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My plan was to put this in our docs in /sdk-metrics. Does that work for you?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure! I would like to confirm the values before merging this (can add them in here if easiest).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So to confirm, it's handler_error_NOT_IMPLEMENTED, i.e. the value is not consistently cased throughout, correct? I don't mind, just checking.

}
}

// TaskQueueTags returns a set of tags for a task queue.
func TaskQueueTags(taskQueue string) map[string]string {
return map[string]string{
Expand All @@ -106,7 +113,7 @@ func PollerTags(pollerType string) map[string]string {
// WorkflowTaskFailedTags returns a set of tags for a workflow task failure.
func WorkflowTaskFailedTags(reason string) map[string]string {
return map[string]string{
WorkflowTaskFailureReason: reason,
FailureReasonTagName: reason,
}
}

Expand Down
17 changes: 15 additions & 2 deletions internal/internal_nexus_task_poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,21 @@ func (ntp *nexusTaskPoller) ProcessTask(task interface{}) error {
// Internal error processing the task.
// Failure from user handler.
// Special case for the start response with operation error.
if err != nil || failure != nil || res.Response.GetStartOperation().GetOperationError() != nil {
metricsHandler.Counter(metrics.NexusTaskExecutionFailedCounter).Inc(1)
if err != nil {
metricsHandler.
WithTags(metrics.NexusTaskFailureTags("internal_sdk_error")).
Counter(metrics.NexusTaskExecutionFailedCounter).
Inc(1)
} else if failure != nil {
metricsHandler.
WithTags(metrics.NexusTaskFailureTags("handler_error_" + failure.GetError().GetErrorType())).
Counter(metrics.NexusTaskExecutionFailedCounter).
Inc(1)
} else if e := res.Response.GetStartOperation().GetOperationError(); e != nil {
metricsHandler.
WithTags(metrics.NexusTaskFailureTags("operation_" + e.GetOperationState())).
Counter(metrics.NexusTaskExecutionFailedCounter).
Inc(1)
}

// Let the poller machinery drop the task, nothing to report back.
Expand Down
21 changes: 11 additions & 10 deletions test/nexus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,11 +146,12 @@ func (tc *testContext) requireTimer(t *assert.CollectT, metric, service, operati
}))
}

func (tc *testContext) requireCounter(t *assert.CollectT, metric, service, operation string) {
func (tc *testContext) requireFailureCounter(t *assert.CollectT, service, operation, failureType string) {
assert.True(t, slices.ContainsFunc(tc.metricsHandler.Counters(), func(ct *metrics.CapturedCounter) bool {
return ct.Name == metric &&
return ct.Name == metrics.NexusTaskExecutionFailedCounter &&
ct.Tags[metrics.NexusServiceTagName] == service &&
ct.Tags[metrics.NexusOperationTagName] == operation
ct.Tags[metrics.NexusOperationTagName] == operation &&
ct.Tags[metrics.FailureReasonTagName] == failureType
}))
}

Expand Down Expand Up @@ -256,7 +257,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "operation_failed")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -271,7 +272,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_INTERNAL")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -286,7 +287,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_BAD_REQUEST")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -301,7 +302,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_BAD_REQUEST")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -316,7 +317,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_INTERNAL")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -331,7 +332,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_BAD_REQUEST")
}, time.Second*3, time.Millisecond*100)
})

Expand All @@ -346,7 +347,7 @@ func TestNexusSyncOperation(t *testing.T) {
tc.requireTimer(t, metrics.NexusTaskEndToEndLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskScheduleToStartLatency, service.Name, syncOp.Name())
tc.requireTimer(t, metrics.NexusTaskExecutionLatency, service.Name, syncOp.Name())
tc.requireCounter(t, metrics.NexusTaskExecutionFailedCounter, service.Name, syncOp.Name())
tc.requireFailureCounter(t, service.Name, syncOp.Name(), "handler_error_INTERNAL")
}, time.Second*3, time.Millisecond*100)
})
}
Expand Down
Loading