Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Observability] Foundation for load testing telemetry #832

Merged
merged 42 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
ff762ea
measure begin/end blockers
okdas Sep 24, 2024
1aad6ca
--wip-- [skip ci]
okdas Sep 24, 2024
ee84b91
Merge remote-tracking branch 'origin/main' into add-metrics
okdas Sep 25, 2024
025698a
metrixxx
okdas Sep 26, 2024
b7c5bd3
--wip-- [skip ci]
okdas Sep 26, 2024
549710e
bring back original
okdas Sep 26, 2024
208f824
TODO: figure out why prometheus doesn't scrape anymore
okdas Sep 27, 2024
90624df
log level and reduce verbosity of some logs
okdas Sep 27, 2024
8fa8e04
increase the stakes to run the load-test
okdas Sep 27, 2024
8bf2212
Merge branch 'main' into add-metrics
okdas Oct 2, 2024
0262062
--wip-- [skip ci]
okdas Oct 1, 2024
d39fb8f
--wip-- [skip ci]
okdas Oct 2, 2024
69f3df5
--wip-- [skip ci]
okdas Oct 3, 2024
fbfe6d1
--wip-- [skip ci]
okdas Oct 3, 2024
cfa7dc7
--wip-- [skip ci]
okdas Oct 4, 2024
050687a
add histogram
okdas Oct 4, 2024
4786097
add custom metrics config
okdas Oct 4, 2024
83f1b7d
break proofs by app and supplier
okdas Oct 4, 2024
10eb2be
--wip-- [skip ci]
okdas Oct 5, 2024
d28f514
Merge remote-tracking branch 'origin/main' into add-metrics
okdas Oct 7, 2024
cdf62fc
self-review
okdas Oct 7, 2024
6628c07
self-review
okdas Oct 7, 2024
7ab27ac
Merge branch 'main' into add-metrics
okdas Oct 21, 2024
72b3086
fix after merge
okdas Oct 22, 2024
aa61ace
self-review pass
okdas Oct 22, 2024
eb96ae6
Merge branch 'main' into add-metrics
okdas Oct 24, 2024
2cd91de
change retention time on localnet
okdas Oct 24, 2024
d966241
Merge branch 'main' into add-metrics
Olshansk Oct 24, 2024
ce77cc7
add psql datasource to grafana
okdas Oct 25, 2024
e5cc4f6
Merge remote-tracking branch 'origin/main' into add-metrics
okdas Oct 28, 2024
4a0b12b
localnet_up after merge
okdas Oct 28, 2024
497e0e3
Merge remote-tracking branch 'origin/add-metrics' into use-pocketdex-…
okdas Oct 28, 2024
814d0a2
more dashboards
okdas Oct 30, 2024
cd438dc
Merge remote-tracking branch 'origin/main' into add-metrics
okdas Oct 30, 2024
54d6c0f
Update Tiltfile
okdas Oct 30, 2024
dd5276d
address the feedback
okdas Oct 30, 2024
fc53e96
address feedback
okdas Oct 30, 2024
a247a8e
clarify comments
okdas Oct 30, 2024
5f693e5
clarify comments
okdas Oct 30, 2024
6cac26c
Merge remote-tracking branch 'origin/main' into add-metrics
okdas Oct 30, 2024
a13c2ec
address feedback
okdas Oct 30, 2024
6cd43f0
fix the cycle
okdas Oct 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Tiltfile
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ if localnet_config["observability"]["enabled"]:
helm_repo("prometheus-community", "https://prometheus-community.github.io/helm-charts")
helm_repo("grafana-helm-repo", "https://grafana.github.io/helm-charts")

# Increase timeout for building the image; default is 30, which can be too low for slow internet connections to pull
# Timeout is increased to 120 seconds (default is 30) because a slow internet connection
# could time out while pulling
# container images.
update_settings(k8s_upsert_timeout_secs=120)

Expand Down
31 changes: 17 additions & 14 deletions cmd/poktrolld/cmd/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,16 @@ import (

var once sync.Once

// PoktrollAdditionalConfig represents a poktroll-specific part of `app.toml` file.
// See the `customAppConfigTemplate()` for additional information about each setting.
type PoktrollAdditionalConfig struct {
// PoktrollAppConfig represents a poktroll-specific part of `app.toml` file.
// Check out `customPoktrollAppConfigTemplate()` for additional information about each setting.
type PoktrollAppConfig struct {
Telemetry telemetry.PoktrollTelemetryConfig `mapstructure:"telemetry"`
}

// poktrollAdditionalConfigDefaults sets default values to render in `app.toml`.
func poktrollAdditionalConfigDefaults() PoktrollAdditionalConfig {
return PoktrollAdditionalConfig{
// poktrollAppConfigDefaults sets default values to render in `app.toml`.
// Check out `customPoktrollAppConfigTemplate()` for additional information about each setting.
func poktrollAppConfigDefaults() PoktrollAppConfig {
return PoktrollAppConfig{
Telemetry: telemetry.PoktrollTelemetryConfig{
CardinalityLevel: "medium",
Olshansk marked this conversation as resolved.
Show resolved Hide resolved
},
Expand Down Expand Up @@ -106,7 +107,7 @@ func initAppConfig() (string, interface{}) {
// The following code snippet is just for reference.
type CustomAppConfig struct {
serverconfig.Config `mapstructure:",squash"`
Poktroll PoktrollAdditionalConfig `mapstructure:"poktroll"`
Poktroll PoktrollAppConfig `mapstructure:"poktroll"`
}

// Optionally allow the chain developer to overwrite the SDK's default
Expand All @@ -130,7 +131,8 @@ func initAppConfig() (string, interface{}) {
srvCfg.MinGasPrices = "0.000000001upokt" // Also adjust ignite's `config.yml`.
srvCfg.Mempool.MaxTxs = 10000
srvCfg.Telemetry.Enabled = true
// Positive value turns on prometheus support. Prometheus metrics are removed from the exporter when retention time is reached.
// Positive non-zero value turns on Prometheus support.
// Prometheus metrics are removed from the exporter when retention time is reached.
srvCfg.Telemetry.PrometheusRetentionTime = 60 * 60 * 24 // in seconds.
srvCfg.Telemetry.MetricsSink = "mem"
srvCfg.Pruning = "nothing" // archiving node by default
Expand All @@ -140,21 +142,22 @@ func initAppConfig() (string, interface{}) {

customAppConfig := CustomAppConfig{
Config: *srvCfg,
Poktroll: poktrollAdditionalConfigDefaults(),
Poktroll: poktrollAppConfigDefaults(),
}

return customAppConfigTemplate(), customAppConfig
return customPoktrollAppConfigTemplate(), customAppConfig
}

// customAppConfigTemplate extends the default configuration `app.toml` file with our own configs. They are going to be
// used on validators and full-nodes, and they render using default values from `poktrollAdditionalConfigDefaults()`.
func customAppConfigTemplate() string {
// customPoktrollAppConfigTemplate extends the default configuration `app.toml` file with our own configs.
// They are going to be used by validators and full-nodes.
// These configs are rendered using default values from `poktrollAppConfigDefaults()`.
func customPoktrollAppConfigTemplate() string {
return serverconfig.DefaultConfigTemplate + `
###############################################################################
### Poktroll ###
###############################################################################

# Poktroll-specific configuration for Full Nodes and Validators.
# Poktroll-specific app configuration for Full Nodes and Validators.
[poktroll]

# Telemetry configuration in addition to the [telemetry] settings.
Expand Down
11 changes: 6 additions & 5 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ validators:
enabled: true
poktroll:
telemetry:
# "high" produces a lot of timeseries. Only suitable for small networks such as LocalNet.
# "high" produces a lot of timeseries.
# ONLY suitable for small networks such as LocalNet.
cardinality-level: high
config:
moniker: "validator1"
Expand Down Expand Up @@ -148,22 +149,22 @@ genesis:
# Application module
- address: pokt1rl3gjgzexmplmds3tq3r3yk84zlwdl6djzgsvm
coins:
- amount: "100000068" # Equals to the total of all app stakes below
- amount: "100000068" # MUST BE equal to the total of all app stakes below
denom: upokt
# Supplier module
- address: pokt1j40dzzmn6cn9kxku7a5tjnud6hv37vesr5ccaa
coins:
- amount: "1000068" # Equals to the total of all supplier stakes below
- amount: "1000068" # MUST BE equal to the total of all supplier stakes below
denom: upokt
# Gateway module
- address: pokt1f6j7u6875p2cvyrgjr0d2uecyzah0kget9vlpl
coins:
- amount: "1000068" # Equals to the total of all gateway stakes below
- amount: "1000068" # MUST BE equal to the total of all gateway stakes below
denom: upokt
# Service module
- address: pokt1nhmtqf4gcmpxu0p6e53hpgtwj0llmsqpxtumcf
coins:
- amount: "1000000000" # Equals to one add_service_fee below
- amount: "1000000000" # MUST BE equal to one add_service_fee below
denom: upokt
application:
params:
Expand Down
12 changes: 6 additions & 6 deletions localnet/grafana-dashboards/cosmos_sdk_insights.json
Original file line number Diff line number Diff line change
Expand Up @@ -787,9 +787,9 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(increase(poktroll_relays{job=\"$job\",service_id=~\"$service_id\"}[1m])) by (service_id, proof_stage)",
"expr": "sum(increase(poktroll_relays{job=\"$job\",service_id=~\"$service_id\"}[1m])) by (service_id, claim_proof_stage)",
"instant": false,
"legendFormat": "{{service_id}}-{{proof_stage}}",
"legendFormat": "{{service_id}}-{{claim_proof_stage}}",
"range": true,
"refId": "A"
}
Expand Down Expand Up @@ -884,9 +884,9 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(increase(poktroll_compute_units{job=\"$job\",service_id=~\"$service_id\"}[1m])) by (service_id, proof_stage)",
"expr": "sum(increase(poktroll_compute_units{job=\"$job\",service_id=~\"$service_id\"}[1m])) by (service_id, claim_proof_stage)",
"instant": false,
"legendFormat": "{{service_id}}-{{proof_stage}}",
"legendFormat": "{{service_id}}-{{claim_proof_stage}}",
"range": true,
"refId": "A"
}
Expand Down Expand Up @@ -1227,9 +1227,9 @@
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(increase(poktroll_claims{job=\"$job\",service_id=~\"$service_id\"}[1m])) by (service_id, proof_stage)",
"expr": "sum(increase(poktroll_claims{job=\"$job\",service_id=~\"$service_id\"}[1m])) by (service_id, claim_proof_stage)",
"instant": false,
"legendFormat": "{{service_id}}-{{proof_stage}}",
"legendFormat": "{{service_id}}-{{claim_proof_stage}}",
"range": true,
"refId": "A"
}
Expand Down
4 changes: 2 additions & 2 deletions localnet/kubernetes/observability-prometheus-stack.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ grafana:
jsonData:
sslmode: disable
postgresVersion: 1604 # Adjust to match your PostgreSQL version
# timescaledb: false # Set to true if you are using TimescaleDB
schema: localnet # Specify your schema here
# timescaledb: false # Set to true if you are using TimescaleDB
schema: localnet # Specify your postgres schema here

prometheus:
prometheusSpec:
Expand Down
5 changes: 4 additions & 1 deletion pkg/relayer/proxy/synchronous.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,10 @@ func (sync *synchronousRPCServer) ServeHTTP(writer http.ResponseWriter, request
}

// Increment the relays counter.
relaysTotal.With("service_id", supplierServiceId, "supplier_operator_address", relayRequest.Meta.SupplierOperatorAddress).Add(1)
relaysTotal.With(
"service_id", supplierServiceId,
"supplier_operator_address", relayRequest.Meta.SupplierOperatorAddress,
).Add(1)
defer func() {
duration := time.Since(startTime).Seconds()

Expand Down
31 changes: 17 additions & 14 deletions telemetry/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ import (
"github.com/hashicorp/go-metrics"
)

// MetricNameKeys constructs the full metric name by prefixing with a defined
// prefix and appending any additional metrics provided as variadic arguments.
// MetricNameKeys prefixes metrics with `poktroll` for easy identification.
// E.g., `("hodlers", "regret_level")` yields `poktroll_hodlers_regret_level` — great for tracking FOMO as hodlers rethink choices.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lol

// Returns a slice of strings, as expected by `go-metrics`, the underlying metrics library.
func MetricNameKeys(metrics ...string) []string {
result := make([]string, 0, len(metrics)+1)
result = append(result, metricNamePrefix)
Expand All @@ -19,25 +20,27 @@ func isTelemetyEnabled() bool {
return cosmostelemetry.IsTelemetryEnabled()
}

// appendMediumCardinalityLabels only creates the label if cardinality if set to "medium".
// Good example of a medium cardinality label is `service_id` — we do not control the number of services
// on the network, and as permissionless services grow the metrics can get easily out of hand. We're keeping
// an option to turn off such labels.
// Medium cardinality labels are included when the cardinality is set to "high".
// Configuration option is exposed in app.toml, our own `poktroll.telemetry` section.
// appendMediumCardinalityLabels only creates the label if cardinality is set to "medium" or higher.
// A good example for a "medium" cardinality use-case is `service_id`:
// - This is a network wide parameter
// - It is dependent on the permissionless nature of the network and can grow unbounded
// - We're keeping an option to turn off such labels to avoid metric bloat
//
// Configuration option is exposed in app.toml under the `poktroll.telemetry` section.
func appendMediumCardinalityLabels(labels []metrics.Label, labelPairs ...metrics.Label) []metrics.Label {
	switch globalTelemetryConfig.CardinalityLevel {
	case "medium", "high":
		// Both levels opt in to medium-cardinality labels.
		return append(labels, labelPairs...)
	default:
		// Any other level leaves the label set untouched.
		return labels
	}
}

// appendHighCardinalityLabels only creates the label if cardinality if set to "high".
// Good examples of high cardinality labels are `application_address` or `supplier_address`.
// This setting, on a large network, will slow down both the full node and the metric scraping system.
// We want to have such labels exposed for local development, debugging and performance troubleshooring.
// More background on why this is important: https://www.robustperception.io/cardinality-is-key/
// Configuration option is exposed in app.toml, our own `poktroll.telemetry` section.
// appendHighCardinalityLabels only creates the label if cardinality is set to "high".
// A good example of high cardinality labels is `application_address` or `supplier_address`:
// - This setting, on a large network, will slow down both the full node and the metric scraping system.
// - These labels need to be exposed for local development, debugging and performance troubleshooting.
//
// Additional references on cardinality: https://www.robustperception.io/cardinality-is-key/
// Configuration option is exposed in app.toml under the `poktroll.telemetry` section.
func appendHighCardinalityLabels(labels []metrics.Label, labelPairs ...metrics.Label) []metrics.Label {
if globalTelemetryConfig.CardinalityLevel == "high" {
return append(labels, labelPairs...)
Expand Down
12 changes: 6 additions & 6 deletions telemetry/event_counters.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const (

// Label Names
applicationAddressLabelName = "app_addr"
supplierOperatorAddressLabelName = "supop_addr"
supplierOperatorAddressLabelName = "sup_op_addr"
)

// EventSuccessCounter increments a counter with the given data type and success status.
Expand Down Expand Up @@ -107,7 +107,7 @@ func ClaimComputeUnitsCounter(

incrementAmount := numComputeUnits
labels := []metrics.Label{
{Name: "proof_stage", Value: claimProofStage.String()},
{Name: "claim_proof_stage", Value: claimProofStage.String()},
}
labels = appendMediumCardinalityLabels(labels, toMetricLabel("service_id", serviceId))
labels = appendHighCardinalityLabels(
Expand Down Expand Up @@ -146,7 +146,7 @@ func ClaimRelaysCounter(

incrementAmount := numRelays
labels := []metrics.Label{
{Name: "proof_stage", Value: claimProofStage.String()},
{Name: "claim_proof_stage", Value: claimProofStage.String()},
}
labels = appendMediumCardinalityLabels(labels, toMetricLabel("service_id", serviceId))
labels = appendHighCardinalityLabels(
Expand Down Expand Up @@ -184,7 +184,7 @@ func ClaimCounter(

incrementAmount := numClaims
labels := []metrics.Label{
{Name: "proof_stage", Value: claimProofStage.String()},
{Name: "claim_proof_stage", Value: claimProofStage.String()},
}

labels = appendMediumCardinalityLabels(labels, toMetricLabel("service_id", serviceId))
Expand Down Expand Up @@ -244,7 +244,7 @@ func RelayEMAGauge(relayEMA uint64, serviceId string) {
// SessionSuppliersGauge sets a gauge which tracks the number of candidates available
// for session suppliers at the given maxPerSession value.
// The serviceId is used as a label to be able to track this information for each service.
func SessionSuppliersGauge(candidates int, maxPerSession int, serviceId string) {
func SessionSuppliersGauge(numCandidates int, maxPerSession int, serviceId string) {
if !isTelemetyEnabled() {
return
}
Expand All @@ -259,7 +259,7 @@ func SessionSuppliersGauge(candidates int, maxPerSession int, serviceId string)

telemetry.SetGaugeWithLabels(
MetricNameKeys("session", "suppliers"),
float32(candidates),
float32(numCandidates),
labels,
)
}
9 changes: 5 additions & 4 deletions telemetry/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,19 @@ import (
"github.com/mitchellh/mapstructure"
)

// globalTelemetryConfig is a private variable that stores cardinality level.
// It is set on initialization and does not change for the lifetime of the program.
// globalTelemetryConfig stores poktroll specific telemetry configurations.
// Set once on initialization and remains constant during runtime.
var globalTelemetryConfig PoktrollTelemetryConfig

// PoktrollTelemetryConfig represents the telemetry portion of the custom poktroll config.
// PoktrollTelemetryConfig represents the telemetry portion of the custom poktroll config section in `app.toml`.
type PoktrollTelemetryConfig struct {
CardinalityLevel string `mapstructure:"cardinality-level"`
}

// New sets the globalTelemetryConfig for telemetry package.
func New(appOpts servertypes.AppOptions) error {
// Extract the map from appOpts. `poktroll.telemetry` comes from `app.toml` which is parsed into a map.
// Extract the map from appOpts.
// `poktroll.telemetry` comes from `app.toml` which is parsed into a map.
telemetryMap := appOpts.Get("poktroll.telemetry").(map[string]interface{})

// Use mapstructure to decode the map into the struct
Expand Down
5 changes: 4 additions & 1 deletion telemetry/tokens.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ import (
"github.com/hashicorp/go-metrics"
)

okdas marked this conversation as resolved.
Show resolved Hide resolved
// TODO_TECHDEBT: Minted, burned and shashed tokens values might not be accurate or be inflated.
// TODO_MAINNET(@bryanchriswhite): Revisit how telemetry is managed under `x/tokenomics` to ensure that it
// complies with the new hardened settlement approach.

// TODO_MAINNET(@red-0ne, #897): Minted, burnt and slashed tokens values might not be completely accurate.
// While we're keeping this metric for now consider removing in favor of utilizing the `cosmos-exporter` which uses on-chain data.
// Context: https://github.com/cosmos/cosmos-sdk/issues/21614, https://github.com/pokt-network/poktroll/pull/832

Expand Down
2 changes: 1 addition & 1 deletion x/application/module/abci.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (

// EndBlocker is called every block and handles application related updates.
func EndBlocker(ctx sdk.Context, k keeper.Keeper) error {
// Telemetry: measure execution time like standard cosmos-sdk modules do that.
// Telemetry: measure the end-block execution time following standard cosmos-sdk practices.
defer cosmostelemetry.ModuleMeasureSince(types.ModuleName, cosmostelemetry.Now(), cosmostelemetry.MetricKeyEndBlocker)

if err := k.EndBlockerAutoUndelegateFromUnstakedGateways(ctx); err != nil {
Expand Down
32 changes: 19 additions & 13 deletions x/proof/keeper/msg_server_create_claim.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/pokt-network/poktroll/telemetry"
"github.com/pokt-network/poktroll/x/proof/types"
sessiontypes "github.com/pokt-network/poktroll/x/session/types"
sharedtypes "github.com/pokt-network/poktroll/x/shared/types"
)

Expand Down Expand Up @@ -42,19 +43,8 @@ func (k msgServer) CreateClaim(
return nil, status.Error(codes.InvalidArgument, err.Error())
}

// Defer telemetry calls so that they reference the final values the relevant variables.
defer func() {
// Only increment these metrics counters if handling a new claim.
if !isExistingClaim {
serviceId := session.Header.ServiceId
applicationAddress := session.Header.ApplicationAddress
supplierOperatorAddress := msg.GetSupplierOperatorAddress()

telemetry.ClaimCounter(types.ClaimProofStage_CLAIMED, 1, serviceId, applicationAddress, supplierOperatorAddress, err)
telemetry.ClaimRelaysCounter(types.ClaimProofStage_CLAIMED, numRelays, serviceId, applicationAddress, supplierOperatorAddress, err)
telemetry.ClaimComputeUnitsCounter(types.ClaimProofStage_CLAIMED, numClaimComputeUnits, serviceId, applicationAddress, supplierOperatorAddress, err)
}
}()
// Defer telemetry calls to a helper function to keep business logic clean.
defer k.finalizeCreateClaimTelemetry(session, msg, isExistingClaim, numRelays, numClaimComputeUnits, err)

// Construct and insert claim
claim = types.Claim{
Expand Down Expand Up @@ -158,3 +148,19 @@ func (k msgServer) CreateClaim(
Claim: &claim,
}, nil
}

// finalizeCreateClaimTelemetry records the CLAIMED-stage telemetry counters
// (claims, relays and compute units) for a claim that was just handled.
// It does nothing when isExistingClaim is true; otherwise it reports one claim,
// numRelays relays and numClaimComputeUnits compute units, forwarding err to
// every counter unchanged.
//
// Meant to run deferred. NOTE: Go evaluates the arguments of a deferred call
// when the `defer` statement itself executes, so the values seen here are the
// ones the caller held at that point.
func (k msgServer) finalizeCreateClaimTelemetry(
	session *sessiontypes.Session,
	msg *types.MsgCreateClaim,
	isExistingClaim bool,
	numRelays, numClaimComputeUnits uint64,
	err error,
) {
	// Only new claims are counted; nothing to record otherwise.
	if isExistingClaim {
		return
	}

	// Labels shared by all three counters.
	svcId := session.Header.ServiceId
	appAddr := session.Header.ApplicationAddress
	supplierOperatorAddr := msg.GetSupplierOperatorAddress()

	telemetry.ClaimCounter(
		types.ClaimProofStage_CLAIMED, 1,
		svcId, appAddr, supplierOperatorAddr, err,
	)
	telemetry.ClaimRelaysCounter(
		types.ClaimProofStage_CLAIMED, numRelays,
		svcId, appAddr, supplierOperatorAddr, err,
	)
	telemetry.ClaimComputeUnitsCounter(
		types.ClaimProofStage_CLAIMED, numClaimComputeUnits,
		svcId, appAddr, supplierOperatorAddr, err,
	)
}
Loading
Loading