Skip to content

Commit

Permalink
Merge pull request #2884 from oasislabs/matevz/feature/new-metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
matevz authored May 19, 2020
2 parents 6d87530 + 5fde5ef commit ae51f4d
Show file tree
Hide file tree
Showing 12 changed files with 174 additions and 12 deletions.
7 changes: 4 additions & 3 deletions .changelog/2602.doc.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
Document all Prometheus metrics produced by `oasis-node`

List of metrics with description and file location is available in
List of metrics including the description, metric type, and location in the
source is now available in
[docs/oasis-node/metrics.md](../docs/oasis-node/metrics.md) Markdown file. To
automate generation of this list, new `go/extra/extract-metric` tool was
automate generation of this list, a new `go/extra/extract-metrics` tool was
introduced. To update the list of metrics, execute `make update-docs` in the
project root.
project root. Documentation needs to be up to date for the `lint` rule to
succeed.
12 changes: 12 additions & 0 deletions .changelog/2842.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Add new consensus-related Prometheus metrics

Four new metrics have been added:

- `oasis_worker_epoch_number` is the current epoch number as seen by the
worker.
- `oasis_worker_node_registered` is a binary metric which denotes whether the
node is registered.
- `oasis_consensus_proposed_blocks` is the number of Tendermint blocks
proposed by the node.
- `oasis_consensus_signed_blocks` is the number of Tendermint blocks the node
voted for.
6 changes: 6 additions & 0 deletions .github/workflows/ci-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ jobs:
make lint-changelog
# Always run this step so that all linting errors can be seen at once.
if: always()
- name: Check documentation synchronized with source code
run: |
pushd go/extra/extract-metrics && go build && popd
make lint-docs
# Always run this step so that all linting errors can be seen at once.
if: always()
- name: Check go mod tidy
# NOTE: go mod tidy doesn't implement a check mode yet.
# For more details, see: https://github.com/golang/go/issues/27005.
Expand Down
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ build-helpers: build-helpers-go
build-go-generate:
@$(MAKE) -C go generate

# Generate source Markdown documentation.
# Synchronize source Markdown documentation.
update-docs: build-go
@$(MAKE) -C docs update

Expand All @@ -65,7 +65,7 @@ fmt-go:
fmt: $(fmt-targets)

# Lint code, commits and documentation.
lint-targets := lint-go lint-git lint-md lint-changelog
lint-targets := lint-go lint-git lint-md lint-changelog lint-docs

lint-go:
@$(MAKE) -C go lint
Expand All @@ -91,6 +91,10 @@ lint-changelog:
done; \
exit $$exit_status

# Check whether docs are synced with source code.
lint-docs:
@$(MAKE) -C docs check

lint: $(lint-targets)

# Test.
Expand Down
9 changes: 7 additions & 2 deletions docs/Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
all:

check:
@# Check whether the generated docs are up to date.
@../go/extra/extract-metrics/extract-metrics --codebase.path ../go/ --markdown --markdown.template.file oasis-node/metrics.md.tpl | diff oasis-node/metrics.md -

update:
# Generate oasis-node/metrics.md.
../go/extra/extract-metrics/extract-metrics --codebase.path ../go/ --markdown --markdown.template.file oasis-node/metrics.md.tpl > oasis-node/metrics.md
@# Generate oasis-node/metrics.md.
@../go/extra/extract-metrics/extract-metrics --codebase.path ../go/ --markdown --markdown.template.file oasis-node/metrics.md.tpl > oasis-node/metrics.md

.PHONY:
all
check
update
4 changes: 4 additions & 0 deletions docs/oasis-node/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ Name | Type | Description | Package
-----|------|-------------|--------
oasis_abci_db_size | Gauge | Total size of the ABCI database (MiB). | [consensus/tendermint/abci](../../go/consensus/tendermint/abci/mux.go#L48)
oasis_codec_size | Summary | CBOR codec message size (bytes). | [common/cbor](../../go/common/cbor/codec.go#L19)
oasis_consensus_proposed_blocks | Counter | Number of blocks proposed by the node. | [consensus/metrics](../../go/consensus/metrics/metrics.go#L17)
oasis_consensus_signed_blocks | Counter | Number of blocks signed by the node. | [consensus/metrics](../../go/consensus/metrics/metrics.go#L10)
oasis_finalized_rounds | Counter | Number of finalized rounds. | [roothash](../../go/roothash/metrics.go#L15)
oasis_grpc_calls | Counter | Number of gRPC calls. | [common/grpc](../../go/common/grpc/grpc.go#L46)
oasis_grpc_latency | Summary | gRPC call latency (seconds). | [common/grpc](../../go/common/grpc/grpc.go#L53)
Expand Down Expand Up @@ -88,11 +90,13 @@ oasis_worker_batch_processing_time | Summary | Time it takes for a batch to fina
oasis_worker_batch_read_time | Summary | Time it takes to read a batch from storage (seconds). | [worker/compute/executor/committee](../../go/worker/compute/executor/committee/node.go#L71)
oasis_worker_batch_runtime_processing_time | Summary | Time it takes for a batch to be processed by the runtime (seconds). | [worker/compute/executor/committee](../../go/worker/compute/executor/committee/node.go#L85)
oasis_worker_batch_size | Summary | Number of transactions in a batch. | [worker/compute/executor/committee](../../go/worker/compute/executor/committee/node.go#L92)
oasis_worker_epoch_number | Gauge | Current epoch number as seen by the worker. | [worker/common/committee](../../go/worker/common/committee/node.go#L52)
oasis_worker_epoch_transition_count | Counter | Number of epoch transitions. | [worker/common/committee](../../go/worker/common/committee/node.go#L45)
oasis_worker_execution_discrepancy_detected_count | Counter | Number of detected execute discrepancies. | [worker/compute/executor/committee](../../go/worker/compute/executor/committee/node.go#L50)
oasis_worker_failed_round_count | Counter | Number of failed roothash rounds. | [worker/common/committee](../../go/worker/common/committee/node.go#L38)
oasis_worker_inconsistent_merge_root_count | Counter | Number of inconsistent merge roots. | [worker/compute/merge/committee](../../go/worker/compute/merge/committee/node.go#L60)
oasis_worker_merge_discrepancy_detected_count | Counter | Number of detected merge discrepancies. | [worker/compute/merge/committee](../../go/worker/compute/merge/committee/node.go#L39)
oasis_worker_node_registered | Gauge | Is oasis node registered (binary). | [worker/registration](../../go/worker/registration/worker.go#L58)
oasis_worker_processed_block_count | Counter | Number of processed roothash blocks. | [worker/common/committee](../../go/worker/common/committee/node.go#L24)
oasis_worker_processed_event_count | Counter | Number of processed roothash events. | [worker/common/committee](../../go/worker/common/committee/node.go#L31)
oasis_worker_roothash_merge_commit_latency | Summary | Latency of roothash merge commit (seconds). | [worker/compute/merge/committee](../../go/worker/compute/merge/committee/node.go#L46)
Expand Down
37 changes: 37 additions & 0 deletions go/consensus/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package metrics

import (
"sync"

"github.com/prometheus/client_golang/prometheus"
)

var (
	// SignedBlocks counts the blocks signed by the node, labeled by the
	// consensus backend that reported them.
	SignedBlocks = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "oasis_consensus_signed_blocks",
			Help: "Number of blocks signed by the node.",
		},
		[]string{"backend"},
	)
	// ProposedBlocks counts the blocks proposed by the node, labeled by the
	// consensus backend that reported them.
	ProposedBlocks = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "oasis_consensus_proposed_blocks",
			Help: "Number of blocks proposed by the node.",
		},
		[]string{"backend"},
	)

	// consensusCollectors lists every collector this package registers with
	// Prometheus from its init function.
	consensusCollectors = []prometheus.Collector{
		SignedBlocks,
		ProposedBlocks,
	}

	// metricsOnce guards the registration of consensusCollectors.
	metricsOnce sync.Once
)

func init() {
metricsOnce.Do(func() {
prometheus.MustRegister(consensusCollectors...)
})
}
57 changes: 57 additions & 0 deletions go/consensus/tendermint/tendermint.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package tendermint

import (
"bytes"
"context"
"encoding/json"
"fmt"
Expand All @@ -12,6 +13,7 @@ import (
"sync/atomic"
"time"

"github.com/prometheus/client_golang/prometheus"
flag "github.com/spf13/pflag"
"github.com/spf13/viper"
tmabcitypes "github.com/tendermint/tendermint/abci/types"
Expand Down Expand Up @@ -42,6 +44,7 @@ import (
"github.com/oasislabs/oasis-core/go/common/version"
consensusAPI "github.com/oasislabs/oasis-core/go/consensus/api"
"github.com/oasislabs/oasis-core/go/consensus/api/transaction"
"github.com/oasislabs/oasis-core/go/consensus/metrics"
"github.com/oasislabs/oasis-core/go/consensus/tendermint/abci"
"github.com/oasislabs/oasis-core/go/consensus/tendermint/api"
tmbeacon "github.com/oasislabs/oasis-core/go/consensus/tendermint/beacon"
Expand All @@ -60,6 +63,7 @@ import (
keymanagerAPI "github.com/oasislabs/oasis-core/go/keymanager/api"
cmbackground "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/background"
cmflags "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/flags"
cmmetrics "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/metrics"
"github.com/oasislabs/oasis-core/go/registry"
registryAPI "github.com/oasislabs/oasis-core/go/registry/api"
"github.com/oasislabs/oasis-core/go/roothash"
Expand Down Expand Up @@ -139,6 +143,8 @@ const (
var (
_ service.TendermintService = (*tendermintService)(nil)

labelTendermint = prometheus.Labels{"backend": "tendermint"}

// Flags has the configuration flags.
Flags = flag.NewFlagSet("", flag.ContinueOnError)
)
Expand Down Expand Up @@ -258,6 +264,9 @@ func (t *tendermintService) Start() error {
}
go t.syncWorker()
go t.worker()
if viper.GetString(cmmetrics.CfgMetricsMode) != cmmetrics.MetricsModeNone {
go t.metrics()
}
case false:
close(t.syncedCh)
}
Expand Down Expand Up @@ -1316,6 +1325,54 @@ func (t *tendermintService) worker() {
}
}

// metrics updates the oasis_consensus metrics by inspecting every newly
// accepted block.
//
// It subscribes to Tendermint new block events and, for each block:
//   - increments ProposedBlocks if the block's proposer address matches this
//     node's consensus address;
//   - increments SignedBlocks if this node's address appears among the
//     present, non-nil signatures of the block's last commit.
//
// The method blocks until the node quits or the subscription is cancelled, so
// it is meant to be run in its own goroutine.
func (t *tendermintService) metrics() {
	sub, err := t.Subscribe("tendermint/metrics", tmtypes.EventQueryNewBlock)
	if err != nil {
		t.Logger.Error("metrics: failed to subscribe to new block events",
			"err", err,
		)
		return
	}
	defer t.Unsubscribe("tendermint/metrics", tmtypes.EventQueryNewBlock) // nolint:errcheck

	// Tendermint uses specific public key encoding.
	pubKey := t.consensusSigner.Public()
	myAddr := []byte(crypto.PublicKeyToTendermint(&pubKey).Address())
	for {
		var blk *tmtypes.Block
		select {
		case <-t.node.Quit():
			return
		case <-sub.Cancelled():
			return
		case v := <-sub.Out():
			// Never let an unexpected payload panic the metrics goroutine;
			// metrics are best-effort, so skip the event instead.
			ev, ok := v.Data().(tmtypes.EventDataNewBlock)
			if !ok || ev.Block == nil {
				t.Logger.Error("metrics: unexpected new block event payload",
					"type", fmt.Sprintf("%T", v.Data()),
				)
				continue
			}
			blk = ev.Block
		}

		// Was block proposed by our node.
		if bytes.Equal(myAddr, blk.ProposerAddress) {
			metrics.ProposedBlocks.With(labelTendermint).Inc()
		}

		// Was block voted for by our node. Ignore if there was no previous block.
		if blk.LastCommit == nil {
			continue
		}
		for _, sig := range blk.LastCommit.Signatures {
			if sig.Absent() || sig.BlockIDFlag == tmtypes.BlockIDFlagNil {
				// Vote is missing, ignore.
				continue
			}

			if bytes.Equal(myAddr, sig.ValidatorAddress) {
				// Count each block at most once.
				metrics.SignedBlocks.With(labelTendermint).Inc()
				break
			}
		}
	}
}

// New creates a new Tendermint service.
func New(ctx context.Context, dataDir string, identity *identity.Identity, upgrader upgradeAPI.Backend, genesisProvider genesisAPI.Provider) (service.TendermintService, error) {
// Retrieve the genesis document early so that it is possible to
Expand Down
4 changes: 2 additions & 2 deletions go/oasis-test-runner/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ func initRootEnv(cmd *cobra.Command) (*env.Env, error) {
func runRoot(cmd *cobra.Command, args []string) error {
cmd.SilenceUsage = true

if viper.GetString(metrics.CfgMetricsAddr) != "" {
if viper.IsSet(metrics.CfgMetricsAddr) {
oasisTestRunnerOnce.Do(func() {
prometheus.MustRegister(oasisTestRunnerCollectors...)
})
Expand Down Expand Up @@ -360,7 +360,7 @@ func runRoot(cmd *cobra.Command, args []string) error {
}

// Init per-run prometheus pusher, if metrics are enabled.
if viper.GetString(metrics.CfgMetricsAddr) != "" {
if viper.IsSet(metrics.CfgMetricsAddr) {
pusher = push.New(viper.GetString(metrics.CfgMetricsAddr), metrics.MetricsJobTestRunner)
labels := metrics.GetDefaultPushLabels(childEnv.TestInfo())
for k, v := range labels {
Expand Down
4 changes: 2 additions & 2 deletions go/registry/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ var (
Help: "Number of registry runtimes.",
},
)
registeryCollectors = []prometheus.Collector{
registryCollectors = []prometheus.Collector{
registryNodes,
registryEntities,
registryRuntimes,
Expand Down Expand Up @@ -105,7 +105,7 @@ func (m *MetricsUpdater) updatePeriodicMetrics(ctx context.Context) {
// NewMetricsUpdater creates a new registry metrics updater.
func NewMetricsUpdater(ctx context.Context, backend api.Backend) *MetricsUpdater {
metricsOnce.Do(func() {
prometheus.MustRegister(registeryCollectors...)
prometheus.MustRegister(registryCollectors...)
})

m := &MetricsUpdater{
Expand Down
10 changes: 10 additions & 0 deletions go/worker/common/committee/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,20 @@ var (
},
[]string{"runtime"},
)
epochNumber = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "oasis_worker_epoch_number",
Help: "Current epoch number as seen by the worker.",
},
[]string{"runtime"},
)

nodeCollectors = []prometheus.Collector{
processedBlockCount,
processedEventCount,
failedRoundCount,
epochTransitionCount,
epochNumber,
}

metricsOnce sync.Once
Expand Down Expand Up @@ -175,6 +184,7 @@ func (n *Node) handleEpochTransitionLocked(height int64) {
}

epoch := n.Group.GetEpochSnapshot()
epochNumber.With(n.getMetricLabels()).Set(float64(epoch.epochNumber))
for _, hooks := range n.hooks {
hooks.HandleEpochTransitionLocked(epoch)
}
Expand Down
28 changes: 27 additions & 1 deletion go/worker/registration/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

"github.com/cenkalti/backoff/v4"
"github.com/prometheus/client_golang/prometheus"
flag "github.com/spf13/pflag"
"github.com/spf13/viper"

Expand Down Expand Up @@ -53,6 +54,19 @@ var (
Flags = flag.NewFlagSet("", flag.ContinueOnError)

allowUnroutableAddresses bool

workerNodeRegistered = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "oasis_worker_node_registered",
Help: "Is oasis node registered (binary).",
},
)

nodeCollectors = []prometheus.Collector{
workerNodeRegistered,
}

metricsOnce sync.Once
)

// RegisterNodeHook is a function that is used to update the node descriptor.
Expand Down Expand Up @@ -240,7 +254,14 @@ func (w *Worker) registrationLoop() { // nolint: gocyclo
default:
}

return w.registerNode(epoch, hook)
err := w.registerNode(epoch, hook)
switch err {
case nil:
workerNodeRegistered.Set(1.0)
default:
workerNodeRegistered.Set(0.0)
}
return err
}, off)
}

Expand Down Expand Up @@ -358,6 +379,7 @@ Loop:

func (w *Worker) doNodeRegistration() {
defer close(w.quitCh)
defer workerNodeRegistered.Set(0.0)

if !w.storedDeregister {
w.registrationLoop()
Expand Down Expand Up @@ -949,6 +971,10 @@ func (w *Worker) Cleanup() {
}

func init() {
metricsOnce.Do(func() {
prometheus.MustRegister(nodeCollectors...)
})

Flags.String(CfgRegistrationEntity, "", "entity to use as the node owner in registrations")
Flags.String(CfgDebugRegistrationPrivateKey, "", "private key to use to sign node registrations")
Flags.Bool(CfgRegistrationForceRegister, false, "override a previously saved deregistration request")
Expand Down

0 comments on commit ae51f4d

Please sign in to comment.