Skip to content

Commit

Permalink
feat: configure prometheus agent sharding per cluster (#1610)
Browse files Browse the repository at this point in the history
* feat: configure prometheus agent sharding per cluster

* Update CHANGELOG.md

Co-authored-by: Hervé Nicol <[email protected]>

* Document sharding overrides/

* Update README.md

Co-authored-by: Hervé Nicol <[email protected]>

* Update README.md

Co-authored-by: Hervé Nicol <[email protected]>

---------

Co-authored-by: Hervé Nicol <[email protected]>
  • Loading branch information
QuentinBisson and hervenicol authored May 13, 2024
1 parent efb976b commit 4841448
Show file tree
Hide file tree
Showing 12 changed files with 125 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add `cluster_control_plane_unhealthy` inhibition.
- Allow Prometheus Agent Sharding strategy to be overridden per cluster.

### Removed

Expand Down
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,28 @@ The retention size of prometheis will be set according to the volume size: we ap
* `large` (200 Gi) => retentionSize = 180Gi

Check [Prometheus Volume Sizing](https://docs.giantswarm.io/getting-started/observability/monitoring/prometheus/volume-size/) for more details.

# Prometheus Agent Sharding

Prometheus Meta Operator configures the Prometheus Agent instances running in workload clusters (pre-mimir setup cf. observability-operator).

To be able to ingest metrics without disrupting the workload running in the clusters, Prometheus Meta Operator can shard the number of running Prometheus Agents.

The default configuration is defined in PMO itself: PMO adds a new shard every 1M time series present in the WC prometheus running on the management cluster. To avoid scaling down too abruptly, we defined a scale down threshold of 20%.

As this default value was not enough to avoid workload disruptions, we added 2 ways to be able to override the scale up series count target and the scale down percentage.

1. Those values can be configured at the installation level by overriding the following values:

```yaml
prometheusAgent:
shardScaleUpSeriesCount: 1000000
shardScaleDownPercentage: 0.20
```
2. Those values can also be set per cluster using the following cluster annotations:
```yaml
monitoring.giantswarm.io/prometheus-agent-scale-up-series-count: 1000000
monitoring.giantswarm.io/prometheus-agent-scale-down-percentage: 0.20
```
46 changes: 46 additions & 0 deletions pkg/prometheus/agent/sharding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package agent

import "math"

type ShardingStrategy struct {
// Configures the number of series needed to add a new shard. Computation is number of series / ScaleUpSeriesCount
ScaleUpSeriesCount float64
// Percentage of needed series based on ScaleUpSeriesCount to scale down agents
ScaleDownPercentage float64
}

func (pass1 ShardingStrategy) Merge(pass2 *ShardingStrategy) ShardingStrategy {
strategy := ShardingStrategy{
pass1.ScaleUpSeriesCount,
pass1.ScaleDownPercentage,
}
if pass2 != nil {
if pass2.ScaleUpSeriesCount > 0 {
strategy.ScaleUpSeriesCount = pass2.ScaleUpSeriesCount
}
if pass2.ScaleDownPercentage > 0 {
strategy.ScaleDownPercentage = pass2.ScaleDownPercentage
}
}
return strategy
}

// We want to start with 1 prometheus-agent for each 1M time series with a scale down 20% threshold.
func (pass ShardingStrategy) ComputeShards(currentShardCount int, timeSeries float64) int {
shardScaleDownThreshold := pass.ScaleDownPercentage * pass.ScaleUpSeriesCount
desiredShardCount := int(math.Ceil(timeSeries / pass.ScaleUpSeriesCount))

// Compute Scale Down
if currentShardCount > desiredShardCount {
// We get the rest of a division of timeSeries by shardStep and we compare it with the scale down threshold
if math.Mod(timeSeries, pass.ScaleUpSeriesCount) > pass.ScaleUpSeriesCount-shardScaleDownThreshold {
desiredShardCount = currentShardCount
}
}

// We always have a minimum of 1 agent, even if there is no worker node
if desiredShardCount <= 0 {
return 1
}
return desiredShardCount
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package remotewriteconfig
package agent

import (
"flag"
Expand All @@ -8,7 +8,7 @@ import (
var _ = flag.Bool("update", false, "update the output file")

func TestShardComputationScaleUp(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)}

expected := 1
result := pass.ComputeShards(0, float64(1_000_000))
Expand All @@ -30,7 +30,7 @@ func TestShardComputationScaleUp(t *testing.T) {
}

func TestShardComputationReturnsAtLeast1Shart(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)}

expected := 1
result := pass.ComputeShards(0, 0)
Expand All @@ -46,7 +46,7 @@ func TestShardComputationReturnsAtLeast1Shart(t *testing.T) {
}

func TestShardComputationScaleDown(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)}
expected := 2
result := pass.ComputeShards(1, 1_000_001)
if result != expected {
Expand Down
4 changes: 2 additions & 2 deletions service/controller/clusterapi/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
)

type ControllerConfig struct {
Expand Down Expand Up @@ -50,7 +50,7 @@ type ControllerConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down
5 changes: 3 additions & 2 deletions service/controller/clusterapi/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/organization"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/password"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/alertmanagerwiring"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/heartbeat"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/heartbeatwebhookconfig"
Expand Down Expand Up @@ -68,7 +69,7 @@ type Config struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down Expand Up @@ -198,7 +199,7 @@ func New(config Config) ([]resource.Interface, error) {
Region: config.Region,
Version: config.PrometheusVersion,

PrometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
ShardingStrategy: config.ShardingStrategy,
}

remoteWriteConfigResource, err = remotewriteconfig.New(c)
Expand Down
4 changes: 2 additions & 2 deletions service/controller/managementcluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/key"
)

Expand Down Expand Up @@ -55,7 +55,7 @@ type ControllerConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down
5 changes: 3 additions & 2 deletions service/controller/managementcluster/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/organization"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/password"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/alertmanagerconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/alertmanagerwiring"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/heartbeat"
Expand Down Expand Up @@ -73,7 +74,7 @@ type resourcesConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down Expand Up @@ -355,7 +356,7 @@ func newResources(config resourcesConfig) ([]resource.Interface, error) {
Region: config.Region,
Version: config.PrometheusVersion,

PrometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
ShardingStrategy: config.ShardingStrategy,
}

remoteWriteConfigResource, err = remotewriteconfig.New(c)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/organization"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheusquerier"
remotewriteconfiguration "github.com/giantswarm/prometheus-meta-operator/v2/pkg/remotewrite/configuration"
"github.com/giantswarm/prometheus-meta-operator/v2/service/key"
Expand All @@ -36,7 +37,7 @@ type Config struct {
Region string
Version string

PrometheusAgentShardingStrategy PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy
}

type Resource struct {
Expand All @@ -51,7 +52,7 @@ type Resource struct {
region string
version string

prometheusAgentShardingStrategy PrometheusAgentShardingStrategy
shardingStrategy agent.ShardingStrategy
}

func New(config Config) (*Resource, error) {
Expand Down Expand Up @@ -92,7 +93,7 @@ func New(config Config) (*Resource, error) {
region: config.Region,
version: config.Version,

prometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
shardingStrategy: config.ShardingStrategy,
}

return r, nil
Expand Down Expand Up @@ -157,17 +158,23 @@ func (r *Resource) desiredConfigMap(ctx context.Context, cluster metav1.Object,

// We want to compute the number of shards based on the number of nodes.
func (r *Resource) getShardsCountForCluster(cluster metav1.Object, currentShardCount int) (int, error) {
clusterShardingStrategy, err := key.GetClusterShardingStrategy(cluster)
if err != nil {
return 0, microerror.Mask(err)
}

shardingStrategy := r.shardingStrategy.Merge(clusterShardingStrategy)
headSeries, err := prometheusquerier.QueryTSDBHeadSeries(key.ClusterID(cluster))
if err != nil {
// If prometheus is not accessible (for instance, not running because this is a new cluster, we check if prometheus is accessible)
var dnsError *net.DNSError
if errors.As(err, &dnsError) {
return r.prometheusAgentShardingStrategy.ComputeShards(currentShardCount, 1), nil
return shardingStrategy.ComputeShards(currentShardCount, 1), nil
}

return 0, microerror.Mask(err)
}
return r.prometheusAgentShardingStrategy.ComputeShards(currentShardCount, headSeries), nil
return shardingStrategy.ComputeShards(currentShardCount, headSeries), nil
}

func (r *Resource) createConfigMap(ctx context.Context, cluster metav1.Object, name string, namespace string, version string) error {
Expand Down
30 changes: 0 additions & 30 deletions service/controller/resource/monitoring/remotewriteconfig/types.go

This file was deleted.

21 changes: 21 additions & 0 deletions service/key/key.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"math"
"strconv"
"strings"

"github.com/giantswarm/k8sclient/v7/pkg/k8sclient"
Expand All @@ -16,6 +17,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
)

const (
Expand Down Expand Up @@ -253,6 +255,25 @@ func ClusterID(cluster metav1.Object) string {
return cluster.GetName()
}

func GetClusterShardingStrategy(cluster metav1.Object) (*agent.ShardingStrategy, error) {
var err error
var scaleUpSeriesCount, scaleDownPercentage float64
if value, ok := cluster.GetAnnotations()["monitoring.giantswarm.io/prometheus-agent-scale-up-series-count"]; ok {
if scaleUpSeriesCount, err = strconv.ParseFloat(value, 64); err != nil {
return nil, err
}
}
if value, ok := cluster.GetAnnotations()["monitoring.giantswarm.io/prometheus-agent-scale-down-percentage"]; ok {
if scaleDownPercentage, err = strconv.ParseFloat(value, 64); err != nil {
return nil, err
}
}
return &agent.ShardingStrategy{
ScaleUpSeriesCount: scaleUpSeriesCount,
ScaleDownPercentage: scaleDownPercentage,
}, nil
}

func Heartbeat() string {
return "heartbeat"
}
Expand Down
12 changes: 6 additions & 6 deletions service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/flag"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/clusterapi"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/managementcluster"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/remotewrite"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
)

// Config represents the configuration used to create a new service.
Expand Down Expand Up @@ -148,9 +148,9 @@ func New(config Config) (*Service, error) {
Flavor: config.Viper.GetString(config.Flag.Service.Provider.Flavor),
}

var prometheusAgentShardingStrategy = remotewriteconfig.PrometheusAgentShardingStrategy{
ShardScaleUpSeriesCount: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleUpSeriesCount),
ShardScaleDownPercentage: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleDownPercentage),
var shardingStrategy = agent.ShardingStrategy{
ScaleUpSeriesCount: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleUpSeriesCount),
ScaleDownPercentage: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleDownPercentage),
}
var proxyConfig = httpproxy.FromEnvironment()
var clusterapiController *clusterapi.Controller
Expand Down Expand Up @@ -185,7 +185,7 @@ func New(config Config) (*Service, error) {
PrometheusImageRepository: config.Viper.GetString(config.Flag.Service.Prometheus.ImageRepository),
PrometheusVersion: config.Viper.GetString(config.Flag.Service.Prometheus.Version),

PrometheusAgentShardingStrategy: prometheusAgentShardingStrategy,
ShardingStrategy: shardingStrategy,

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),
Expand Down Expand Up @@ -235,7 +235,7 @@ func New(config Config) (*Service, error) {
PrometheusScrapeInterval: config.Viper.GetString(config.Flag.Service.Prometheus.ScrapeInterval),
PrometheusVersion: config.Viper.GetString(config.Flag.Service.Prometheus.Version),

PrometheusAgentShardingStrategy: prometheusAgentShardingStrategy,
ShardingStrategy: shardingStrategy,

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),
Expand Down

0 comments on commit 4841448

Please sign in to comment.