Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: configure prometheus agent sharding per cluster #1610

Merged
merged 6 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add `cluster_control_plane_unhealthy` inhibition.
- Allow Prometheus Agent Sharding strategy to be configured per cluster instead of per installation.
QuentinBisson marked this conversation as resolved.
Show resolved Hide resolved

## [4.74.0] - 2024-05-02

Expand Down
46 changes: 46 additions & 0 deletions pkg/prometheus/agent/sharding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package agent

import "math"

type ShardingStrategy struct {
// Configures the number of series needed to add a new shard. Computation is number of series / ScaleUpSeriesCount
ScaleUpSeriesCount float64
// Percentage of needed series based on ScaleUpSeriesCount to scale down agents
ScaleDownPercentage float64
}

func (pass1 ShardingStrategy) Merge(pass2 *ShardingStrategy) ShardingStrategy {
strategy := ShardingStrategy{
pass1.ScaleUpSeriesCount,
pass1.ScaleDownPercentage,
}
if pass2 != nil {
if pass2.ScaleUpSeriesCount > 0 {
strategy.ScaleUpSeriesCount = pass2.ScaleUpSeriesCount
}
if pass2.ScaleDownPercentage > 0 {
strategy.ScaleDownPercentage = pass2.ScaleDownPercentage
}
}
return strategy
}

// We want to start with 1 prometheus-agent for each 1M time series with a scale down 20% threshold.
func (pass ShardingStrategy) ComputeShards(currentShardCount int, timeSeries float64) int {
shardScaleDownThreshold := pass.ScaleDownPercentage * pass.ScaleUpSeriesCount
desiredShardCount := int(math.Ceil(timeSeries / pass.ScaleUpSeriesCount))

// Compute Scale Down
if currentShardCount > desiredShardCount {
// We get the rest of a division of timeSeries by shardStep and we compare it with the scale down threshold
if math.Mod(timeSeries, pass.ScaleUpSeriesCount) > pass.ScaleUpSeriesCount-shardScaleDownThreshold {
desiredShardCount = currentShardCount
}
}

// We always have a minimum of 1 agent, even if there is no worker node
if desiredShardCount <= 0 {
return 1
}
return desiredShardCount
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package remotewriteconfig
package agent

import (
"flag"
Expand All @@ -8,7 +8,7 @@ import (
var _ = flag.Bool("update", false, "update the output file")

func TestShardComputationScaleUp(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)}

expected := 1
result := pass.ComputeShards(0, float64(1_000_000))
Expand All @@ -30,7 +30,7 @@ func TestShardComputationScaleUp(t *testing.T) {
}

func TestShardComputationReturnsAtLeast1Shart(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)}

expected := 1
result := pass.ComputeShards(0, 0)
Expand All @@ -46,7 +46,7 @@ func TestShardComputationReturnsAtLeast1Shart(t *testing.T) {
}

func TestShardComputationScaleDown(t *testing.T) {
pass := PrometheusAgentShardingStrategy{ShardScaleUpSeriesCount: float64(1_000_000), ShardScaleDownPercentage: float64(0.20)}
pass := ShardingStrategy{ScaleUpSeriesCount: float64(1_000_000), ScaleDownPercentage: float64(0.20)}
expected := 2
result := pass.ComputeShards(1, 1_000_001)
if result != expected {
Expand Down
4 changes: 2 additions & 2 deletions service/controller/clusterapi/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
)

type ControllerConfig struct {
Expand Down Expand Up @@ -50,7 +50,7 @@ type ControllerConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down
5 changes: 3 additions & 2 deletions service/controller/clusterapi/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/organization"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/password"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/alertmanagerwiring"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/heartbeat"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/heartbeatwebhookconfig"
Expand Down Expand Up @@ -68,7 +69,7 @@ type Config struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down Expand Up @@ -198,7 +199,7 @@ func New(config Config) ([]resource.Interface, error) {
Region: config.Region,
Version: config.PrometheusVersion,

PrometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
ShardingStrategy: config.ShardingStrategy,
}

remoteWriteConfigResource, err = remotewriteconfig.New(c)
Expand Down
4 changes: 2 additions & 2 deletions service/controller/managementcluster/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/key"
)

Expand Down Expand Up @@ -55,7 +55,7 @@ type ControllerConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down
5 changes: 3 additions & 2 deletions service/controller/managementcluster/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/organization"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/password"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/alertmanagerconfig"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/alertmanagerwiring"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/alerting/heartbeat"
Expand Down Expand Up @@ -73,7 +74,7 @@ type resourcesConfig struct {
PrometheusImageRepository string
PrometheusVersion string

PrometheusAgentShardingStrategy remotewriteconfig.PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy

RestrictedAccessEnabled bool
WhitelistedSubnets string
Expand Down Expand Up @@ -355,7 +356,7 @@ func newResources(config resourcesConfig) ([]resource.Interface, error) {
Region: config.Region,
Version: config.PrometheusVersion,

PrometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
ShardingStrategy: config.ShardingStrategy,
}

remoteWriteConfigResource, err = remotewriteconfig.New(c)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/organization"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheusquerier"
remotewriteconfiguration "github.com/giantswarm/prometheus-meta-operator/v2/pkg/remotewrite/configuration"
"github.com/giantswarm/prometheus-meta-operator/v2/service/key"
Expand All @@ -36,7 +37,7 @@ type Config struct {
Region string
Version string

PrometheusAgentShardingStrategy PrometheusAgentShardingStrategy
ShardingStrategy agent.ShardingStrategy
}

type Resource struct {
Expand All @@ -51,7 +52,7 @@ type Resource struct {
region string
version string

prometheusAgentShardingStrategy PrometheusAgentShardingStrategy
shardingStrategy agent.ShardingStrategy
}

func New(config Config) (*Resource, error) {
Expand Down Expand Up @@ -92,7 +93,7 @@ func New(config Config) (*Resource, error) {
region: config.Region,
version: config.Version,

prometheusAgentShardingStrategy: config.PrometheusAgentShardingStrategy,
shardingStrategy: config.ShardingStrategy,
}

return r, nil
Expand Down Expand Up @@ -157,17 +158,23 @@ func (r *Resource) desiredConfigMap(ctx context.Context, cluster metav1.Object,

// We want to compute the number of shards based on the number of nodes.
func (r *Resource) getShardsCountForCluster(cluster metav1.Object, currentShardCount int) (int, error) {
clusterShardingStrategy, err := key.GetClusterShardingStrategy(cluster)
if err != nil {
return 0, microerror.Mask(err)
}

shardingStrategy := r.shardingStrategy.Merge(clusterShardingStrategy)
headSeries, err := prometheusquerier.QueryTSDBHeadSeries(key.ClusterID(cluster))
if err != nil {
// If prometheus is not accessible (for instance, not running because this is a new cluster, we check if prometheus is accessible)
var dnsError *net.DNSError
if errors.As(err, &dnsError) {
return r.prometheusAgentShardingStrategy.ComputeShards(currentShardCount, 1), nil
return shardingStrategy.ComputeShards(currentShardCount, 1), nil
}

return 0, microerror.Mask(err)
}
return r.prometheusAgentShardingStrategy.ComputeShards(currentShardCount, headSeries), nil
return shardingStrategy.ComputeShards(currentShardCount, headSeries), nil
}

func (r *Resource) createConfigMap(ctx context.Context, cluster metav1.Object, name string, namespace string, version string) error {
Expand Down
30 changes: 0 additions & 30 deletions service/controller/resource/monitoring/remotewriteconfig/types.go

This file was deleted.

21 changes: 21 additions & 0 deletions service/key/key.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"math"
"strconv"
"strings"

"github.com/giantswarm/k8sclient/v7/pkg/k8sclient"
Expand All @@ -16,6 +17,7 @@ import (

"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
)

const (
Expand Down Expand Up @@ -253,6 +255,25 @@ func ClusterID(cluster metav1.Object) string {
return cluster.GetName()
}

func GetClusterShardingStrategy(cluster metav1.Object) (*agent.ShardingStrategy, error) {
var err error
var scaleUpSeriesCount, scaleDownPercentage float64
if value, ok := cluster.GetAnnotations()["monitoring.giantswarm.io/prometheus-agent-scale-up-series-count"]; ok {
if scaleUpSeriesCount, err = strconv.ParseFloat(value, 64); err != nil {
return nil, err
}
}
if value, ok := cluster.GetAnnotations()["monitoring.giantswarm.io/prometheus-agent-scale-down-percentage"]; ok {
if scaleDownPercentage, err = strconv.ParseFloat(value, 64); err != nil {
return nil, err
}
}
return &agent.ShardingStrategy{
ScaleUpSeriesCount: scaleUpSeriesCount,
ScaleDownPercentage: scaleDownPercentage,
}, nil
}

func Heartbeat() string {
return "heartbeat"
}
Expand Down
12 changes: 6 additions & 6 deletions service/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ import (
"github.com/giantswarm/prometheus-meta-operator/v2/flag"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/cluster"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/project"
"github.com/giantswarm/prometheus-meta-operator/v2/pkg/prometheus/agent"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/clusterapi"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/managementcluster"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/remotewrite"
"github.com/giantswarm/prometheus-meta-operator/v2/service/controller/resource/monitoring/remotewriteconfig"
)

// Config represents the configuration used to create a new service.
Expand Down Expand Up @@ -148,9 +148,9 @@ func New(config Config) (*Service, error) {
Flavor: config.Viper.GetString(config.Flag.Service.Provider.Flavor),
}

var prometheusAgentShardingStrategy = remotewriteconfig.PrometheusAgentShardingStrategy{
ShardScaleUpSeriesCount: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleUpSeriesCount),
ShardScaleDownPercentage: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleDownPercentage),
var shardingStrategy = agent.ShardingStrategy{
ScaleUpSeriesCount: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleUpSeriesCount),
ScaleDownPercentage: config.Viper.GetFloat64(config.Flag.Service.PrometheusAgent.ShardScaleDownPercentage),
}
var proxyConfig = httpproxy.FromEnvironment()
var clusterapiController *clusterapi.Controller
Expand Down Expand Up @@ -185,7 +185,7 @@ func New(config Config) (*Service, error) {
PrometheusImageRepository: config.Viper.GetString(config.Flag.Service.Prometheus.ImageRepository),
PrometheusVersion: config.Viper.GetString(config.Flag.Service.Prometheus.Version),

PrometheusAgentShardingStrategy: prometheusAgentShardingStrategy,
ShardingStrategy: shardingStrategy,

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),
Expand Down Expand Up @@ -235,7 +235,7 @@ func New(config Config) (*Service, error) {
PrometheusScrapeInterval: config.Viper.GetString(config.Flag.Service.Prometheus.ScrapeInterval),
PrometheusVersion: config.Viper.GetString(config.Flag.Service.Prometheus.Version),

PrometheusAgentShardingStrategy: prometheusAgentShardingStrategy,
ShardingStrategy: shardingStrategy,

RestrictedAccessEnabled: config.Viper.GetBool(config.Flag.Service.Security.RestrictedAccess.Enabled),
WhitelistedSubnets: config.Viper.GetString(config.Flag.Service.Security.RestrictedAccess.Subnets),
Expand Down