Skip to content

Commit

Permalink
[autodiscovery/providers] Fix telemetry gauge with number of errors (#20917)
Browse files Browse the repository at this point in the history

* Revert "Add metrics for AD providers (#20144)"

This reverts commit 7e7189b.

* [autodiscovery/providers] Set current num errors in telemetry

* [autodiscovery/telemetry] Clarify description of "errors" gauge
  • Loading branch information
davidor authored Nov 16, 2023
1 parent 7b97e89 commit 2875643
Show file tree
Hide file tree
Showing 15 changed files with 8 additions and 92 deletions.
5 changes: 0 additions & 5 deletions pkg/autodiscovery/providers/cloudfoundry.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import (
"github.com/DataDog/datadog-agent/pkg/autodiscovery/common/utils"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/providers/names"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/telemetry"
"github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/util/cloudproviders/cloudfoundry"
"github.com/DataDog/datadog-agent/pkg/util/log"
Expand All @@ -40,7 +39,6 @@ func NewCloudFoundryConfigProvider(*config.ConfigurationProviders) (ConfigProvid
var err error

if cfp.bbsCache, err = cloudfoundry.GetGlobalBBSCache(); err != nil {
telemetry.Errors.Inc(names.CloudFoundryBBS)
return nil, err
}
return cfp, nil
Expand Down Expand Up @@ -88,7 +86,6 @@ func (cf CloudFoundryConfigProvider) getConfigsForApp(desiredLRP *cloudfoundry.D
}
parsedConfigs, errs := utils.ExtractTemplatesFromMap(id.String(), convertedADVal, "")
for _, err := range errs {
telemetry.Errors.Inc(names.CloudFoundryBBS)
log.Errorf("Cannot parse endpoint template for service %s of app %s: %s, skipping",
adName, desiredLRP.AppGUID, err)
}
Expand All @@ -101,7 +98,6 @@ func (cf CloudFoundryConfigProvider) getConfigsForApp(desiredLRP *cloudfoundry.D
// if service is found in VCAP_SERVICES (non-container service), we will run a single check per App
err := cf.renderExtractedConfigs(parsedConfigs, variables, vcVal)
if err != nil {
telemetry.Errors.Inc(names.CloudFoundryBBS)
log.Errorf("Failed to render config for service %s of app %s: %s", adName, desiredLRP.AppGUID, err)
} else {
success = true
Expand All @@ -116,7 +112,6 @@ func (cf CloudFoundryConfigProvider) getConfigsForApp(desiredLRP *cloudfoundry.D
if allSvcsStr == "" {
allSvcsStr = "no services found"
}
telemetry.Errors.Inc(names.CloudFoundryBBS)
log.Errorf(
"Service %s for app %s has variables configured, but is not present in VCAP_SERVICES (found services: %s)",
adName, desiredLRP.AppGUID, allSvcsStr,
Expand Down
5 changes: 0 additions & 5 deletions pkg/autodiscovery/providers/clusterchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import (

"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/providers/names"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/telemetry"
"github.com/DataDog/datadog-agent/pkg/clusteragent/clusterchecks/types"
"github.com/DataDog/datadog-agent/pkg/config"
ddErrors "github.com/DataDog/datadog-agent/pkg/errors"
Expand Down Expand Up @@ -57,7 +56,6 @@ func NewClusterChecksConfigProvider(providerConfig *config.ConfigurationProvider
if config.Datadog.GetBool("cloud_foundry") {
boshID := config.Datadog.GetString("bosh_id")
if boshID == "" {
telemetry.Errors.Inc(names.ClusterChecks)
log.Warn("configuration variable cloud_foundry is set to true, but bosh_id is empty, can't retrieve node name")
} else {
c.identifier = boshID
Expand Down Expand Up @@ -142,7 +140,6 @@ func (c *ClusterChecksConfigProvider) Collect(ctx context.Context) ([]integratio
if c.dcaClient == nil {
err := c.initClient()
if err != nil {
telemetry.Errors.Inc(names.ClusterChecks)
return nil, err
}
}
Expand All @@ -152,7 +149,6 @@ func (c *ClusterChecksConfigProvider) Collect(ctx context.Context) ([]integratio
if (ddErrors.IsRemoteService(err) || ddErrors.IsTimeout(err)) && c.withinDegradedModePeriod() {
// Degraded mode: return the error to keep the configs scheduled
// during a Cluster Agent / network outage
telemetry.Errors.Inc(names.ClusterChecks)
return nil, err
}

Expand Down Expand Up @@ -198,7 +194,6 @@ func (c *ClusterChecksConfigProvider) heartbeatSender(ctx context.Context) {
extraHeartbeatTime = currentTime
log.Infof("Sent extra heartbeat at: %v", currentTime)
} else {
telemetry.Errors.Inc(names.ClusterChecks)
log.Warnf("Unable to send extra heartbeat to Cluster Agent, err: %v", err)
}
}
Expand Down
7 changes: 0 additions & 7 deletions pkg/autodiscovery/providers/consul.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"github.com/DataDog/datadog-agent/pkg/autodiscovery/common/utils"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/providers/names"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/telemetry"
"github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/util/log"
)
Expand Down Expand Up @@ -90,7 +89,6 @@ func NewConsulConfigProvider(providerConfig *config.ConfigurationProviders) (Con
cache := newProviderCache()
cli, err := consul.NewClient(clientCfg)
if err != nil {
telemetry.Errors.Inc(names.Consul)
return nil, fmt.Errorf("Unable to instantiate the consul client: %s", err)
}

Expand Down Expand Up @@ -138,7 +136,6 @@ func (p *ConsulConfigProvider) IsUpToDate(ctx context.Context) (bool, error) {
queryOptions = queryOptions.WithContext(ctx)
identifiers, _, err := kv.List(p.TemplateDir, queryOptions)
if err != nil {
telemetry.Errors.Inc(names.Consul)
return false, err
}
if p.cache.count != len(identifiers) {
Expand Down Expand Up @@ -174,7 +171,6 @@ func (p *ConsulConfigProvider) getIdentifiers(ctx context.Context, prefix string
// TODO: decide on the query parameters.
keys, _, err := kv.Keys(prefix, "", queryOptions)
if err != nil {
telemetry.Errors.Inc(names.Consul)
log.Error("Can't get templates keys from consul: ", err)
return identifiers
}
Expand Down Expand Up @@ -225,21 +221,18 @@ func (p *ConsulConfigProvider) getTemplates(ctx context.Context, key string) []i

checkNames, err := p.getCheckNames(ctx, checkNameKey)
if err != nil {
telemetry.Errors.Inc(names.Consul)
log.Errorf("Failed to retrieve check names at %s. Error: %s", checkNameKey, err)
return templates
}

initConfigs, err := p.getJSONValue(ctx, initKey)
if err != nil {
telemetry.Errors.Inc(names.Consul)
log.Errorf("Failed to retrieve init configs at %s. Error: %s", initKey, err)
return templates
}

instances, err := p.getJSONValue(ctx, instanceKey)
if err != nil {
telemetry.Errors.Inc(names.Consul)
log.Errorf("Failed to retrieve instances at %s. Error: %s", instanceKey, err)
return templates
}
Expand Down
5 changes: 2 additions & 3 deletions pkg/autodiscovery/providers/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,12 @@ func (k *ContainerConfigProvider) processEvents(evBundle workloadmeta.EventBundl
delete(k.configErrors, entityName)

default:
telemetry.Errors.Inc(names.KubeContainer)
log.Errorf("cannot handle event of type %d", event.Type)
}
}

telemetry.Errors.Set(float64(len(k.configErrors)), names.KubeContainer)

return changes
}

Expand Down Expand Up @@ -188,7 +189,6 @@ func (k *ContainerConfigProvider) generateConfig(e workloadmeta.Entity) ([]integ
for _, podContainer := range entity.GetAllContainers() {
container, err := k.workloadmetaStore.GetContainer(podContainer.ID)
if err != nil {
telemetry.Errors.Inc(names.KubeContainer)
log.Debugf("Pod %q has reference to non-existing container %q", entity.Name, podContainer.ID)
continue
}
Expand Down Expand Up @@ -248,7 +248,6 @@ func (k *ContainerConfigProvider) generateConfig(e workloadmeta.Entity) ([]integ
containerNames)...)

default:
telemetry.Errors.Inc(names.KubeContainer)
log.Errorf("cannot handle entity of kind %s", e.GetID().Kind)
}

Expand Down
5 changes: 0 additions & 5 deletions pkg/autodiscovery/providers/endpointschecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (

"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/providers/names"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/telemetry"
"github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/errors"
"github.com/DataDog/datadog-agent/pkg/util/clusteragent"
Expand Down Expand Up @@ -47,7 +46,6 @@ func NewEndpointsChecksConfigProvider(providerConfig *config.ConfigurationProvid
var err error
c.nodeName, err = getNodename(context.TODO())
if err != nil {
telemetry.Errors.Inc(names.EndpointsChecks)
log.Errorf("Cannot get node name: %s", err)
return nil, err
}
Expand Down Expand Up @@ -97,7 +95,6 @@ func (c *EndpointsChecksConfigProvider) Collect(ctx context.Context) ([]integrat
return nil, nil
}

telemetry.Errors.Inc(names.EndpointsChecks)
return nil, err
}

Expand All @@ -119,7 +116,6 @@ func getNodename(ctx context.Context) (string, error) {
}
ku, err := kubelet.GetKubeUtil()
if err != nil {
telemetry.Errors.Inc(names.EndpointsChecks)
log.Errorf("Cannot get kubeUtil object: %s", err)
return "", err
}
Expand All @@ -132,7 +128,6 @@ func (c *EndpointsChecksConfigProvider) initClient() error {
if err == nil {
c.dcaClient = dcaClient
}
telemetry.Errors.Inc(names.EndpointsChecks)
return err
}

Expand Down
7 changes: 0 additions & 7 deletions pkg/autodiscovery/providers/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"github.com/DataDog/datadog-agent/pkg/autodiscovery/common/utils"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/providers/names"
"github.com/DataDog/datadog-agent/pkg/autodiscovery/telemetry"
"github.com/DataDog/datadog-agent/pkg/config"
"github.com/DataDog/datadog-agent/pkg/util/log"
)
Expand Down Expand Up @@ -55,7 +54,6 @@ func NewEtcdConfigProvider(providerConfig *config.ConfigurationProviders) (Confi

cl, err := client.New(clientCfg)
if err != nil {
telemetry.Errors.Inc(names.Etcd)
return nil, fmt.Errorf("Unable to instantiate the etcd client: %s", err)
}
cache := newProviderCache()
Expand Down Expand Up @@ -87,7 +85,6 @@ func (p *EtcdConfigProvider) getIdentifiers(ctx context.Context, key string) []s
identifiers := make([]string, 0)
resp, err := p.Client.Get(ctx, key, &client.GetOptions{Recursive: true})
if err != nil {
telemetry.Errors.Inc(names.Etcd)
log.Error("Can't get templates keys from etcd: ", err)
return identifiers
}
Expand All @@ -110,21 +107,18 @@ func (p *EtcdConfigProvider) getTemplates(ctx context.Context, key string) []int

checkNames, err := p.getCheckNames(ctx, checkNameKey)
if err != nil {
telemetry.Errors.Inc(names.Etcd)
log.Errorf("Failed to retrieve check names at %s. Error: %s", checkNameKey, err)
return nil
}

initConfigs, err := p.getJSONValue(ctx, initKey)
if err != nil {
telemetry.Errors.Inc(names.Etcd)
log.Errorf("Failed to retrieve init configs at %s. Error: %s", initKey, err)
return nil
}

instances, err := p.getJSONValue(ctx, instanceKey)
if err != nil {
telemetry.Errors.Inc(names.Etcd)
log.Errorf("Failed to retrieve instances at %s. Error: %s", instanceKey, err)
return nil
}
Expand Down Expand Up @@ -169,7 +163,6 @@ func (p *EtcdConfigProvider) IsUpToDate(ctx context.Context) (bool, error) {

resp, err := p.Client.Get(ctx, p.templateDir, &client.GetOptions{Recursive: true})
if err != nil {
telemetry.Errors.Inc(names.Etcd)
return false, err
}
identifiers := resp.Node.Nodes
Expand Down
1 change: 0 additions & 1 deletion pkg/autodiscovery/providers/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ func NewFileConfigProvider() *FileConfigProvider {
func (c *FileConfigProvider) Collect(ctx context.Context) ([]integration.Config, error) {
configs, errors, err := ReadConfigFiles(WithoutAdvancedAD)
if err != nil {
telemetry.Errors.Inc(names.File)
return nil, err
}

Expand Down
17 changes: 2 additions & 15 deletions pkg/autodiscovery/providers/kube_endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,11 @@ func NewKubeEndpointsConfigProvider(*config.ConfigurationProviders) (ConfigProvi
// Use GetAPIClient (no wait): the client should already have been initialized by the Cluster Agent main entrypoint
ac, err := apiserver.GetAPIClient()
if err != nil {
telemetry.Errors.Inc(names.KubeEndpoints)
return nil, fmt.Errorf("cannot connect to apiserver: %s", err)
}

servicesInformer := ac.InformerFactory.Core().V1().Services()
if servicesInformer == nil {
telemetry.Errors.Inc(names.KubeEndpoints)
return nil, fmt.Errorf("cannot get service informer: %s", err)
}

Expand All @@ -83,13 +81,11 @@ func NewKubeEndpointsConfigProvider(*config.ConfigurationProviders) (ConfigProvi
UpdateFunc: p.invalidateIfChangedService,
DeleteFunc: p.invalidate,
}); err != nil {
telemetry.Errors.Inc(names.KubeEndpoints)
return nil, fmt.Errorf("cannot add event handler to service informer: %s", err)
}

endpointsInformer := ac.InformerFactory.Core().V1().Endpoints()
if endpointsInformer == nil {
telemetry.Errors.Inc(names.KubeEndpoints)
return nil, fmt.Errorf("cannot get endpoint informer: %s", err)
}

Expand All @@ -98,7 +94,6 @@ func NewKubeEndpointsConfigProvider(*config.ConfigurationProviders) (ConfigProvi
if _, err := endpointsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: p.invalidateIfChangedEndpoints,
}); err != nil {
telemetry.Errors.Inc(names.KubeEndpoints)
return nil, fmt.Errorf("cannot add event handler to endpoint informer: %s", err)
}

Expand All @@ -123,7 +118,6 @@ func (k *kubeEndpointsConfigProvider) Collect(ctx context.Context) ([]integratio
for _, config := range parsedConfigsInfo {
kep, err := k.endpointsLister.Endpoints(config.namespace).Get(config.name)
if err != nil {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Cannot get Kubernetes endpoints: %s", err)
continue
}
Expand All @@ -147,14 +141,12 @@ func (k *kubeEndpointsConfigProvider) invalidate(obj interface{}) {
// It's possible that we got a DeletedFinalStateUnknown here
deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Received unexpected object: %T", obj)
return
}

castedObj, ok = deletedState.Obj.(*v1.Service)
if !ok {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Expected DeletedFinalStateUnknown to contain *v1.Service, got: %T", deletedState.Obj)
return
}
Expand All @@ -172,14 +164,12 @@ func (k *kubeEndpointsConfigProvider) invalidateIfChangedService(old, obj interf
// nil pointers are safely handled by the casting logic.
castedObj, ok := obj.(*v1.Service)
if !ok {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Expected a *v1.Service type, got: %T", obj)
return
}
// Cast the old object, invalidate on casting error
castedOld, ok := old.(*v1.Service)
if !ok {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Expected a *v1.Service type, got: %T", old)
k.setUpToDate(false)
return
Expand All @@ -200,14 +190,12 @@ func (k *kubeEndpointsConfigProvider) invalidateIfChangedEndpoints(old, obj inte
// nil pointers are safely handled by the casting logic.
castedObj, ok := obj.(*v1.Endpoints)
if !ok {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Expected an *v1.Endpoints type, got: %T", obj)
return
}
// Cast the old object, invalidate on casting error
castedOld, ok := old.(*v1.Endpoints)
if !ok {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Expected a *v1.Endpoints type, got: %T", old)
k.setUpToDate(false)
return
Expand Down Expand Up @@ -249,7 +237,6 @@ func (k *kubeEndpointsConfigProvider) parseServiceAnnotationsForEndpoints(servic

endptConf, errors := utils.ExtractTemplatesFromPodAnnotations(endpointsID, svc.Annotations, kubeEndpointID)
for _, err := range errors {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Errorf("Cannot parse endpoint template for service %s/%s: %s", svc.Namespace, svc.Name, err)
}

Expand Down Expand Up @@ -285,13 +272,14 @@ func (k *kubeEndpointsConfigProvider) parseServiceAnnotationsForEndpoints(servic

k.cleanErrorsOfDeletedEndpoints(setEndpointIDs)

telemetry.Errors.Set(float64(len(k.configErrors)), names.KubeEndpoints)

return configsInfo
}

// generateConfigs creates a config template for each Endpoints IP
func generateConfigs(tpl integration.Config, resolveMode endpointResolveMode, kep *v1.Endpoints) []integration.Config {
if kep == nil {
telemetry.Errors.Inc(names.KubeEndpoints)
log.Warn("Nil Kubernetes Endpoints object, cannot generate config templates")
return []integration.Config{tpl}
}
Expand All @@ -306,7 +294,6 @@ func generateConfigs(tpl integration.Config, resolveMode endpointResolveMode, ke
case kubeEndpointResolveIP:
// In case of unknown value, fallback to auto
default:
telemetry.Errors.Inc(names.KubeEndpoints)
log.Warnf("Unknown resolve value: %s for endpoint: %s/%s - fallback to auto mode", resolveMode, namespace, name)
fallthrough
// Auto or empty (default to auto): we try to resolve the POD behind this address
Expand Down
Loading

0 comments on commit 2875643

Please sign in to comment.