Skip to content

Commit

Permalink
Add ability to filter metrics by collection level (#2226)
Browse files Browse the repository at this point in the history
  • Loading branch information
zippolyte committed Sep 19, 2018
1 parent 266e233 commit 554a0fd
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 103 deletions.
212 changes: 125 additions & 87 deletions vsphere/datadog_checks/vsphere/data/conf.yaml.example
Original file line number Diff line number Diff line change
@@ -1,93 +1,131 @@
# Section used for global vsphere check config
## Section used for global vsphere check configuration

init_config:
# This value is used to determine the number of MORs the check will retrieve metrics for in the same API call
# If too many API calls are made by the check, consider increasing this value
# If payloads are too large, consider decreasing this value
# A value <= 0 means unlimited: it will make one query for all the monitored objects
# Optional, default to 50
# batch_morlist_size: 50

# This value is used to determine the maximum number of MORs returned by vCenter in the same API call,
# when exploring the infrastructure
# If too many API calls are made by the check, consider increasing this value
# If payloads are too large, consider decreasing this value
# A value <= 0 means unlimited: it will query for the maximum number of objects at once
# Optional, default to 500
# batch_property_collector_size: 500

# Number of seconds between each discovering and caching of your vSphere environment
# Consider increasing this value if your environment is large, as caching can take some time to complete
# Optional, default to 180 seconds
# refresh_morlist_interval: 180

# Number of seconds to wait before an object is considered deleted from your vSphere environment and removed from the cache
# This value should be higher than refresh_morlist_interval.
# Optional, default to 2 * refresh_morlist_interval
# clean_morlist_interval: 360

# Number of seconds between each refresh of the metrics metadata cache
# Optional, default to 600 seconds
# refresh_metrics_metadata_interval: 600


# Define your list of instances here
# each item is a vCenter instance you want to connect to and
# fetch metrics from

## @param batch_morlist_size - integer - optional - default: 50
## This value is used to determine the number of MORs the check will retrieve metrics for in the same API call
## If too many API calls are made by the check, consider increasing this value
## If payloads are too large, consider decreasing this value
## A value <= 0 means unlimited: it makes one query for all the monitored objects
#
# batch_morlist_size: 50

## @param batch_property_collector_size - integer - optional - default: 500
## This value is used to determine the maximum number of MORs returned by vCenter in the same API call,
## when exploring the infrastructure
## If too many API calls are made by the check, consider increasing this value
## If payloads are too large, consider decreasing this value
## A value <= 0 means unlimited: it will query for the maximum number of objects at once
#
# batch_property_collector_size: 500

## @param refresh_morlist_interval - integer - optional - default: 180
## Number of seconds between each discovering and caching of your vSphere environment
## Consider increasing this value if your environment is large, as caching can take some time to complete
#
# refresh_morlist_interval: 180

## @param clean_morlist_interval - integer - optional - default: 2*refresh_morlist_interval
## Number of seconds to wait before an object is considered deleted from your vSphere environment and removed from the cache
## This value should be higher than refresh_morlist_interval.
#
# clean_morlist_interval: 360

## @param refresh_metrics_metadata_interval - integer - optional - default: 600
## Number of seconds between each refresh of the metrics metadata cache
#
# refresh_metrics_metadata_interval: 600


## Define your list of instances here. Each item is a
## vCenter instance you want to connect to and fetch metrics from.

instances:
# name must be a unique key representing your vCenter instance
# mandatory

## @param name - string - required
## name must be a unique key representing your vCenter instance
#
- name: main-vcenter

# the host used to resolve the vCenter IP
# mandatory
host: vcenter.domain.com
## @param host - string - required
## the host used to resolve the vCenter IP
#
host: <HOSTNAME>

# Read-only credentials to connect to vCenter
# mandatory
# see https://app.datadoghq.com/account/settings#integrations/vsphere
## @param username - string - required
## Enter the username of the read-only credentials
## to connect to vCenter
## see https://app.datadoghq.com/account/settings#integrations/vsphere
#
username: [email protected]
password: mypassword

# Set to false to disable SSL verification, when connecting to vCenter
# optional
# ssl_verify: true

# Set to the absolute file path of a directory containing CA certificates
# in PEM format
# optional
# ssl_capath: "/path/to/directory"

# Use a regex like this if you want only the check
# to fetch metrics for these ESXi hosts and the VMs
# running on it
# optional
# host_include_only_regex: .*\.prod.datadoghq.com

# Use a regex to include only the VMs that are
# matching this pattern.
# optional
# vm_include_only_regex: .*\.sql\.datadoghq\.com

# Set to true if you'd like to only collect metrics on vSphere VMs which
# are marked by a custom field with the value 'DatadogMonitored'
# To set this custom field with PowerCLI, use the following command:
# Get-VM <MyVMName> | Set-CustomField -Name "DatadogMonitored" -Value "DatadogMonitored"
# optional
# include_only_marked: false

# When set to true, this will collect EVERY metric
# from vCenter, which means a LOT of metrics you probably
# do not care about. We have selected a set of metrics
# that are interesting to monitor for you if false
# optional
# all_metrics: false # /!\ will generate a lot of metrics

# Event config is a dictionary
# For now the only switch you can flip is collect_vcenter_alarms
# which will send as events the alarms set in vCenter
# event_config:
# collect_vcenter_alarms: true # defaults to false

# Set (optional) custom tags for each metric
# tags:
# - optional:tag1

## @param password - string - required
## Enter the password of the read-only credentials
## to connect to vCenter
## see https://app.datadoghq.com/account/settings#integrations/vsphere
#
password: <PASSWORD>

## @param ssl_verify - boolean - optional - default: true
## Set to false to disable SSL verification, when connecting to vCenter
#
# ssl_verify: true

## @param ssl_capath - string - optional
## Set to the absolute file path of a directory containing CA certificates
## in PEM format
#
# ssl_capath: "<DIRECTORY_PATH>"

## @param host_include_only_regex - string - optional
## Use a regex if you want the check to fetch metrics
## only for the matching ESXi hosts and the VMs
## running on them
#
# host_include_only_regex: <REGEX>

## @param vm_include_only_regex - string - optional
## Use a regex to include only the VMs whose
## names match this pattern.
#
# vm_include_only_regex: <REGEX>

## @param include_only_marked - boolean - optional - default: false
## Set to true if you'd like to only collect metrics on vSphere VMs which
## are marked by a custom field with the value 'DatadogMonitored'
## To set this custom field with PowerCLI, use the following command:
## Get-VM <MyVMName> | Set-CustomField -Name "DatadogMonitored" -Value "DatadogMonitored"
#
# include_only_marked: false

## @param all_metrics - boolean - optional - default: false
## This parameter has been deprecated in favor of `collection_level`
## Please set `collection_level: 4` if you want to collect all the metrics
## available on your system.
#
# all_metrics: false

## @param collection_level - integer - optional - default: 1
## A number between 1 and 4 to specify how many metrics will be sent
## 1: Only basic metrics - 4: every metric available.
## Warning: Depending on the size of the vSphere environment, metric collection can be slow,
## very CPU intensive and put pressure on the vCenter Server
#
# collection_level: 1

## @param event_config - dictionary - optional
## Event config is a dictionary
## For now the only switch you can flip is collect_vcenter_alarms
## which will send as events the alarms set in vCenter
#
# event_config:
# collect_vcenter_alarms: true

## @param tags - list of key:value element - optional
## List of tags to attach to every metric, event and service check emitted by this integration.
##
## Learn more about tagging: https://docs.datadoghq.com/tagging/
#
# tags:
# - <KEY_1>:<VALUE_1>
# - <KEY_2>:<VALUE_2>
73 changes: 58 additions & 15 deletions vsphere/datadog_checks/vsphere/vsphere.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,15 @@
vim.Folder
]

# Maps a performance counter's rollup type (as a string) to the short suffix
# appended to the metric name by `format_metric_name`.
SHORT_ROLLUP = {
    "average": "avg",
    "summation": "sum",
    "maximum": "max",
    "minimum": "min",
    "latest": "latest",
    "none": "raw",
}


def trace_method(method):
"""
Expand Down Expand Up @@ -668,27 +677,59 @@ def _cache_metrics_metadata(self, instance):

new_metadata = {}
metric_ids = []
for counter in perfManager.perfCounter:
metric_name = "{}.{}".format(counter.groupInfo.key, counter.nameInfo.key)
new_metadata[counter.key] = {
'name': metric_name,
'unit': counter.unitInfo.key,
}
# Build the list of metrics we will want to collect
if instance.get("all_metrics") or metric_name in BASIC_METRICS:
# Use old behaviour with metrics to collect defined by our constants
if self.in_compatibility_mode(instance, log_warning=True):
for counter in perfManager.perfCounter:
metric_name = self.format_metric_name(counter, compatibility=True)
new_metadata[counter.key] = {
'name': metric_name,
'unit': counter.unitInfo.key,
}
# Build the list of metrics we will want to collect
if instance.get("all_metrics") or metric_name in BASIC_METRICS:
metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))
else:
collection_level = instance.get("collection_level", 1)
for counter in perfManager.QueryPerfCounterByLevel(collection_level):
new_metadata[counter.key] = {
"name": self.format_metric_name(counter),
"unit": counter.unitInfo.key
}
# Build the list of metrics we will want to collect
metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))

self.cache_config.set_last(CacheConfig.Metadata, i_key, time.time())

self.log.info("Finished metadata collection for instance {}".format(i_key))
# Reset metadata
self.metadata_cache.set_metadata(i_key, new_metadata)
self.metadata_cache.set_metric_ids(i_key, metric_ids)

self.cache_config.set_last(CacheConfig.Metadata, i_key, time.time())

# ## <TEST-INSTRUMENTATION>
self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', t.total(), tags=custom_tags)
# ## </TEST-INSTRUMENTATION>

def format_metric_name(self, counter, compatibility=False):
    """
    Build the dotted metric name for a performance counter.

    In compatibility mode the name is `<group>.<name>`. Otherwise the short
    rollup suffix looked up in `SHORT_ROLLUP` is appended, giving
    `<group>.<name>.<rollup>`.

    Raises KeyError if `str(counter.rollupType)` is not a key of `SHORT_ROLLUP`.
    """
    group_key = counter.groupInfo.key
    name_key = counter.nameInfo.key

    if compatibility:
        return "{}.{}".format(group_key, name_key)

    rollup_suffix = SHORT_ROLLUP[str(counter.rollupType)]
    return "{}.{}.{}".format(group_key, name_key, rollup_suffix)

def in_compatibility_mode(self, instance, log_warning=False):
    """
    Tell whether the instance should use the legacy `all_metrics` behaviour.

    Returns True only when `all_metrics` is set (to any non-None value) and
    `collection_level` is not set. When both are set, `all_metrics` is
    ignored and False is returned. When `log_warning` is true, a warning is
    emitted in either of those two cases.
    """
    if instance.get("all_metrics") is None:
        # Legacy flag absent: nothing to be compatible with.
        return False

    level_configured = instance.get("collection_level") is not None

    if level_configured:
        if log_warning:
            self.log.warning("Using both `all_metrics` and `collection_level` configuration flag."
                             " `all_metrics` will be ignored.")
        return False

    if log_warning:
        self.warning("The configuration flag `all_metrics` will soon be deprecated. "
                     "Consider using `collection_level` instead.")
    return True

def _transform_value(self, instance, counter_id, value):
""" Given the counter_id, look up for the metrics metadata to check the vsphere
type of the counter and apply pre-reporting transformation if needed.
Expand Down Expand Up @@ -725,6 +766,7 @@ def _collect_metrics_async(self, instance, query_specs):
self.log.error("Trying to get metrics from object %s deleted from the cache, skipping. "
"Consider increasing the parameter `clean_morlist_interval` to avoid that", mor_name)
continue

for result in mor_perfs.value:
counter_id = result.id.counterId
if not self.metadata_cache.contains(i_key, counter_id):
Expand All @@ -736,12 +778,13 @@ def _collect_metrics_async(self, instance, query_specs):
# Metric types are absolute, delta, and rate
metric_name = self.metadata_cache.get_metadata(i_key, result.id.counterId).get('name')

if metric_name not in ALL_METRICS:
self.log.debug(u"Skipping unknown `%s` metric.", metric_name)
continue
if self.in_compatibility_mode(instance):
if metric_name not in ALL_METRICS:
self.log.debug("Skipping unknown `{}` metric.".format(metric_name))
continue

if not result.value:
self.log.debug(u"Skipping `%s` metric because the value is empty", metric_name)
self.log.debug("Skipping `{}` metric because the value is empty".format(metric_name))
continue

instance_name = result.id.instance or "none"
Expand All @@ -757,7 +800,7 @@ def _collect_metrics_async(self, instance, query_specs):
"vsphere.{}".format(metric_name),
value,
hostname=mor['hostname'],
tags=['instance:{}'.format(instance_name)] + custom_tags
tags=tags + custom_tags
)

# ## <TEST-INSTRUMENTATION>
Expand Down
Loading

0 comments on commit 554a0fd

Please sign in to comment.