Add ability to filter metrics by collection level (#2226)

DataDog · Sep 19, 2018 · 554a0fd · 554a0fd
1 parent 266e233
commit 554a0fd
Show file tree

Hide file tree

Showing 3 changed files with 267 additions and 103 deletions.
diff --git a/vsphere/datadog_checks/vsphere/data/conf.yaml.example b/vsphere/datadog_checks/vsphere/data/conf.yaml.example
@@ -1,93 +1,131 @@
-# Section used for global vsphere check config
+## Section used for global vsphere check configuration
+
 init_config:
-  # This value is used to determine the number of MORs the check will retrieve metrics for in the same API call
-  # If too many API calls are made by the check, consider increasing this value
-  # If payloads are too large, consider decreasing this value
-  # A value <= 0 means unlimited: it will make one query for all the monitored objects
-  # Optional, default to 50
-  # batch_morlist_size: 50
-
-  # This value is used to determine the maximum number of MORs returned by vCenter in the same API call,
-  # when exploring the infrastructure
-  # If too many API calls are made by the check, consider increasing this value
-  # If payloads are too large, consider decreasing this value
-  # A value <= 0 means unlimited: it will query for the maximum number of objects at once
-  # Optional, default to 500
-  # batch_property_collector_size: 500
-
-  # Number of seconds between each discovering and caching of your vSphere environment
-  # Consider increasing this value if your environment is large, as caching can take some time to complete
-  # Optional, default to 180 seconds
-  # refresh_morlist_interval: 180
-
-  # Number of seconds to wait before an object is considered deleted from your vSphere environment and removed from the cache
-  # This value should be higher than refresh_morlist_interval.
-  # Optional, default to 2 * refresh_morlist_interval
-  # clean_morlist_interval: 360
-
-  # Number of seconds between each refresh of the metrics metadata cache
-  # Optional, default to 600 seconds
-  # refresh_metrics_metadata_interval: 600
-
-
-# Define your list of instances here
-# each item is a vCenter instance you want to connect to and
-# fetch metrics from
+
+## @param batch_morlist_size - integer - optional - default: 50
+## This value is used to determine the number of MORs the check will retrieve metrics for in the same API call
+## If too many API calls are made by the check, consider increasing this value
+## If payloads are too large, consider decreasing this value
+## A value <= 0 means unlimited: it makes one query for all the monitored objects
+#
+#  batch_morlist_size: 50
+
+## @param batch_property_collector_size - integer - optional - default: 500
+## This value is used to determine the maximum number of MORs returned by vCenter in the same API call,
+## when exploring the infrastructure
+## If too many API calls are made by the check, consider increasing this value
+## If payloads are too large, consider decreasing this value
+## A value <= 0 means unlimited: it will query for the maximum number of objects at once
+#
+#  batch_property_collector_size: 500
+
+## @param refresh_morlist_interval - integer - optional - default: 180
+## Number of seconds between each discovering and caching of your vSphere environment
+## Consider increasing this value if your environment is large, as caching can take some time to complete
+#
+#  refresh_morlist_interval: 180
+
+## @param clean_morlist_interval - integer - optional - default: 2*refresh_morlist_interval
+## Number of seconds to wait before an object is considered deleted from your vSphere environment and removed from the cache
+## This value should be higher than refresh_morlist_interval.
+#
+#  clean_morlist_interval: 360
+
+## @param refresh_metrics_metadata_interval - integer - optional - default: 600
+## Number of seconds between each refresh of the metrics metadata cache
+#
+#  refresh_metrics_metadata_interval: 600
+
+
+## Define your list of instances here. Each item is a
+## vCenter instance you want to connect to and fetch metrics from.
+
 instances:
-  # name must be a unique key representing your vCenter instance
-  # mandatory
+
+  ## @param name - string - required
+  ## name must be a unique key representing your vCenter instance
+  #
   - name: main-vcenter
 
-    # the host used to resolve the vCenter IP
-    # mandatory
-    host: vcenter.domain.com
+  ## @param host - string - required
+  ## the host used to resolve the vCenter IP
+  #
+    host: <HOSTNAME>
 
-    # Read-only credentials to connect to vCenter
-    # mandatory
-    # see https://app.datadoghq.com/account/settings#integrations/vsphere
+  ## @param username - string - required 
+  ## Enter the username of the read-only credentials 
+  ## to connect to vCenter
+  ## see https://app.datadoghq.com/account/settings#integrations/vsphere
+  #
     username: datadog-readonly@vsphere.local
-    password: mypassword
-
-    # Set to false to disable SSL verification, when connecting to vCenter
-    # optional
-    # ssl_verify: true
-
-    # Set to the absolute file path of a directory containing CA certificates
-    # in PEM format
-    # optional
-    # ssl_capath: "/path/to/directory"
-
-    # Use a regex like this if you want only the check
-    # to fetch metrics for these ESXi hosts and the VMs
-    # running on it
-    # optional
-    # host_include_only_regex: .*\.prod.datadoghq.com
-
-    # Use a regex to include only the VMs that are
-    # matching this pattern.
-    # optional
-    # vm_include_only_regex: .*\.sql\.datadoghq\.com
-
-    # Set to true if you'd like to only collect metrics on vSphere VMs which
-    # are marked by a custom field with the value 'DatadogMonitored'
-    # To set this custom field with PowerCLI, use the follow command: 
-    #   Get-VM <MyVMName> | Set-CustomField -Name "DatadogMonitored" -Value "DatadogMonitored"
-    # optional
-    # include_only_marked: false
-
-    # When set to true, this will collect EVERY metric
-    # from vCenter, which means a LOT of metrics you probably
-    # do not care about. We have selected a set of metrics
-    # that are interesting to monitor for you if false
-    # optional
-    # all_metrics: false # /!\ will generate a lot of metrics
-
-    # Event config is a dictionary
-    # For now the only switch you can flip is collect_vcenter_alarms
-    # which will send as events the alarms set in vCenter
-    # event_config:
-    #   collect_vcenter_alarms: true # defaults to false
-
-    # Set (optional) custom tags for each metric
-    # tags:
-    #   - optional:tag1
+
+  ## @param password - string - required 
+  ## Enter the password of the read-only credentials 
+  ## to connect to vCenter
+  ## see https://app.datadoghq.com/account/settings#integrations/vsphere
+  # 
+    password: <PASSWORD>
+
+  ## @param ssl_verify - boolean - optional - default: true
+  ## Set to false to disable SSL verification, when connecting to vCenter
+  #
+  #  ssl_verify: true
+
+  ## @param ssl_capath - string - optional
+  ## Set to the absolute file path of a directory containing CA certificates
+  ## in PEM format
+  #
+  #  ssl_capath: "<DIRECTORY_PATH>"
+
+  ## @param host_include_only_regex - string - optional
+  ## Use a regex if you want the check to only
+  ## fetch metrics for the matching ESXi hosts and the VMs
+  ## running on them
+  #
+  #  host_include_only_regex: <REGEX>
+
+  ## @param vm_include_only_regex - string - optional
+  ## Use a regex to include only the VMs that are
+  ## matching this pattern.
+  #
+  #  vm_include_only_regex: <REGEX>
+
+  ## @param include_only_marked - boolean - optional - default: false
+  ## Set to true if you'd like to only collect metrics on vSphere VMs which
+  ## are marked by a custom field with the value 'DatadogMonitored'
+  ## To set this custom field with PowerCLI, use the following command:
+  ##   Get-VM <MyVMName> | Set-CustomField -Name "DatadogMonitored" -Value "DatadogMonitored"
+  #
+  #  include_only_marked: false
+
+  ## @param all_metrics - boolean - optional - default: false
+  ## This parameter has been deprecated in favor of `collection_level`
+  ## Please set `collection_level: 4` if you want to collect all the metrics
+  ## available on your system.
+  #
+  #  all_metrics: false
+
+  ## @param collection_level - integer - optional - default: 1
+  ## A number between 1 and 4 to specify how many metrics will be sent
+  ## 1: Only basic metrics - 4: every metric available.
+  ## Warning: Depending on the size of the vSphere environment, metric collection can be slow,
+  ## very CPU intensive and put pressure on the vCenter Server
+  # 
+  #  collection_level: 1
+
+  ## @param event_config - dictionary - optional 
+  ## Event config is a dictionary
+  ## For now the only switch you can flip is collect_vcenter_alarms
+  ## which will send as events the alarms set in vCenter
+  #
+  #  event_config:
+  #    collect_vcenter_alarms: true
+
+  ## @param tags - list of key:value elements - optional
+  ## List of tags to attach to every metric, event and service check emitted by this integration.
+  ## 
+  ## Learn more about tagging: https://docs.datadoghq.com/tagging/
+  #
+  #  tags:
+  #    - <KEY_1>:<VALUE_1>
+  #    - <KEY_2>:<VALUE_2>
diff --git a/vsphere/datadog_checks/vsphere/vsphere.py b/vsphere/datadog_checks/vsphere/vsphere.py
@@ -67,6 +67,15 @@
     vim.Folder
 ]
 
+# Maps the string form of a counter's `rollupType` to the short suffix that
+# `format_metric_name` appends to the metric name outside of compatibility mode.
+# NOTE(review): the keys look like the vSphere rollup type names - confirm
+# that this list covers every value the API can return.
+SHORT_ROLLUP = {
+    "average": "avg",
+    "summation": "sum",
+    "maximum": "max",
+    "minimum": "min",
+    "latest": "latest",
+    "none": "raw"
+}
+
 
 def trace_method(method):
     """
@@ -668,27 +677,59 @@ def _cache_metrics_metadata(self, instance):
 
         new_metadata = {}
         metric_ids = []
-        for counter in perfManager.perfCounter:
-            metric_name = "{}.{}".format(counter.groupInfo.key, counter.nameInfo.key)
-            new_metadata[counter.key] = {
-                'name': metric_name,
-                'unit': counter.unitInfo.key,
-            }
-            # Build the list of metrics we will want to collect
-            if instance.get("all_metrics") or metric_name in BASIC_METRICS:
+        # Use old behaviour with metrics to collect defined by our constants
+        if self.in_compatibility_mode(instance, log_warning=True):
+            for counter in perfManager.perfCounter:
+                metric_name = self.format_metric_name(counter, compatibility=True)
+                new_metadata[counter.key] = {
+                    'name': metric_name,
+                    'unit': counter.unitInfo.key,
+                }
+                # Build the list of metrics we will want to collect
+                if instance.get("all_metrics") or metric_name in BASIC_METRICS:
+                    metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))
+        else:
+            collection_level = instance.get("collection_level", 1)
+            for counter in perfManager.QueryPerfCounterByLevel(collection_level):
+                new_metadata[counter.key] = {
+                    "name": self.format_metric_name(counter),
+                    "unit": counter.unitInfo.key
+                }
+                # Build the list of metrics we will want to collect
                 metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))
 
-        self.cache_config.set_last(CacheConfig.Metadata, i_key, time.time())
-
         self.log.info("Finished metadata collection for instance {}".format(i_key))
         # Reset metadata
         self.metadata_cache.set_metadata(i_key, new_metadata)
         self.metadata_cache.set_metric_ids(i_key, metric_ids)
 
+        self.cache_config.set_last(CacheConfig.Metadata, i_key, time.time())
+
         # ## <TEST-INSTRUMENTATION>
         self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', t.total(), tags=custom_tags)
         # ## </TEST-INSTRUMENTATION>
 
+    def format_metric_name(self, counter, compatibility=False):
+        """Build the metric name for a performance counter.
+
+        :param counter: counter object exposing `groupInfo.key`, `nameInfo.key`
+            and (outside of compatibility mode) `rollupType`.
+        :param compatibility: when True, return the legacy two-part name
+            `<group>.<name>`; otherwise return `<group>.<name>.<rollup>` where
+            `<rollup>` is looked up in `SHORT_ROLLUP` using `str(counter.rollupType)`.
+        :return: the formatted metric name (without any `vsphere.` prefix).
+        :raises KeyError: if `str(counter.rollupType)` is not a key of `SHORT_ROLLUP`
+            (only possible when `compatibility` is False).
+        """
+        if compatibility:
+            return "{}.{}".format(counter.groupInfo.key, counter.nameInfo.key)
+        else:
+            return "{}.{}.{}".format(counter.groupInfo.key, counter.nameInfo.key, SHORT_ROLLUP[str(counter.rollupType)])
+
+    def in_compatibility_mode(self, instance, log_warning=False):
+        """Tell whether the instance should use the legacy `all_metrics` behaviour.
+
+        Compatibility mode is on only when `all_metrics` is present in the
+        instance configuration and `collection_level` is not. Presence is tested
+        with `is not None`, so an explicit `all_metrics: false` also enables it.
+
+        :param instance: instance configuration mapping (read with `.get`).
+        :param log_warning: when True, emit a warning describing the situation;
+            callers on hot paths leave it False to avoid repeating the message.
+        :return: True if the legacy metric selection must be used, False otherwise.
+        """
+        # Both flags set: `collection_level` wins and `all_metrics` is ignored.
+        if instance.get("all_metrics") is not None and instance.get("collection_level") is not None:
+            if log_warning:
+                self.log.warning("Using both `all_metrics` and `collection_level` configuration flag."
+                                 " `all_metrics` will be ignored.")
+            return False
+
+        # Only the legacy flag is set: keep the old behaviour.
+        if instance.get("all_metrics") is not None:
+            if log_warning:
+                # NOTE(review): this goes through `self.warning` while the branch above
+                # uses `self.log.warning` - confirm the difference is intentional.
+                self.warning("The configuration flag `all_metrics` will soon be deprecated. "
+                             "Consider using `collection_level` instead.")
+            return True
+
+        # Neither flag, or only `collection_level`: use the collection-level path.
+        return False
+
     def _transform_value(self, instance, counter_id, value):
         """ Given the counter_id, look up for the metrics metadata to check the vsphere
         type of the counter and apply pre-reporting transformation if needed.
@@ -725,6 +766,7 @@ def _collect_metrics_async(self, instance, query_specs):
                     self.log.error("Trying to get metrics from object %s deleted from the cache, skipping. "
                                    "Consider increasing the parameter `clean_morlist_interval` to avoid that", mor_name)
                     continue
+
                 for result in mor_perfs.value:
                     counter_id = result.id.counterId
                     if not self.metadata_cache.contains(i_key, counter_id):
@@ -736,12 +778,13 @@ def _collect_metrics_async(self, instance, query_specs):
                     # Metric types are absolute, delta, and rate
                     metric_name = self.metadata_cache.get_metadata(i_key, result.id.counterId).get('name')
 
-                    if metric_name not in ALL_METRICS:
-                        self.log.debug(u"Skipping unknown `%s` metric.", metric_name)
-                        continue
+                    if self.in_compatibility_mode(instance):
+                        if metric_name not in ALL_METRICS:
+                            self.log.debug("Skipping unknown `{}` metric.".format(metric_name))
+                            continue
 
                     if not result.value:
-                        self.log.debug(u"Skipping `%s` metric because the value is empty", metric_name)
+                        self.log.debug("Skipping `{}` metric because the value is empty".format(metric_name))
                         continue
 
                     instance_name = result.id.instance or "none"
@@ -757,7 +800,7 @@ def _collect_metrics_async(self, instance, query_specs):
                         "vsphere.{}".format(metric_name),
                         value,
                         hostname=mor['hostname'],
-                        tags=['instance:{}'.format(instance_name)] + custom_tags
+                        tags=tags + custom_tags
                     )
 
         # ## <TEST-INSTRUMENTATION>