Collect mysql statement samples & execution plans #8629

Merged · 26 commits · Mar 6, 2021

Commits
22eb694  Collect mysql statement samples & execution plans (djova, Feb 15, 2021)
a798846  style (djova, Mar 3, 2021)
3b3955f  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
330fb15  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
d4e95d7  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
f53d1f6  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
ab13a8e  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
19632de  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
db07b89  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
085e8d1  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
e05269a  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
d123996  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
e190b65  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
130c199  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
7c56f38  Update spec (djova, Mar 3, 2021)
3c1522e  support cancel (djova, Mar 3, 2021)
3223a70  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
b7babd8  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
0e3657a  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
b9ea084  Update mysql/assets/configuration/spec.yaml (djova, Mar 3, 2021)
4136105  Conf example update from spec (djova, Mar 4, 2021)
eac38ff  fix client race condition in tests (djova, Mar 4, 2021)
cbec27f  remove unnecessary (djova, Mar 4, 2021)
d8e1937  style (djova, Mar 4, 2021)
4e144ca  use json dep (djova, Mar 5, 2021)
37cd220  bump base (djova, Mar 5, 2021)
mysql/assets/configuration/spec.yaml (104 additions, 1 deletion)

@@ -261,7 +261,110 @@ files:
time: [100, 0]
<METRIC_COLUMN>: [<TOP_K_LIMIT>, <BOTTOM_K_LIMIT>]
display_default: false

- name: statement_samples
description: Configure collection of statement samples
options:
- name: enabled
description: |
Enables collection of statement samples. Requires `deep_database_monitoring: true`.
value:
type: boolean
example: false
- name: collections_per_second
description: |
Sets the maximum statement sample collection rate. Each collection involves a single query to one
of the `performance_schema.events_statements_*` tables, followed by at most one `EXPLAIN` query per
unique statement seen.
value:
type: number
example: 1
- name: explained_statements_per_hour_per_query
description: |
Sets the rate limit for how many execution plans will be collected per hour per normalized statement.
value:
type: integer
example: 60
- name: samples_per_hour_per_query
description: |
Sets the rate limit for how many statement sample events will be ingested per hour per normalized
execution plan.
value:
type: integer
example: 15
- name: explained_statements_cache_maxsize
description: |
Sets the max size of the cache used for the `explained_statements_per_hour_per_query` rate limit.
This should be increased for databases where the number of unique normalized queries exceeds the
cache's size.
value:
type: integer
example: 5000
- name: seen_samples_cache_maxsize
description: |
Sets the max size of the cache used for the `samples_per_hour_per_query` rate limit. This should be
increased for databases where the number of unique normalized execution plans exceeds the
cache's size.
value:
type: integer
example: 10000
- name: events_statements_row_limit
description: |
Sets the maximum number of rows to read out from a `performance_schema.events_statements_*` table during
a single collection.
value:
type: integer
example: 5000
- name: events_statements_table
description: |
Forces a specific events statements table. Must be one of `events_statements_current`,
`events_statements_history`, or `events_statements_history_long`. If not set, the agent chooses
the best available table that is enabled and not empty.
value:
type: string
example: events_statements_history_long
display_default: null
- name: explain_procedure
description: |
Overrides the default procedure used for collecting execution plans. The agent will use this
procedure in each schema where it exists: `{schema}.explain_statement({statement})`.
value:
type: string
example: explain_statement
- name: fully_qualified_explain_procedure
description: |
Overrides the default fully qualified explain procedure used for collecting execution plans for
statements sent from connections that do not have a default schema configured.
value:
type: string
example: datadog.explain_statement
- name: events_statements_enable_procedure
description: |
Overrides the default procedure used for enabling events statements consumers.
value:
type: string
example: datadog.enable_events_statements_consumers
- name: events_statements_temp_table_name
description: |
Overrides the default fully qualified name for the temp table the agent creates while collecting
samples.
value:
type: string
example: datadog.temp_events
- name: collection_strategy_cache_maxsize
description: |
Sets the max size of the cache used for caching collection strategies. This value should be increased
to at least the number of unique schemas that are being monitored.
value:
type: integer
example: 1000
- name: collection_strategy_cache_ttl
description: |
Sets how long to cache collection strategies. This should only be decreased if the set of enabled
`events_statements_*` tables changes frequently enough to cause stale strategies to return empty
results for an extended period of time.
value:
type: integer
example: 300
- template: instances/default

- template: logs
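
A minimal sketch of how these options could be resolved against their documented defaults, in the same style as the `statement_samples` handling in config.py below (the `DEFAULTS` table and `resolve_statement_samples_config` helper are illustrative names, not the integration's actual code):

```python
# Illustrative only: resolve the spec's documented defaults from an instance
# config. A missing or null `statement_samples` block behaves like `{}`.
DEFAULTS = {
    'enabled': False,
    'collections_per_second': 1,
    'explained_statements_per_hour_per_query': 60,
    'samples_per_hour_per_query': 15,
    'explained_statements_cache_maxsize': 5000,
    'seen_samples_cache_maxsize': 10000,
    'events_statements_row_limit': 5000,
    'collection_strategy_cache_maxsize': 1000,
    'collection_strategy_cache_ttl': 300,
}


def resolve_statement_samples_config(instance):
    raw = instance.get('statement_samples', {}) or {}
    return {key: raw.get(key, default) for key, default in DEFAULTS.items()}


# Only overrides need to appear in the instance config:
resolved = resolve_statement_samples_config({'statement_samples': {'enabled': True}})
assert resolved['enabled'] is True and resolved['samples_per_hour_per_query'] == 15
```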
mysql/datadog_checks/mysql/config.py (2 additions)

@@ -30,6 +30,8 @@ def __init__(self, instance):
self.charset = instance.get('charset')
self.deep_database_monitoring = is_affirmative(instance.get('deep_database_monitoring', False))
self.statement_metrics_limits = instance.get('statement_metrics_limits', None)
self.statement_samples_config = instance.get('statement_samples', {}) or {}
self.min_collection_interval = instance.get('min_collection_interval', 15)
self.configuration_checks()

def _build_tags(self, custom_tags):
mysql/datadog_checks/mysql/data/conf.yaml.example (90 additions)

@@ -251,6 +251,96 @@ instances:
# - <TOP_K_LIMIT>
# - <BOTTOM_K_LIMIT>

## Configure collection of statement samples
#
statement_samples:

## @param enabled - boolean - optional - default: false
## Enables collection of statement samples. Requires `deep_database_monitoring: true`.
#
# enabled: false

## @param collections_per_second - number - optional - default: 1
## Sets the maximum statement sample collection rate. Each collection involves a single query to one
## of the `performance_schema.events_statements_*` tables, followed by at most one `EXPLAIN` query per
## unique statement seen.
#
# collections_per_second: 1

## @param explained_statements_per_hour_per_query - integer - optional - default: 60
## Sets the rate limit for how many execution plans will be collected per hour per normalized statement.
#
# explained_statements_per_hour_per_query: 60

## @param samples_per_hour_per_query - integer - optional - default: 15
## Sets the rate limit for how many statement sample events will be ingested per hour per normalized
## execution plan.
#
# samples_per_hour_per_query: 15

## @param explained_statements_cache_maxsize - integer - optional - default: 5000
## Sets the max size of the cache used for the `explained_statements_per_hour_per_query` rate limit.
## This should be increased for databases where the number of unique normalized queries exceeds the
## cache's size.
#
# explained_statements_cache_maxsize: 5000

## @param seen_samples_cache_maxsize - integer - optional - default: 10000
## Sets the max size of the cache used for the `samples_per_hour_per_query` rate limit. This should be
## increased for databases where the number of unique normalized execution plans exceeds the
## cache's size.
#
# seen_samples_cache_maxsize: 10000

## @param events_statements_row_limit - integer - optional - default: 5000
## Sets the maximum number of rows to read out from a `performance_schema.events_statements_*` table during
## a single collection.
#
# events_statements_row_limit: 5000

## @param events_statements_table - string - optional
## Forces a specific events statements table. Must be one of `events_statements_current`,
## `events_statements_history`, or `events_statements_history_long`. If not set, the agent chooses
## the best available table that is enabled and not empty.
#
# events_statements_table: events_statements_history_long

## @param explain_procedure - string - optional - default: explain_statement
## Overrides the default procedure used for collecting execution plans. The agent will use this
## procedure in each schema where it exists: `{schema}.explain_statement({statement})`.
#
# explain_procedure: explain_statement

## @param fully_qualified_explain_procedure - string - optional - default: datadog.explain_statement
## Overrides the default fully qualified explain procedure used for collecting execution plans for
## statements sent from connections that do not have a default schema configured.
#
# fully_qualified_explain_procedure: datadog.explain_statement

## @param events_statements_enable_procedure - string - optional - default: datadog.enable_events_statements_consumers
## Overrides the default procedure used for enabling events statements consumers.
#
# events_statements_enable_procedure: datadog.enable_events_statements_consumers

## @param events_statements_temp_table_name - string - optional - default: datadog.temp_events
## Overrides the default fully qualified name for the temp table the agent creates while collecting
## samples.
#
# events_statements_temp_table_name: datadog.temp_events

## @param collection_strategy_cache_maxsize - integer - optional - default: 1000
## Sets the max size of the cache used for caching collection strategies. This value should be increased
## to at least the number of unique schemas that are being monitored.
#
# collection_strategy_cache_maxsize: 1000

## @param collection_strategy_cache_ttl - integer - optional - default: 300
## Sets how long to cache collection strategies. This should only be decreased if the set of enabled
## `events_statements_*` tables changes frequently enough to cause stale strategies to return empty
## results for an extended period of time.
#
# collection_strategy_cache_ttl: 300

## @param tags - list of strings - optional
## A list of tags to attach to every metric and service check emitted by this instance.
##
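
The `*_per_hour_per_query` limits and their companion `*_cache_maxsize` options work as a pair: the rate-limit counters live in a bounded cache, so the cache needs one slot per unique normalized query or plan. A minimal sketch of that mechanism, assuming the `cachetools` package (the `RateLimitingCache` class is illustrative, not the integration's actual implementation):

```python
from cachetools import TTLCache


class RateLimitingCache:
    """Admits at most `limit_per_hour` events per key per hour (approximate)."""

    def __init__(self, maxsize, limit_per_hour):
        # Bounded like `seen_samples_cache_maxsize`: when full, expired entries
        # are evicted first, then the least recently used, losing their counters.
        self._cache = TTLCache(maxsize=maxsize, ttl=60 * 60)
        self._limit = limit_per_hour

    def acquire(self, key):
        # True if an event for `key` may still be ingested this hour.
        count = self._cache.get(key, 0)
        if count >= self._limit:
            return False
        self._cache[key] = count + 1  # note: each write also refreshes the TTL
        return True


limiter = RateLimitingCache(maxsize=10000, limit_per_hour=15)
admitted = sum(limiter.acquire('SELECT * FROM users WHERE id = ?') for _ in range(20))
assert admitted == 15  # the 5 samples over the limit are dropped
```

This is also why the descriptions above recommend raising the max sizes for databases with many unique normalized queries or plans: an undersized cache evicts live counters, and evicted keys are no longer rate limited.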
mysql/datadog_checks/mysql/mysql.py (7 additions, 1 deletion)

@@ -45,6 +45,7 @@
SQL_WORKER_THREADS,
show_replica_status_query,
)
from .statement_samples import MySQLStatementSamples
from .statements import MySQLStatementMetrics
from .version_utils import get_version

@@ -76,10 +77,11 @@ def __init__(self, name, init_config, instances):
self._conn = None

self._query_manager = QueryManager(self, self.execute_query_raw, queries=[], tags=self._config.tags)
self._statement_metrics = MySQLStatementMetrics(self._config)
self.check_initializations.append(self._query_manager.compile_queries)
self.innodb_stats = InnoDBMetrics()
self.check_initializations.append(self._config.configuration_checks)
self._statement_metrics = MySQLStatementMetrics(self._config)
self._statement_samples = MySQLStatementSamples(self, self._config, self._get_connection_args())

def execute_query_raw(self, query):
with closing(self._conn.cursor(pymysql.cursors.SSCursor)) as cursor:
@@ -111,6 +113,7 @@ def check(self, _):
self._collect_system_metrics(self._config.host, db, self._config.tags)
if self._config.deep_database_monitoring:
self._collect_statement_metrics(db, self._config.tags)
self._statement_samples.run_sampler(self.service_check_tags)

# keeping track of these:
self._put_qcache_stats()
@@ -124,6 +127,9 @@ def check(self, _):
finally:
self._conn = None

def cancel(self):
self._statement_samples.cancel()
djova (Contributor, Author) commented on Mar 3, 2021:

@olivielpeau @hush-hush I added support for cancel here. I think we should still keep the "stop the thread if it notices the check is no longer running" logic as a fallback in case the main check stops running for any other reason (agent overload, or a bug somewhere that gets it stuck).

Member replied:

OK, LGTM.

I don't have a strong opinion about the fallback. Just a couple of thoughts: if the check is stuck, it'll cause other problems (it'll hog a check runner, etc.), but the fallback would at least free up some resources, that's true. If the check is inactive because of an agent overload and the fallback mechanism stops the background thread, you'll want to make sure the background thread can be started up again cleanly when the check is run again.

djova (Contributor, Author) replied:

> you'll want to make sure the background thread can be started up again cleanly when the check is run again.

Yes, that is the current behavior: on every check run, the check starts the thread if it is not currently running.
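
For readers following the thread, a minimal sketch of the lifecycle being discussed, with assumed names (`Sampler`, `stale_after`); this models the pattern, not the PR's actual implementation:

```python
import threading
import time


class Sampler:
    def __init__(self, run_sample, stale_after=60):
        self._run_sample = run_sample      # one collection pass
        self._stale_after = stale_after    # fallback: stop if the check goes quiet
        self._cancel = threading.Event()
        self._last_check_run = 0.0
        self._thread = None

    def run_sampler(self):
        # Called from every check run: record liveness and lazily (re)start
        # the background thread if it is not currently running.
        self._last_check_run = time.time()
        if self._thread is None or not self._thread.is_alive():
            self._cancel.clear()
            self._thread = threading.Thread(target=self._loop, daemon=True)
            self._thread.start()

    def cancel(self):
        # Explicit stop, wired to the check's cancel() hook.
        self._cancel.set()

    def _loop(self):
        while not self._cancel.is_set():
            if time.time() - self._last_check_run > self._stale_after:
                return  # fallback: the check stopped running, so stop sampling
            self._run_sample()
            self._cancel.wait(1)  # pace collections; wakes early on cancel()
```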


def _set_qcache_stats(self):
host_key = self._get_host_key()
qcache_st = self.qcache_stats.get(host_key, (None, None, None))