From 8e302e17cee9435fcdff5d2367eabe64710f8fa8 Mon Sep 17 00:00:00 2001
From: David Barbarin <68589619+dba-leshop@users.noreply.github.com>
Date: Thu, 2 Jun 2022 17:04:14 +0200
Subject: [PATCH] feat(inputs.sqlserver): Update query store and latch
 performance counters (#11216)

---
 plugins/inputs/sqlserver/README.md           | 104 ++++++++++++++-----
 plugins/inputs/sqlserver/sqlserverqueries.go |  12 ++-
 2 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/plugins/inputs/sqlserver/README.md b/plugins/inputs/sqlserver/README.md
index 48c35bc2313ac..9cea80a5aee2c 100644
--- a/plugins/inputs/sqlserver/README.md
+++ b/plugins/inputs/sqlserver/README.md
@@ -1,7 +1,8 @@
 # SQL Server Input Plugin
 
-The `sqlserver` plugin provides metrics for your SQL Server instance. Recorded metrics are
-lightweight and use Dynamic Management Views supplied by SQL Server.
+The `sqlserver` plugin provides metrics for your SQL Server instance.
+Recorded metrics are lightweight and use Dynamic Management Views
+supplied by SQL Server.
 
 ## The SQL Server plugin supports the following editions/versions of SQL Server
 
@@ -14,7 +15,8 @@ lightweight and use Dynamic Management Views supplied by SQL Server.
 
 ## Additional Setup
 
-You have to create a login on every SQL Server instance or Azure SQL Managed instance you want to monitor, with following script:
+You have to create a login on every SQL Server instance or Azure SQL
+Managed instance you want to monitor, with the following script:
 
 ```sql
 USE master;
@@ -27,7 +29,8 @@ GRANT VIEW ANY DEFINITION TO [telegraf];
 GO
 ```
 
-For Azure SQL Database, you require the View Database State permission and can create a user with a password directly in the database.
+For Azure SQL Database, you require the View Database State permission
+and can create a user with a password directly in the database.
 
 ```sql
 CREATE USER [telegraf] WITH PASSWORD = N'mystrongpassword';
@@ -36,9 +39,11 @@ GRANT VIEW DATABASE STATE TO [telegraf];
 GO
 ```
 
-For Azure SQL Elastic Pool, please follow the following instructions to collect metrics.
+For Azure SQL Elastic Pool, please use the following instructions
+to collect metrics.
 
-On master logical database, create an SQL login 'telegraf' and assign it to the server-level role ##MS_ServerStateReader##.
+On master logical database, create an SQL login 'telegraf' and assign
+it to the server-level role ##MS_ServerStateReader##.
 
 ```sql
 CREATE LOGIN [telegraf] WITH PASSWORD = N'mystrongpassword';
@@ -48,19 +53,27 @@ ALTER SERVER ROLE ##MS_ServerStateReader##
 GO
 ```
 
-Elastic pool metrics can be collected from any database in the pool if a user for the `telegraf` login is created in that database. For collection to work, this database must remain in the pool, and must not be renamed. If you plan to add/remove databases from this pool, create a separate database for monitoring purposes that will remain in the pool.
+Elastic pool metrics can be collected from any database in the pool if a user
+for the `telegraf` login is created in that database. For collection to work,
+this database must remain in the pool, and must not be renamed. If you plan
+to add/remove databases from this pool, create a separate database for
+monitoring purposes that will remain in the pool.
 
-> Note: To avoid duplicate monitoring data, do not collect elastic pool metrics from more than one database in the same pool.
+> Note: To avoid duplicate monitoring data, do not collect elastic pool metrics
+from more than one database in the same pool.
 
 ```sql
 GO
 CREATE USER [telegraf] FOR LOGIN telegraf;
 ```
 
-For Service SID authentication to SQL Server (Windows service installations only).
-[More information about using service SIDs to grant permissions in SQL Server](https://docs.microsoft.com/en-us/sql/relational-databases/security/using-service-sids-to-grant-permissions-to-services-in-sql-server)
+For Service SID authentication to SQL Server (Windows service installations
+only).
 
-In an administrative command prompt configure the telegraf service for use with a service SID
+- [More information about using service SIDs to grant permissions in SQL Server](https://docs.microsoft.com/en-us/sql/relational-databases/security/using-service-sids-to-grant-permissions-to-services-in-sql-server)
+
+In an administrative command prompt configure the telegraf service for use
+with a service SID:
 
 ```Batchfile
 sc.exe sidtype "telegraf" unrestricted
@@ -79,7 +92,8 @@ GRANT VIEW ANY DEFINITION TO [NT SERVICE\telegraf];
 GO
 ```
 
-Remove User Id and Password keywords from the connection string in your config file to use windows authentication.
+Remove User Id and Password keywords from the connection string in your
+config file to use Windows authentication.
 
 ```toml
 [[inputs.sqlserver]]
@@ -232,11 +246,17 @@ Remove User Id and Password keywords from the connection string in your config f
 
 ## Support for Azure Active Directory (AAD) authentication using [Managed Identity](https://docs.microsoft.com/en-us/azure/active-directory/managed-identities-azure-resources/overview)
 
-Azure SQL Database supports 2 main methods of authentication: [SQL authentication and AAD authentication](https://docs.microsoft.com/en-us/azure/azure-sql/database/security-overview#authentication). The recommended practice is to [use AAD authentication when possible](https://docs.microsoft.com/en-us/azure/azure-sql/database/authentication-aad-overview).
+- Azure SQL Database supports 2 main methods of authentication: [SQL authentication and AAD authentication](https://docs.microsoft.com/en-us/azure/azure-sql/database/security-overview#authentication).
+- The recommended practice is to [use AAD authentication when possible](https://docs.microsoft.com/en-us/azure/azure-sql/database/authentication-aad-overview).
+
+AAD is a more modern authentication protocol, allows for easier
+credential/role management, and can eliminate the need to include passwords
+in a connection string.
 
-AAD is a more modern authentication protocol, allows for easier credential/role management, and can eliminate the need to include passwords in a connection string.
+To enable support for AAD authentication, we leverage the existing AAD
+authentication support in the SQL Server driver for Go.
 
-To enable support for AAD authentication, we leverage the existing AAD authentication support in the [SQL Server driver for Go](https://github.com/denisenkom/go-mssqldb#azure-active-directory-authentication---preview)
+- Please see [SQL Server driver for Go](https://github.com/denisenkom/go-mssqldb#azure-active-directory-authentication---preview)
 
 ### How to use AAD Auth with MSI
 
@@ -266,9 +286,14 @@ EXECUTE ('GRANT VIEW DATABASE STATE TO [<Monitoring_VM_Name>]')
 
 ## Metrics
 
-To provide backwards compatibility, this plugin support two versions of metrics queries.
+To provide backwards compatibility, this plugin supports two versions of
+metrics queries.
 
-**Note**: Version 2 queries are not backwards compatible with the old queries. Any dashboards or queries based on the old query format will not work with the new format. The version 2 queries only report raw metrics, no math has been done to calculate deltas. To graph this data you must calculate deltas in your dashboarding software.
+**Note**: Version 2 queries are not backwards compatible with the old queries.
+Any dashboards or queries based on the old query format will not work with
+the new format. The version 2 queries only report raw metrics, no math has
+been done to calculate deltas. To graph this data you must calculate deltas
+in your dashboarding software.
 
 ### Version 1 (query_version=1): This is Deprecated in 1.6, all future development will be under configuration option database_type
 
@@ -303,7 +328,7 @@ The new (version 2) metrics provide:
   - *Memory*: PLE, Page reads/sec, Page writes/sec, + more
   - *TempDB*: Free space, Version store usage, Active temp tables, temp table creation rate, + more
   - *Resource Governor*: CPU Usage, Requests/sec, Queued Requests, and Blocked tasks per workload group + more
-- *Server properties*: Number of databases in all possible states (online, offline, suspect, etc.), cpu count, physical memory, SQL Server service uptime, and SQL Server version. In the case of Azure SQL relevant properties such as Tier, #Vcores, Memory etc.
+- *Server properties*: Number of databases in all possible states (online, offline, suspect, etc.), cpu count, physical memory, SQL Server service uptime, SQL Server SPID, and SQL Server version. In the case of Azure SQL relevant properties such as Tier, #Vcores, Memory etc.
 - *Wait stats*: Wait time in ms, number of waiting tasks, resource wait time, signal wait time, max wait time in ms, wait type, and wait category. The waits are categorized using the same categories used in Query Store.
 - *Schedulers* - This captures `sys.dm_os_schedulers`.
 - *SqlRequests* - This captures a snapshot of `sys.dm_exec_requests` and `sys.dm_exec_sessions` that gives you running requests as well as wait types and
@@ -326,7 +351,9 @@ The new (version 2) metrics provide:
 
 ### database_type = "AzureSQLDB"
 
-These are metrics for Azure SQL Database (single database) and are very similar to version 2 but split out for maintenance reasons, better ability to test,differences in DMVs:
+These are metrics for Azure SQL Database (single database) and are very
+similar to version 2 but split out for maintenance reasons, better ability
+to test, differences in DMVs:
 
 - *AzureSQLDBDatabaseIO*: IO stats from `sys.dm_io_virtual_file_stats` including resource governance time, RBPEX, IO for Hyperscale.
 - *AzureSQLDBMemoryClerks*: Memory clerk breakdown from `sys.dm_os_memory_clerks`.
@@ -340,7 +367,9 @@ These are metrics for Azure SQL Database (single database) and are very similar
 
 ### database_type = "AzureSQLManagedInstance"
 
-These are metrics for Azure SQL Managed instance, are very similar to version 2 but split out for maintenance reasons, better ability to test, differences in DMVs:
+These are metrics for Azure SQL Managed instance and are very similar to version
+2 but split out for maintenance reasons, better ability to test, differences
+in DMVs:
 
 - *AzureSQLMIDatabaseIO*: IO stats from `sys.dm_io_virtual_file_stats` including resource governance time, RBPEX, IO for Hyperscale.
 - *AzureSQLMIMemoryClerks*: Memory clerk breakdown from `sys.dm_os_memory_clerks`.
@@ -353,7 +382,9 @@ These are metrics for Azure SQL Managed instance, are very similar to version 2
 
 ### database_type = "AzureSQLPool"
 
-These are metrics for Azure SQL to monitor resources usage at Elastic Pool level. These metrics require additional permissions to be collected, please ensure to check additional setup section in this documentation.
+These are metrics for Azure SQL to monitor resource usage at the Elastic Pool
+level. These metrics require additional permissions to be collected; please
+check the Additional Setup section of this documentation.
 
 - *AzureSQLPoolResourceStats*: Returns resource usage statistics for the current elastic pool in a SQL Database server. Queried from `sys.dm_resource_governor_resource_pools_history_ex`.
 - *AzureSQLPoolResourceGovernance*: Returns actual configuration and capacity settings used by resource governance mechanisms in the current elastic pool. Queried from `sys.dm_user_db_resource_governance`.
@@ -374,7 +405,7 @@ These are metrics for Azure SQL to monitor resources usage at Elastic Pool level
   - *Memory*: PLE, Page reads/sec, Page writes/sec, + more
   - *TempDB*: Free space, Version store usage, Active temp tables, temp table creation rate, + more
   - *Resource Governor*: CPU Usage, Requests/sec, Queued Requests, and Blocked tasks per workload group + more
-- *SQLServerProperties*: Number of databases in all possible states (online, offline, suspect, etc.), cpu count, physical memory, SQL Server service uptime, and SQL Server version. In the case of Azure SQL relevant properties such as Tier, #Vcores, Memory etc.
+- *SQLServerProperties*: Number of databases in all possible states (online, offline, suspect, etc.), cpu count, physical memory, SQL Server service uptime, SQL Server SPID, and SQL Server version. In the case of Azure SQL relevant properties such as Tier, #Vcores, Memory etc.
 - *SQLServerWaitStatsCategorized*: Wait time in ms, number of waiting tasks, resource wait time, signal wait time, max wait time in ms, wait type, and wait category. The waits are categorized using the same categories used in Query Store.
 - *SQLServerSchedulers*: This captures `sys.dm_os_schedulers`.
 - *SQLServerRequests*: This captures a snapshot of `sys.dm_exec_requests` and `sys.dm_exec_sessions` that gives you running requests as well as wait types and
@@ -387,7 +418,8 @@ These are metrics for Azure SQL to monitor resources usage at Elastic Pool level
 
 ### Output Measures
 
-The guiding principal is that all data collected from the same primary DMV ends up in the same measure irrespective of database_type.
+The guiding principle is that all data collected from the same primary DMV ends
+up in the same measure irrespective of database_type.
 
 - `sqlserver_database_io` - Used by  AzureSQLDBDatabaseIO, AzureSQLMIDatabaseIO, SQLServerDatabaseIO, DatabaseIO given the data is from `sys.dm_io_virtual_file_stats`
 - `sqlserver_waitstats` - Used by  WaitStatsCategorized,AzureSQLDBOsWaitstats,AzureSQLMIOsWaitstats
@@ -396,7 +428,8 @@ The guiding principal is that all data collected from the same primary DMV ends
 - `sqlserver_performance` - Used by  SQLServerPerformanceCounters, AzureSQLDBPerformanceCounters, AzureSQLMIPerformanceCounters,PerformanceCounters
 - `sys.dm_os_schedulers`  - Used by SQLServerSchedulers,AzureSQLDBServerSchedulers, AzureSQLMIServerSchedulers
 
-The following Performance counter metrics can be used directly, with no delta calculations:
+The following Performance counter metrics can be used directly, with no delta
+calculations:
 
 - SQLServer:Buffer Manager\Buffer cache hit ratio
 - SQLServer:Buffer Manager\Page life expectancy
@@ -440,9 +473,16 @@ Version 2 queries have the following tags:
 
 ### Health Metric
 
-All collection versions (version 1, version 2, and database_type) support an optional plugin health metric called `sqlserver_telegraf_health`. This metric tracks if connections to SQL Server are succeeding or failing. Users can leverage this metric to detect if their SQL Server monitoring is not working as intended.
+All collection versions (version 1, version 2, and database_type) support an
+optional plugin health metric called `sqlserver_telegraf_health`. This metric
+tracks if connections to SQL Server are succeeding or failing. Users can
+leverage this metric to detect if their SQL Server monitoring is not working
+as intended.
 
-In the configuration file, toggling `health_metric` to `true` will enable collection of this metric. By default, this value is set to `false` and the metric is not collected. The health metric emits one record for each connection specified by `servers` in the configuration file.
+In the configuration file, toggling `health_metric` to `true` will enable
+collection of this metric. By default, this value is set to `false` and
+the metric is not collected. The health metric emits one record for each
+connection specified by `servers` in the configuration file.
 
 The health metric emits the following tags:
 
@@ -455,6 +495,16 @@ The health metric emits the following fields:
 - `successful_queries` - Number of queries that completed successfully for this connection
 - `database_type` - Type of database as specified by `database_type`. If `database_type` is empty, the `QueryVersion` and `AzureDB` fields are concatenated instead
 
-If `attempted_queries` and `successful_queries` are not equal for a given connection, some metrics were not successfully gathered for that connection. If `successful_queries` is 0, no metrics were successfully gathered.
+If `attempted_queries` and `successful_queries` are not equal for
+a given connection, some metrics were not successfully gathered for
+that connection. If `successful_queries` is 0, no metrics were successfully
+gathered.
 
 [cardinality]: /docs/FAQ.md#user-content-q-how-can-i-manage-series-cardinality
+
+## Example Output
+
+```shell
+sqlserver_cpu_other_process_cpu{host="servername",measurement_db_type="SQLServer",sql_instance="SERVERNAME:INST"} 9
+sqlserver_performance{counter="Log File(s) Size (KB)",counter_type="65792",host="servername",instance="instance_name",measurement_db_type="SQLServer",object="MSSQL$INSTANCE_NAME:Databases",sql_instance="SERVERNAME:INSTANCE_NAME"} 1.048568e+06
+```
diff --git a/plugins/inputs/sqlserver/sqlserverqueries.go b/plugins/inputs/sqlserver/sqlserverqueries.go
index 135e94f7b21df..43ace7a09d341 100644
--- a/plugins/inputs/sqlserver/sqlserverqueries.go
+++ b/plugins/inputs/sqlserver/sqlserverqueries.go
@@ -170,7 +170,7 @@ SELECT
 	,REPLACE(@@SERVERNAME,''\'','':'') AS [sql_instance]
 	,DB_NAME(vfs.[database_id]) AS [database_name]
 	,COALESCE(mf.[physical_name],''RBPEX'') AS [physical_filename]	--RPBEX = Resilient Buffer Pool Extension
-	,COALESCE(mf.[name],''RBPEX'') AS [logical_filename]	--RPBEX = Resilient Buffer Pool Extension	
+	,COALESCE(mf.[name],''RBPEX'') AS [logical_filename]	--RPBEX = Resilient Buffer Pool Extension
 	,mf.[type_desc] AS [file_type]
 	,vfs.[io_stall_read_ms] AS [read_latency_ms]
 	,vfs.[num_of_reads] AS [reads]
@@ -333,6 +333,7 @@ SELECT DISTINCT
 			,'Logins/sec'
 			,'Processes blocked'
 			,'Latch Waits/sec'
+			,'Average Latch Wait Time (ms)'
 			,'Full Scans/sec'
 			,'Index Searches/sec'
 			,'Page Splits/sec'
@@ -414,6 +415,9 @@ SELECT DISTINCT
 			,'Distributed Query'
 			,'DTC calls'
 			,'Query Store CPU usage'
+			,'Query Store physical reads'
+			,'Query Store logical reads'
+			,'Query Store logical writes'
 		) OR (
 			spi.[object_name] LIKE '%User Settable%'
 			OR spi.[object_name] LIKE '%SQL Errors%'
@@ -1131,7 +1135,7 @@ END
 
 DECLARE
 	@MajorMinorVersion AS int = CAST(PARSENAME(CAST(SERVERPROPERTY('ProductVersion') AS nvarchar),4) AS int)*100 + CAST(PARSENAME(CAST(SERVERPROPERTY('ProductVersion') AS nvarchar),3) AS int)
-	
+
 IF @MajorMinorVersion >= 1050 BEGIN
 	SELECT DISTINCT
 		'sqlserver_volume_space' AS [measurement]
@@ -1344,7 +1348,7 @@ END;
 
 WITH MostRecentBackups AS
 (
-	SELECT 
+	SELECT
 		database_name AS [Database],
 		MAX(bus.backup_finish_date) AS LastBackupTime,
 		CASE bus.type
@@ -1364,7 +1368,7 @@ BackupsWithSize AS
 
 SELECT
 	'sqlserver_recentbackup' AS [measurement],
-	REPLACE(@@SERVERNAME,'\',':') AS [sql_instance], 
+	REPLACE(@@SERVERNAME,'\',':') AS [sql_instance],
 	d.name AS [database_name],
 	d.database_id as [database_id],
 	d.state_desc AS [state],