fix(metricstream): set recommended filter by omission
Previously we were collecting all metrics by omission. This commit
instead loads a recommended set of filters from a file on disk. The file
is kept in sync with the filter we provide for cloudformation.

We could have loaded the filter dynamically directly from the hosted
URL, but that would be at odds with the idempotent nature of terraform
modules. Since we already have to update the binary versions of lambda
functions, we can piggyback on that process to mirror the file.
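The resulting behavior can be sketched with a hypothetical module invocation (the `source`, `name`, and `bucket_arn` values below are placeholders, not taken from this repository):

```hcl
# Hypothetical invocation; source, name, and bucket_arn are placeholders.
module "metricstream" {
  source     = "./modules/metricstream"
  name       = "example"
  bucket_arn = "arn:aws:s3:::example-bucket"

  # Leaving include_filters and exclude_filters unset (null) now applies
  # the recommended filter shipped in modules/metricstream/filters/.

  # To restore the previous behavior of streaming every metric, pass an
  # explicit empty list instead of omitting the variable:
  # exclude_filters = []
}
```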
jta committed May 22, 2024
1 parent c104ef7 commit 8322d13
Showing 8 changed files with 307 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/update-deps.yaml
@@ -41,6 +41,9 @@ jobs:
       - name: Update binaries
         run: make update-binaries
 
+      - name: Update filters
+        run: make update-filters
+
       # Use Peter Evans Pull Request Action to create a pull request
       - name: Create Pull Request
         uses: peter-evans/create-pull-request@v6
2 changes: 2 additions & 0 deletions Makefile
@@ -12,3 +12,5 @@ update-binaries-forwarder:

 update-binaries: update-binaries-logwriter update-binaries-forwarder
 
+update-filters:
+	utilities/update-filters.sh
4 changes: 2 additions & 2 deletions modules/metricstream/README.md
@@ -62,8 +62,8 @@ No modules.
| <a name="input_bucket_arn"></a> [bucket\_arn](#input\_bucket\_arn) | S3 Bucket ARN to write log records to. | `string` | n/a | yes |
| <a name="input_buffering_interval"></a> [buffering\_interval](#input\_buffering\_interval) | Buffer incoming data for the specified period of time, in seconds, before<br>delivering it to S3. | `number` | `60` | no |
| <a name="input_buffering_size"></a> [buffering\_size](#input\_buffering\_size) | Buffer incoming data to the specified size, in MiBs, before delivering it<br>to S3. | `number` | `1` | no |
-| <a name="input_exclude_filters"></a> [exclude\_filters](#input\_exclude\_filters) | List of exclusion filters. Mutually exclusive with inclusion filters | <pre>list(object({<br> namespace = string<br> metric_names = list(string)<br> }))</pre> | `[]` | no |
-| <a name="input_include_filters"></a> [include\_filters](#input\_include\_filters) | List of inclusion filters. | <pre>list(object({<br> namespace = string<br> metric_names = list(string)<br> }))</pre> | `[]` | no |
+| <a name="input_exclude_filters"></a> [exclude\_filters](#input\_exclude\_filters) | List of exclusion filters. Mutually exclusive with inclusion filters. | <pre>list(object({<br> namespace = string<br> metric_names = list(string)<br> }))</pre> | `null` | no |
+| <a name="input_include_filters"></a> [include\_filters](#input\_include\_filters) | List of inclusion filters. If neither include\_filters nor exclude\_filters is<br>set, a default filter will be used. | <pre>list(object({<br> namespace = string<br> metric_names = list(string)<br> }))</pre> | `null` | no |
| <a name="input_name"></a> [name](#input\_name) | Name for resources. | `string` | n/a | yes |
| <a name="input_output_format"></a> [output\_format](#input\_output\_format) | The output format for CloudWatch Metrics. | `string` | `"json"` | no |
| <a name="input_prefix"></a> [prefix](#input\_prefix) | Optional prefix to write log records to. | `string` | `""` | no |
268 changes: 268 additions & 0 deletions modules/metricstream/filters/recommended.yaml
@@ -0,0 +1,268 @@
ExcludeFilters:
- Namespace: AWS/RDS
# https://docs.datadoghq.com/integrations/amazon_rds/
MetricNames:
- AbortedClients
# - ActiveTransactions
# - AuroraBinlogReplicaLag
- AuroraDMLRejectedMasterFull
- AuroraDMLRejectedWriterFull
- AuroraEstimatedSharedMemoryBytes
# - AuroraReplicaLag
# - AuroraReplicaLagMaximum
# - AuroraReplicaLagMinimum
- AuroraSlowConnectionHandleCount
- AuroraSlowHandshakeCount
- AuroraVolumeBytesLeftTotal
- Aurora_pq_request_attempted
- Aurora_pq_request_executed
- Aurora_pq_request_failed
- Aurora_pq_request_in_progress
- Aurora_pq_request_not_chosen
- Aurora_pq_request_not_chosen_below_min_rows
- Aurora_pq_request_not_chosen_column_bit
- Aurora_pq_request_not_chosen_column_geometry
- Aurora_pq_request_not_chosen_column_lob
- Aurora_pq_request_not_chosen_column_virtual
- Aurora_pq_request_not_chosen_custom_charset
- Aurora_pq_request_not_chosen_fast_ddl
- Aurora_pq_request_not_chosen_few_pages_outside_buffer_pool
- Aurora_pq_request_not_chosen_full_text_index
- Aurora_pq_request_not_chosen_high_buffer_pool_pct
- Aurora_pq_request_not_chosen_index_hint
- Aurora_pq_request_not_chosen_innodb_table_format
- Aurora_pq_request_not_chosen_instant_ddl
- Aurora_pq_request_not_chosen_long_trx
- Aurora_pq_request_not_chosen_no_where_clause
- Aurora_pq_request_not_chosen_range_scan
- Aurora_pq_request_not_chosen_row_length_too_long
- Aurora_pq_request_not_chosen_small_table
- Aurora_pq_request_not_chosen_temporary_table
- Aurora_pq_request_not_chosen_tx_isolation
- Aurora_pq_request_not_chosen_unsupported_access
- Aurora_pq_request_not_chosen_unsupported_storage_type
- Aurora_pq_request_not_chosen_update_delete_stmts
- Aurora_pq_request_throttled
- AvailabilityPercentage
# - BackupRetentionPeriodStorageUsed
# - BinLogDiskUsage
# - BlockedTransactions
# - BufferCacheHitRatio
# - BurstBalance
- CheckpointLag
- ClientConnections
- ClientConnectionsClosed
- ClientConnectionsNoTLS
- ClientConnectionsReceived
- ClientConnectionsSetupSucceeded
- ClientConnectionsTLS
# - CommitLatency
# - CommitThroughput
- ConnectionAttempts
# - CommitThroughput
# - CPUCreditBalance
# - CPUCreditUsage
# - CPUSurplusCreditBalance
# - CPUSurplusCreditsCharged
# - CPUUtilization
# - DBLoad
# - DBLoadCPU
# - DBLoadNonCPU
# - DDLLatency
# - DDLThroughput
- DatabaseConnectionRequests
- DatabaseConnectionRequestsWithTLS
- DatabaseConnections
- DatabaseConnectionsBorrowLatency
- DatabaseConnectionsCurrentlyBorrowed
- DatabaseConnectionsCurrentlyInTransaction
- DatabaseConnectionsCurrentlySessionPinned
- DatabaseConnectionsSetupSucceeded
- DatabaseConnectionsWithTLS
# - Deadlocks
# - DeleteLatency
# - DeleteThroughput
# - DiskQueueDepth
# - DMLLatency
# - DMLThroughput
- EBSByteBalance%
- EBSIOBalance%
# - EngineUptime
- ForwardingMasterDMLLatency
- ForwardingMasterDMLThroughput
- ForwardingMasterOpenSessions
- ForwardingReplicaDMLLatency
- ForwardingReplicaDMLThroughput
- ForwardingReplicaOpenSessions
- ForwardingReplicaReadWaitLatency
- ForwardingReplicaReadWaitThroughput
- ForwardingReplicaSelectLatency
- ForwardingReplicaSelectThroughput
- ForwardingWriterDMLLatency
- ForwardingWriterDMLThroughput
- ForwardingWriterOpenSessions
# - FreeLocalStorage
# - FreeStorageSpace
# - FreeableMemory
# - InsertLatency
# - InsertThroughput
# - LoginFailures
- MaxDatabaseConnectionsAllowed
# - MaximumUsedTransactionIDs
# - NetworkReceiveThroughput
# - NetworkThroughput
# - NetworkTransmitThroughput
- NumBinaryLogFiles
# - OldestReplicationSlotLag
- PurgeBoundary
- PurgeFinishedPoint
# - Queries
- QueryDatabaseResponseLatency
- QueryRequests
- QueryRequestsTLS
- QueryResponseLatency
- RDSToAuroraPostgreSQLReplicaLag
# - ReadIOPS
# - ReadLatency
# - ReadThroughput
# - ReplicationSlotDiskUsage
# - ResultSetCacheHitRatio
- RollbackSegmentHistoryListLength
- RowLockTime
# - SelectLatency
# - SelectThroughput
- StorageNetworkReceiveThroughput
- StorageNetworkThroughput
- StorageNetworkTransmitThroughput
- SumBinaryLogSize
# - SwapUsage
# - TotalBackupStorageBilled
# - TransactionLogsDiskUsage
# - TransactionLogsGeneration
- TruncateFinishedPoint
# - UpdateLatency
# - UpdateThroughput
# - VolumeBytesUsed
# - VolumeReadIOPs
# - VolumeWriteIOPs
# - WriteIOPS
# - WriteLatency
# - WriteThroughput
- Namespace: AWS/ApplicationELB
# https://docs.datadoghq.com/integrations/amazon_elb/#metrics
MetricNames:
# - ActiveConnectionCount
- AnomalousHostCount
# - ClientTLSNegotiationErrorCount
# - ConsumedLCUs
- DesyncMitigationMode_NonCompliant_Request_Count
- ForwardedInvalidHeaderRequestCount
# - HealthyHostCount
- HealthyStateDNS
- HealthyStateRouting
- HTTPCode_ELB_3XX_Count
# - HTTPCode_ELB_4XX_Count
# - HTTPCode_ELB_502_Count
# - HTTPCode_ELB_503_Count
# - HTTPCode_ELB_504_Count
# - HTTPCode_ELB_5XX_Count
# - HTTPCode_Target_2XX_Count
# - HTTPCode_Target_3XX_Count
# - HTTPCode_Target_4XX_Count
# - HTTPCode_Target_5XX_Count
# - HTTP_Redirect_Count
- MitigatedHostCount
# - NewConnectionCount
# - ProcessedBytes
# - RequestCount
# - RequestCountPerTarget
# - TargetResponseTime
# - UnHealthyHostCount
- UnhealthyStateDNS
- UnhealthyStateRouting
- Namespace: AWS/AmazonMQ
# https://docs.datadoghq.com/integrations/amazon_mq/
MetricNames:
# - AckRate
# - ChannelCount
# - ConfirmRate
# - ConnectionCount
# - ConsumerCount
# - ExchangeCount
# - MessageCount
# - MessageReadyCount
# - MessageUnacknowledgedCount
# - PublishRate
# - QueueCount
# - RabbitMQDiskFree
# - RabbitMQDiskFreeLimit
# - RabbitMQFdUsed
- RabbitMQIOReadAverageTime
- RabbitMQIOWriteAverageTime
# - RabbitMQMemLimit
# - RabbitMQMemUsed
# - SystemCpuUtilization
- Namespace: AWS/ElastiCache
# https://docs.datadoghq.com/integrations/amazon_elasticache/
MetricNames:
# - ActiveDefragHits
- AuthenticationFailures
# - BytesUsedForCache
# - CacheHitRate
# - CacheHits
# - CacheMisses
- ChannelAuthorizationFailures
- CommandAuthorizationFailures
# - CPUCreditBalance
# - CPUCreditUsage
# - CPUUtilization
# - CurrConnections
# - CurrItems
- CurrVolatileItems
- DatabaseCapacityUsageCountedForEvictPercentage
- DatabaseCapacityUsagePercentage
- DatabaseMemoryUsageCountedForEvictPercentage
# - DatabaseMemoryUsagePercentage
# - DB0AverageTTL
# - EngineCPUUtilization
# - EvalBasedCmds
# - Evictions
# - FreeableMemory
# - GetTypeCmds
# - GetTypeCmdsLatency
# - HashBasedCmds
- IamAuthenticationExpirations
- IamAuthenticationThrottling
# - IsMaster
- KeyAuthorizationFailures
# - KeyBasedCmds
# - KeyBasedCmdsLatency
- KeysTracked
# - ListBasedCmds
- ListBasedCmdsLatency
# - MasterLinkHealthStatus
# - MemoryFragmentationRatio
- NetworkBandwidthInAllowanceExceeded
- NetworkBandwidthOutAllowanceExceeded
# - NetworkBytesIn
# - NetworkBytesOut
- NetworkConntrackAllowanceExceeded
- NetworkMaxBytesIn
- NetworkMaxBytesOut
- NetworkMaxPacketsIn
- NetworkMaxPacketsOut
# - NetworkPacketsIn
# - NetworkPacketsOut
- NetworkPacketsPerSecondAllowanceExceeded
# - Reclaimed
# - ReplicationBytes
# - ReplicationLag
# - SaveInProgress
# - SetTypeCmds
# - SetTypeCmdsLatency
# - SortedSetBasedCmds
# - SortedSetBasedCmdsLatency
# - StringBasedCmds
# - StringBasedCmdsLatency
# - SwapUsage
- TrafficManagementActive
12 changes: 7 additions & 5 deletions modules/metricstream/main.tf
@@ -1,9 +1,11 @@
 locals {
-  account_id  = data.aws_caller_identity.current.account_id
-  region      = data.aws_region.current.name
-  name_prefix = "${substr(var.name, 0, 37)}-"
-  include_filters = var.include_filters
-  exclude_filters = var.exclude_filters
+  account_id          = data.aws_caller_identity.current.account_id
+  region              = data.aws_region.current.name
+  name_prefix         = "${substr(var.name, 0, 37)}-"
+  recommended_filters = yamldecode(file("${path.module}/filters/recommended.yaml"))
+
+  include_filters = var.include_filters != null ? var.include_filters : [for v in lookup(local.recommended_filters, "IncludeFilters", []) : { namespace = v.Namespace, metric_names = v.MetricNames }]
+  exclude_filters = var.exclude_filters != null ? var.exclude_filters : [for v in lookup(local.recommended_filters, "ExcludeFilters", []) : { namespace = v.Namespace, metric_names = v.MetricNames }]
 }

data "aws_caller_identity" "current" {}
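As a sketch of how the new locals re-key the YAML (the AWS/RDS entry below is abridged from the recommended file; the evaluation shown is my reading of `yamldecode` and the `for` expressions, not output captured from this module):

```hcl
# filters/recommended.yaml (abridged):
#
#   ExcludeFilters:
#   - Namespace: AWS/RDS
#     MetricNames:
#     - AbortedClients
#
# yamldecode(file(...)) yields:
#
#   { ExcludeFilters = [{ Namespace = "AWS/RDS", MetricNames = ["AbortedClients"] }] }
#
# and the for expression lower-cases the keys to match the variable type:
#
#   exclude_filters = [{ namespace = "AWS/RDS", metric_names = ["AbortedClients"] }]
#
# lookup(local.recommended_filters, "IncludeFilters", []) falls back to an
# empty list when the file defines no IncludeFilters, so include_filters
# evaluates to [] unless the caller sets var.include_filters.
```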
7 changes: 7 additions & 0 deletions modules/metricstream/tests/metricstream.tftest.hcl
@@ -15,6 +15,13 @@ run "create_bucket" {
 }
 
 run "install" {
   variables {
     name       = run.setup.id
     bucket_arn = run.create_bucket.arn
+  }
+}
+
+run "update_filters" {
+  variables {
+    name       = run.setup.id
+    bucket_arn = run.create_bucket.arn
11 changes: 5 additions & 6 deletions modules/metricstream/variables.tf
@@ -40,26 +40,25 @@ variable "output_format" {

 variable "include_filters" {
   description = <<-EOF
-    List of inclusion filters.
+    List of inclusion filters. If neither include_filters nor exclude_filters is
+    set, a default filter will be used.
   EOF
   type = list(object({
     namespace    = string
     metric_names = list(string)
   }))
-  default  = []
-  nullable = false
+  default = null
 }
 
 variable "exclude_filters" {
   description = <<-EOF
-    List of exclusion filters. Mutually exclusive with inclusion filters
+    List of exclusion filters. Mutually exclusive with inclusion filters.
   EOF
   type = list(object({
     namespace    = string
     metric_names = list(string)
   }))
-  default  = []
-  nullable = false
+  default = null
 }
 
 variable "buffering_interval" {
13 changes: 13 additions & 0 deletions utilities/update-filters.sh
@@ -0,0 +1,13 @@
#!/bin/bash
set -euo pipefail

DIE() { echo "$*" 1>&2; exit 1; }

BUCKET=${BUCKET:-observeinc}

FILTERS=recommended.yaml

for FILTER in ${FILTERS}; do
    curl -sf "https://${BUCKET}.s3.amazonaws.com/cloudwatchmetrics/filters/${FILTER}" > "modules/metricstream/filters/${FILTER}" || DIE "failed to fetch ${FILTER}"
done
