From 8b7c1ce274c1174b4ab8ca16d58fa5d4d7511c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Taveira=20Ara=C3=BAjo?= Date: Fri, 7 Jun 2024 14:00:02 -0700 Subject: [PATCH] fix(metricstream): lookup recommended filters (#169) Rather than hardcoding a single filter set into the repo, look it up by retrieving the SAM manifest. This allows us to roll back (and forwards) between SAM app releases, and reduces the scope for misconfiguration. The tradeoff is we are now dependent on an extra HTTP call to S3 on every plan / apply. --- Makefile | 11 - modules/metricstream/README.md | 7 +- modules/metricstream/filters/recommended.yaml | 268 ------------------ modules/metricstream/main.tf | 9 +- modules/metricstream/variables.tf | 7 + modules/metricstream/versions.tf | 2 +- modules/stack/README.md | 2 +- modules/stack/metricstream.tf | 9 +- modules/stack/variables.tf | 9 +- utilities/update-filters.sh | 13 - 10 files changed, 32 insertions(+), 305 deletions(-) delete mode 100644 modules/metricstream/filters/recommended.yaml delete mode 100755 utilities/update-filters.sh diff --git a/Makefile b/Makefile index faa796c..de70d0f 100644 --- a/Makefile +++ b/Makefile @@ -3,14 +3,3 @@ test-dir: terraform -chdir=${DIR} init -upgrade terraform -chdir=${DIR} test - -update-binaries-logwriter: - APP=logwriter RESOURCE=Subscriber utilities/update-binaries.sh > modules/subscriber/uris.csv - -update-binaries-forwarder: - APP=forwarder RESOURCE=Forwarder utilities/update-binaries.sh > modules/forwarder/uris.csv - -update-binaries: update-binaries-logwriter update-binaries-forwarder - -update-filters: utilities/update-filters.sh - diff --git a/modules/metricstream/README.md b/modules/metricstream/README.md index 2daef60..73ae5af 100644 --- a/modules/metricstream/README.md +++ b/modules/metricstream/README.md @@ -24,7 +24,7 @@ module "metric_stream" { | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.2 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [aws](#requirement\_aws) | >= 4.0 | ## Providers @@ -35,7 +35,9 @@ module "metric_stream" { ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [sam\_asset](#module\_sam\_asset) | ../sam_asset | n/a | ## Resources @@ -67,6 +69,7 @@ No modules. | [name](#input\_name) | Name for resources. | `string` | n/a | yes | | [output\_format](#input\_output\_format) | The output format for CloudWatch Metrics. | `string` | `"json"` | no | | [prefix](#input\_prefix) | Optional prefix to write log records to. | `string` | `""` | no | +| [sam\_release\_version](#input\_sam\_release\_version) | Release version for SAM apps as defined on github.com/observeinc/aws-sam-apps. 
| `string` | `""` | no | ## Outputs diff --git a/modules/metricstream/filters/recommended.yaml b/modules/metricstream/filters/recommended.yaml deleted file mode 100644 index fe986a9..0000000 --- a/modules/metricstream/filters/recommended.yaml +++ /dev/null @@ -1,268 +0,0 @@ -ExcludeFilters: - - Namespace: AWS/RDS - # https://docs.datadoghq.com/integrations/amazon_rds/ - MetricNames: - - AbortedClients - # - ActiveTransactions - # - AuroraBinlogReplicaLag - - AuroraDMLRejectedMasterFull - - AuroraDMLRejectedWriterFull - - AuroraEstimatedSharedMemoryBytes - # - AuroraReplicaLag - # - AuroraReplicaLagMaximum - # - AuroraReplicaLagMinimum - - AuroraSlowConnectionHandleCount - - AuroraSlowHandshakeCount - - AuroraVolumeBytesLeftTotal - - Aurora_pq_request_attempted - - Aurora_pq_request_executed - - Aurora_pq_request_failed - - Aurora_pq_request_in_progress - - Aurora_pq_request_not_chosen - - Aurora_pq_request_not_chosen_below_min_rows - - Aurora_pq_request_not_chosen_column_bit - - Aurora_pq_request_not_chosen_column_geometry - - Aurora_pq_request_not_chosen_column_lob - - Aurora_pq_request_not_chosen_column_virtual - - Aurora_pq_request_not_chosen_custom_charset - - Aurora_pq_request_not_chosen_fast_ddl - - Aurora_pq_request_not_chosen_few_pages_outside_buffer_pool - - Aurora_pq_request_not_chosen_full_text_index - - Aurora_pq_request_not_chosen_high_buffer_pool_pct - - Aurora_pq_request_not_chosen_index_hint - - Aurora_pq_request_not_chosen_innodb_table_format - - Aurora_pq_request_not_chosen_instant_ddl - - Aurora_pq_request_not_chosen_long_trx - - Aurora_pq_request_not_chosen_no_where_clause - - Aurora_pq_request_not_chosen_range_scan - - Aurora_pq_request_not_chosen_row_length_too_long - - Aurora_pq_request_not_chosen_small_table - - Aurora_pq_request_not_chosen_temporary_table - - Aurora_pq_request_not_chosen_tx_isolation - - Aurora_pq_request_not_chosen_unsupported_access - - Aurora_pq_request_not_chosen_unsupported_storage_type - - Aurora_pq_request_not_chosen_update_delete_stmts - - Aurora_pq_request_throttled - - AvailabilityPercentage - # - BackupRetentionPeriodStorageUsed - # - BinLogDiskUsage - # - BlockedTransactions - # - BufferCacheHitRatio - # - BurstBalance - - CheckpointLag - - ClientConnections - - ClientConnectionsClosed - - ClientConnectionsNoTLS - - ClientConnectionsReceived - - ClientConnectionsSetupSucceeded - - ClientConnectionsTLS - # - CommitLatency - # - CommitThroughput - - ConnectionAttempts - # - CommitThroughput - # - CPUCreditBalance - # - CPUCreditUsage - # - CPUSurplusCreditBalance - # - CPUSurplusCreditsCharged - # - CPUUtilization - # - DBLoad - # - DBLoadCPU - # - DBLoadNonCPU - # - DDLLatency - # - DDLThroughput - - DatabaseConnectionRequests - - DatabaseConnectionRequestsWithTLS - - DatabaseConnections - - DatabaseConnectionsBorrowLatency - - DatabaseConnectionsCurrentlyBorrowed - - DatabaseConnectionsCurrentlyInTransaction - - DatabaseConnectionsCurrentlySessionPinned - - DatabaseConnectionsSetupSucceeded - - DatabaseConnectionsWithTLS - # - Deadlocks - # - DeleteLatency - # - DeleteThroughput - # - DiskQueueDepth - # - DMLLatency - # - DMLThroughput - - EBSByteBalance% - - EBSIOBalance% - # - EngineUptime - - ForwardingMasterDMLLatency - - ForwardingMasterDMLThroughput - - ForwardingMasterOpenSessions - - ForwardingReplicaDMLLatency - - ForwardingReplicaDMLThroughput - - ForwardingReplicaOpenSessions - - ForwardingReplicaReadWaitLatency - - ForwardingReplicaReadWaitThroughput - - ForwardingReplicaSelectLatency - - ForwardingReplicaSelectThroughput - 
- ForwardingWriterDMLLatency - - ForwardingWriterDMLThroughput - - ForwardingWriterOpenSessions - # - FreeLocalStorage - # - FreeStorageSpace - # - FreeableMemory - # - InsertLatency - # - InsertThroughput - # - LoginFailures - - MaxDatabaseConnectionsAllowed - # - MaximumUsedTransactionIDs - # - NetworkReceiveThroughput - # - NetworkThroughput - # - NetworkTransmitThroughput - - NumBinaryLogFiles - # - OldestReplicationSlotLag - - PurgeBoundary - - PurgeFinishedPoint - # - Queries - - QueryDatabaseResponseLatency - - QueryRequests - - QueryRequestsTLS - - QueryResponseLatency - - RDSToAuroraPostgreSQLReplicaLag - # - ReadIOPS - # - ReadLatency - # - ReadThroughput - # - ReplicationSlotDiskUsage - # - ResultSetCacheHitRatio - - RollbackSegmentHistoryListLength - - RowLockTime - # - SelectLatency - # - SelectThroughput - - StorageNetworkReceiveThroughput - - StorageNetworkThroughput - - StorageNetworkTransmitThroughput - - SumBinaryLogSize - # - SwapUsage - # - TotalBackupStorageBilled - # - TransactionLogsDiskUsage - # - TransactionLogsGeneration - - TruncateFinishedPoint - # - UpdateLatency - # - UpdateThroughput - # - VolumeBytesUsed - # - VolumeReadIOPs - # - VolumeWriteIOPs - # - WriteIOPS - # - WriteLatency - # - WriteThroughput - - Namespace: AWS/ApplicationELB - # https://docs.datadoghq.com/integrations/amazon_elb/#metrics - MetricNames: - # - ActiveConnectionCount - - AnomalousHostCount - # - ClientTLSNegotiationErrorCount - # - ConsumedLCUs - - DesyncMitigationMode_NonCompliant_Request_Count - - ForwardedInvalidHeaderRequestCount - # - HealthyHostCount - - HealthyStateDNS - - HealthyStateRouting - - HTTPCode_ELB_3XX_Count - # - HTTPCode_ELB_4XX_Count - # - HTTPCode_ELB_502_Count - # - HTTPCode_ELB_503_Count - # - HTTPCode_ELB_504_Count - # - HTTPCode_ELB_5XX_Count - # - HTTPCode_Target_2XX_Count - # - HTTPCode_Target_3XX_Count - # - HTTPCode_Target_4XX_Count - # - HTTPCode_Target_5XX_Count - # - HTTP_Redirect_Count - - MitigatedHostCount - # - NewConnectionCount - # - ProcessedBytes - # - RequestCount - # - RequestCountPerTarget - # - TargetResponseTime - # - UnHealthyHostCount - - UnhealthyStateDNS - - UnhealthyStateRouting - - Namespace: AWS/AmazonMQ - # https://docs.datadoghq.com/integrations/amazon_mq/ - MetricNames: - # - AckRate - # - ChannelCount - # - ConfirmRate - # - ConnectionCount - # - ConsumerCount - # - ExchangeCount - # - MessageCount - # - MessageReadyCount - # - MessageUnacknowledgedCount - # - PublishRate - # - QueueCount - # - RabbitMQDiskFree - # - RabbitMQDiskFreeLimit - # - RabbitMQFdUsed - - RabbitMQIOReadAverageTime - - RabbitMQIOWriteAverageTime - # - RabbitMQMemLimit - # - RabbitMQMemUsed - # - SystemCpuUtilization - - Namespace: AWS/ElastiCache - # https://docs.datadoghq.com/integrations/amazon_elasticache/ - MetricNames: - # - ActiveDefragHits - - AuthenticationFailures - # - BytesUsedForCache - # - CacheHitRate - # - CacheHits - # - CacheMisses - - ChannelAuthorizationFailures - - CommandAuthorizationFailures - # - CPUCreditBalance - # - CPUCreditUsage - # - CPUUtilization - # - CurrConnections - # - CurrItems - - CurrVolatileItems - - DatabaseCapacityUsageCountedForEvictPercentage - - DatabaseCapacityUsagePercentage - - DatabaseMemoryUsageCountedForEvictPercentage - # - DatabaseMemoryUsagePercentage - # - DB0AverageTTL - # - EngineCPUUtilization - # - EvalBasedCmds - # - Evictions - # - FreeableMemory - # - GetTypeCmds - # - GetTypeCmdsLatency - # - HashBasedCmds - - IamAuthenticationExpirations - - IamAuthenticationThrottling - # - IsMaster - - 
KeyAuthorizationFailures - # - KeyBasedCmds - # - KeyBasedCmdsLatency - - KeysTracked - # - ListBasedCmds - - ListBasedCmdsLatency - # - MasterLinkHealthStatus - # - MemoryFragmentationRatio - - NetworkBandwidthInAllowanceExceeded - - NetworkBandwidthOutAllowanceExceeded - # - NetworkBytesIn - # - NetworkBytesOut - - NetworkConntrackAllowanceExceeded - - NetworkMaxBytesIn - - NetworkMaxBytesOut - - NetworkMaxPacketsIn - - NetworkMaxPacketsOut - # - NetworkPacketsIn - # - NetworkPacketsOut - - NetworkPacketsPerSecondAllowanceExceeded - # - Reclaimed - # - ReplicationBytes - # - ReplicationLag - # - SaveInProgress - # - SetTypeCmds - # - SetTypeCmdsLatency - # - SortedSetBasedCmds - # - SortedSetBasedCmdsLatency - # - StringBasedCmds - # - StringBasedCmdsLatency - # - SwapUsage - - TrafficManagementActive diff --git a/modules/metricstream/main.tf b/modules/metricstream/main.tf index ae74574..5a1dbe6 100644 --- a/modules/metricstream/main.tf +++ b/modules/metricstream/main.tf @@ -2,8 +2,8 @@ locals { account_id = data.aws_caller_identity.current.account_id region = data.aws_region.current.name name_prefix = "${substr(var.name, 0, 37)}-" - recommended = yamldecode(file("${path.module}/filters/recommended.yaml")) use_recommended = var.include_filters == null && var.exclude_filters == null + recommended = local.use_recommended ? yamldecode(module.sam_asset[0].body) : null filter = local.use_recommended ? { # must convert from cloudformation CamelCase to terraform snake_case when falling back to recommended filter include_filters = try([for v in local.recommended["IncludeFilters"] : { namespace = v.Namespace, metric_names = v.MetricNames }], []) @@ -17,3 +17,10 @@ locals { data "aws_caller_identity" "current" {} data "aws_region" "current" {} + +module "sam_asset" { + count = local.use_recommended ? 1 : 0 + source = "../sam_asset" + asset = "cloudwatchmetrics/filters/recommended.yaml" + release_version = var.sam_release_version +} diff --git a/modules/metricstream/variables.tf b/modules/metricstream/variables.tf index e37aa2c..1af2933 100644 --- a/modules/metricstream/variables.tf +++ b/modules/metricstream/variables.tf @@ -80,3 +80,10 @@ variable "buffering_size" { nullable = false default = 1 } + +variable "sam_release_version" { + description = "Release version for SAM apps as defined on github.com/observeinc/aws-sam-apps." + type = string + default = "" + nullable = false +} diff --git a/modules/metricstream/versions.tf b/modules/metricstream/versions.tf index a6e62b8..4c505f8 100644 --- a/modules/metricstream/versions.tf +++ b/modules/metricstream/versions.tf @@ -1,5 +1,5 @@ terraform { - required_version = ">= 1.2" + required_version = ">= 1.3" required_providers { aws = { source = "hashicorp/aws" diff --git a/modules/stack/README.md b/modules/stack/README.md index 22c22f8..de7623e 100644 --- a/modules/stack/README.md +++ b/modules/stack/README.md @@ -172,7 +172,7 @@ You can additionally configure other submodules in this manner: | [destination](#input\_destination) | Destination filedrop |
object({
  arn = optional(string, "")
  bucket = optional(string, "")
  prefix = optional(string, "")
  # exclusively for backward compatible HTTP endpoint
  uri = optional(string, "")
}) | n/a | yes |
| [forwarder](#input\_forwarder) | Variables for forwarder module. | object({
  source_bucket_names = optional(list(string), [])
  source_topic_arns = optional(list(string), [])
  content_type_overrides = optional(list(object({ pattern = string, content_type = string })), [])
  max_file_size = optional(number)
  lambda_memory_size = optional(number)
  lambda_timeout = optional(number)
  lambda_env_vars = optional(map(string))
  retention_in_days = optional(number)
  queue_max_receive_count = optional(number)
  queue_delay_seconds = optional(number)
  queue_message_retention_seconds = optional(number)
  queue_batch_size = optional(number)
  queue_maximum_batching_window_in_seconds = optional(number)
  code_uri = optional(string)
  sam_release_version = optional(string)
}) | `{}` | no |
| [logwriter](#input\_logwriter) | Variables for AWS CloudWatch Logs collection. | object({
  log_group_name_patterns = optional(list(string))
  log_group_name_prefixes = optional(list(string))
  exclude_log_group_name_prefixes = optional(list(string))
  buffering_interval = optional(number)
  buffering_size = optional(number)
  filter_name = optional(string)
  filter_pattern = optional(string)
  num_workers = optional(number)
  discovery_rate = optional(string, "24 hours")
  lambda_memory_size = optional(number)
  lambda_timeout = optional(number)
  code_uri = optional(string)
  sam_release_version = optional(string)
}) | `null` | no |
-| [metricstream](#input\_metricstream) | Variables for AWS CloudWatch Metrics Stream collection. | object({
  include_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) })))
  exclude_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) })))
  buffering_interval = optional(number)
  buffering_size = optional(number)
}) | `null` | no |
+| [metricstream](#input\_metricstream) | Variables for AWS CloudWatch Metrics Stream collection. | object({
  include_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) })))
  exclude_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) })))
  buffering_interval = optional(number)
  buffering_size = optional(number)
  sam_release_version = optional(string)
}) | `null` | no |
| [name](#input\_name) | Name of role. Since this name must be unique within the
account, it will be reused for most of the resources created by this
module. | `string` | n/a | yes | | [s3\_bucket\_lifecycle\_expiration](#input\_s3\_bucket\_lifecycle\_expiration) | Expiration in days for S3 objects in collection bucket | `number` | `4` | no | | [sam\_release\_version](#input\_sam\_release\_version) | Release version for SAM apps as defined on github.com/observeinc/aws-sam-apps. | `string` | `null` | no | diff --git a/modules/stack/metricstream.tf b/modules/stack/metricstream.tf index 81cd99e..e54bf3d 100644 --- a/modules/stack/metricstream.tf +++ b/modules/stack/metricstream.tf @@ -5,8 +5,9 @@ module "metricstream" { name = "${var.name}-metricstream" bucket_arn = aws_s3_bucket.this.arn - include_filters = var.metricstream.include_filters - exclude_filters = var.metricstream.exclude_filters - buffering_interval = var.metricstream.buffering_interval - buffering_size = var.metricstream.buffering_size + include_filters = var.metricstream.include_filters + exclude_filters = var.metricstream.exclude_filters + buffering_interval = var.metricstream.buffering_interval + buffering_size = var.metricstream.buffering_size + sam_release_version = try(coalesce(var.metricstream.sam_release_version, var.sam_release_version), null) } diff --git a/modules/stack/variables.tf b/modules/stack/variables.tf index 0a01c5e..b22cf1d 100644 --- a/modules/stack/variables.tf +++ b/modules/stack/variables.tf @@ -99,10 +99,11 @@ variable "metricstream" { Variables for AWS CloudWatch Metrics Stream collection. EOF type = object({ - include_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) }))) - exclude_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) }))) - buffering_interval = optional(number) - buffering_size = optional(number) + include_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) }))) + exclude_filters = optional(list(object({ namespace = string, metric_names = optional(list(string)) }))) + buffering_interval = optional(number) + buffering_size = optional(number) + sam_release_version = optional(string) }) default = null } diff --git a/utilities/update-filters.sh b/utilities/update-filters.sh deleted file mode 100755 index e8232b3..0000000 --- a/utilities/update-filters.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -euo pipefail - -DIE() { echo "$*" 1>&2; exit 1; } - -BUCKET=${BUCKET:-observeinc} - -FILTERS=recommended.yaml - -for FILTER in ${FILTERS}; do \ - curl -s https://${BUCKET}.s3.amazonaws.com/cloudwatchmetrics/filters/${FILTER} > modules/metricstream/filters/${FILTER} -done; -
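A minimal usage sketch of the new `sam_release_version` plumbing follows; the module source, release tags, and destination value are illustrative placeholders rather than values taken from this repository:

```hcl
module "collection" {
  # Illustrative source; adjust to however you reference the stack module.
  source = "./modules/stack"

  name = "observe-collection"

  # All destination attributes are optional strings; this URI is a placeholder.
  destination = {
    uri = "https://example.com/v1/http"
  }

  # Stack-wide default release for SAM-backed submodules (placeholder tag).
  sam_release_version = "v1.2.3"

  metricstream = {
    # Takes precedence over the stack-wide value for the recommended-filter
    # lookup; when unset, metricstream.tf falls back to var.sam_release_version.
    sam_release_version = "v1.2.4"

    # With include_filters and exclude_filters left unset, the metricstream
    # module fetches cloudwatchmetrics/filters/recommended.yaml for the
    # resolved release via the sam_asset module.
  }
}
```

Because the recommended filter set is now resolved from the SAM asset bucket rather than a vendored file, the fallback path costs one extra S3 GET on every plan and apply.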