Skip to content

Commit

Permalink
feat(observability): adds a monitor for watching GH action jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
todd-hagler committed Nov 17, 2022
1 parent 9ab46fc commit fcaf44a
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 0 deletions.
3 changes: 3 additions & 0 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,9 @@ module "runners" {
ami_id_ssm_parameter_name = var.ami_id_ssm_parameter_name

sqs_build_queue = aws_sqs_queue.queued_builds
sqs_workflow_job_queue = length(aws_sqs_queue.webhook_events_workflow_job_queue) > 0 ? aws_sqs_queue.webhook_events_workflow_job_queue[0] : null
sqs_workflow_job_queue_name = "${var.prefix}-webhook_events_workflow_job_queue"
pending_job_timeout_mins = var.pending_job_timeout_mins
github_app_parameters = local.github_app_parameters
enable_organization_runners = var.enable_organization_runners
enable_ephemeral_runners = var.enable_ephemeral_runners
Expand Down
19 changes: 19 additions & 0 deletions modules/runners/runner-monitor.tf
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,22 @@ resource "aws_iam_role_policy" "monitor_ami_id_ssm_parameter_read" {
}
JSON
}

# Scheduled CloudWatch Events rule for the runner monitor.
# The schedule itself is supplied by var.runner_monitor_chron.
# NOTE(review): the rule name is a fixed literal; if this module is ever
# instantiated more than once in the same account/region the names would
# collide -- confirm whether a prefix should be applied.
resource "aws_cloudwatch_event_rule" "run_monitor" {
  schedule_expression = var.runner_monitor_chron

  name        = "run-monitor"
  description = "Check for jobs in orgs with no runners"
}

# Attaches the runner_monitor Lambda function as the target of the
# run_monitor schedule rule.
resource "aws_cloudwatch_event_target" "run_monitor" {
  arn  = aws_lambda_function.runner_monitor.arn
  rule = aws_cloudwatch_event_rule.run_monitor.name
}

# Grants the events.amazonaws.com service principal permission to invoke the
# runner_monitor Lambda. source_arn restricts the grant to invocations that
# originate from the run_monitor rule.
resource "aws_lambda_permission" "allow_cloudwatch_to_lambda" {
  function_name = aws_lambda_function.runner_monitor.function_name
  action        = "lambda:InvokeFunction"
  statement_id  = "AllowExecutionFromCloudWatch"

  principal  = "events.amazonaws.com"
  source_arn = aws_cloudwatch_event_rule.run_monitor.arn
}
6 changes: 6 additions & 0 deletions modules/runners/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -609,3 +609,9 @@ variable "pending_job_timeout_mins" {
type = number
default = 10
}

# Schedule expression consumed by aws_cloudwatch_event_rule.run_monitor.
# The default uses the six-field AWS cron form with 0/10 in the minutes
# field; day-of-week is "?" because day-of-month is "*".
variable "runner_monitor_chron" {
  type        = string
  default     = "cron(0/10 * * * ? *)"
  description = "Chron schedule for excecuting runner monitor"
}
12 changes: 12 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -766,3 +766,15 @@ variable "monitor_lambda_zip" {
type = string
default = null
}

# Timeout, in minutes, for pending jobs; passed through to the runners module
# as pending_job_timeout_mins.
variable "pending_job_timeout_mins" {
  type        = number
  default     = 10
  description = "The number of minutes to wait, before considering a job to be in a que, that is not going to be picked up by runners. Alerts will only be triggered if there are no runners available to the org, so long queues should not trigger this."
}

# Schedule expression for the runner monitor.
#
# AWS schedule cron expressions have six fields:
#   cron(minutes hours day-of-month month day-of-week year)
# Day-of-month and day-of-week cannot both be specified; when one of them is
# a value or "*", the other must be "?". The previous default
# "cron(10 * * * * *)" had "*" in both and is rejected by AWS. It also used
# minute "10" (once per hour at :10) rather than a 10-minute interval.
#
# The default now matches the runners module's own default (every 10 minutes).
# NOTE(review): confirm this variable is actually forwarded to the runners
# module; the forwarding is not visible in this part of main.tf.
variable "runner_monitor_chron" {
  type        = string
  description = "Chron schedule for excecuting runner monitor"
  default     = "cron(0/10 * * * ? *)"
}

0 comments on commit fcaf44a

Please sign in to comment.