Route selected CloudWatch log configs to additional log groups.

Log configs may carry a "log_group_key"; write_cloudwatch_agent_json.py keeps
such a config only when that key is present in --additional-log-groups
(KEY=GROUP pairs, comma separated) and then uses the mapped group name.

Review fix: the recipe passed the key "job-info" while the JSON config declares
"job_info", so the clustermgtd_nodemap stream was always filtered out. The
recipe now passes "job_info". The Python script is intentionally untouched
because cloudwatch_spec.rb pins its sha256sum.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; confirm context whitespace against the
target tree before applying. The index hash of _cloudwatch_common.rb predates
the review fix.

diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json
index cc727fb6c7..c0fd6b5688 100644
--- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json
+++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/cloudwatch_agent_config.json
@@ -193,6 +193,25 @@
       ],
       "feature_conditions": []
     },
+    {
+      "timestamp_format_key": "json",
+      "file_path": "/var/log/parallelcluster/clustermgtd.nodemap",
+      "log_stream_name": "clustermgtd_nodemap",
+      "log_group_key": "job_info",
+      "schedulers": [
+        "slurm"
+      ],
+      "platforms": [
+        "amazon",
+        "centos",
+        "redhat",
+        "ubuntu"
+      ],
+      "node_roles": [
+        "HeadNode"
+      ],
+      "feature_conditions": []
+    },
     {
       "timestamp_format_key": "json",
       "file_path": "/var/log/parallelcluster/slurm_resume.events",
diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py
index 6e1067a517..82aa79b159 100644
--- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py
+++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py
@@ -30,6 +30,17 @@ def parse_args():
         help="Role this node plays in the cluster " "(i.e., is it a compute node or the head node?)",
     )
     parser.add_argument("--scheduler", required=True, choices=["slurm", "awsbatch"], help="Scheduler")
+    parser.add_argument(
+        "--additional-log-groups",
+        required=False,
+        help="format: =,=,=,...",
+    )
+    parser.add_argument(
+        "--output-path",
+        required=False,
+        default=AWS_CLOUDWATCH_CFG_PATH,
+        help="Overwrite the default output path",
+    )
 
     return parser.parse_args()
 
@@ -38,15 +49,17 @@ def gethostname():
     return socket.gethostname().split(".")[0]
 
 
-def write_config(config):
-    """Write config to AWS_CLOUDWATCH_CFG_PATH."""
-    with open(AWS_CLOUDWATCH_CFG_PATH, "w+", encoding="utf-8") as output_config_file:
+def write_config(config, output_file):
+    """Write config to output_file."""
+    with open(output_file, "w+", encoding="utf-8") as output_config_file:
         json.dump(config, output_config_file, indent=4)
 
 
-def add_log_group_name_params(log_group_name, configs):
+def add_log_group_name_params(default_log_group_name, log_group_name_map, configs):
     """Add a "log_group_name": log_group_name to every config."""
     for config in configs:
+        group_key = config.get("log_group_key", None)
+        log_group_name = log_group_name_map.get(group_key, default_log_group_name)
         config.update({"log_group_name": log_group_name})
     return configs
 
@@ -111,12 +124,23 @@ def select_configs_for_feature(configs):
     return selected_configs
 
 
-def select_logs(configs, args):
+def select_configs_for_log_groups(configs, log_group_map):
+    selected_configs = []
+    for config in configs:
+        condition = config.get("log_group_key", None)
+        if condition and condition not in log_group_map.keys():
+            continue
+        selected_configs.append(config)
+    return selected_configs
+
+
+def select_logs(configs, log_group_map, args):
     """Select the appropriate set of log configs."""
     selected_configs = select_configs_for_scheduler(configs, args.scheduler)
     selected_configs = select_configs_for_node_role(selected_configs, args.node_role)
     selected_configs = select_configs_for_platform(selected_configs, args.platform)
     selected_configs = select_configs_for_feature(selected_configs)
+    selected_configs = select_configs_for_log_groups(selected_configs, log_group_map)
     return selected_configs
 
 
@@ -215,19 +239,31 @@ def get_dict_value(value, attributes, default=None):
     return value
 
 
+def parse_additional_log_groups_map(log_group_string):
+    if not log_group_string:
+        return {}
+    pairs = log_group_string.split(",")
+    log_group_map = {}
+    for pair in pairs:
+        pair_split = pair.split("=")
+        log_group_map[pair_split[0]] = pair_split[1]
+    return log_group_map
+
+
 def main():
     """Create cloudwatch agent config file."""
     args = parse_args()
     config_data = read_data(args.config)
-    log_configs = select_logs(config_data["log_configs"], args)
+    log_group_map = parse_additional_log_groups_map(args.additional_log_groups)
+    log_configs = select_logs(config_data["log_configs"], log_group_map, args)
     log_configs = add_timestamps(log_configs, config_data["timestamp_formats"])
-    log_configs = add_log_group_name_params(args.log_group, log_configs)
+    log_configs = add_log_group_name_params(args.log_group, log_group_map, log_configs)
     log_configs = add_instance_log_stream_prefixes(log_configs)
     log_configs = filter_output_fields(log_configs)
     metric_configs = select_metrics(config_data["metric_configs"], args)
     metric_configs = add_append_dimensions(metric_configs, config_data["metric_configs"])
     metric_configs = add_aggregation_dimensions(metric_configs, config_data["metric_configs"])
-    write_config(create_config(log_configs, metric_configs))
+    write_config(create_config(log_configs, metric_configs), args.output_path)
 
 
 if __name__ == "__main__":
diff --git a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb
index 8eb15383e4..715733ba52 100644
--- a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb
+++ b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb
@@ -152,7 +152,8 @@ def package_path
 
       command "#{cookbook_virtualenv_path}/bin/python #{config_script_path} "\
         "--platform #{node['platform']} --config $CONFIG_DATA_PATH --log-group $LOG_GROUP_NAME "\
-        "--scheduler $SCHEDULER --node-role $NODE_ROLE"
+        "--scheduler $SCHEDULER --node-role $NODE_ROLE "\
+        "--additional-log-groups job_info=$LOG_GROUP_NAME"
     end unless redhat_on_docker?
 
     execute "cloudwatch-agent-start" do
diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb
index a4e19b76a3..4a51660ac6 100644
--- a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb
+++ b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb
@@ -40,7 +40,7 @@
 
   describe file('/usr/local/bin/write_cloudwatch_agent_json.py') do
     it { should exist }
-    its('sha256sum') { should eq '4fc20cdb5f3e08f23192842bcf96a8e44bf269bf8d1b6121d225ecc999db433e' }
+    its('sha256sum') { should eq '8c1afffab3b0e5d0fe7b00c0c733a7f4f1fad533e93b71fa5ac66d4b0e32cb38' }
     its('owner') { should eq 'root' }
     its('group') { should eq 'root' }
     its('mode') { should cmp '0755' }