diff --git a/src/kepler_model/cmd/cmd_util.py b/src/kepler_model/cmd/cmd_util.py
index 0e89ef7f..39662ebe 100644
--- a/src/kepler_model/cmd/cmd_util.py
+++ b/src/kepler_model/cmd/cmd_util.py
@@ -7,6 +7,7 @@
 from kepler_model.util.prom_types import (
     SOURCE_COL,
     energy_component_to_query,
+    vm_energy_component_to_query,
     node_info_column,
     prom_responses_to_results,
 )
@@ -99,7 +100,7 @@ def summary_validation(validate_df):
         print("{} data: \t{}".format(metric, target_df[">0"].values))
 
 
-def get_validate_df(data_path, benchmark_filename, query_response):
+def get_validate_df(data_path, benchmark_filename, query_response, use_vm_metrics=False):
     items = []
     query_results = prom_responses_to_results(query_response)
     container_queries = [query for query in query_results.keys() if "container" in query]
@@ -175,10 +176,14 @@ def get_validate_df(data_path, benchmark_filename, query_response):
             item["total"] = filtered_df[query].max()
             items += [item]
     energy_queries = [query for query in query_results.keys() if "_joules" in query]
-    print("Energy Queries: ", container_queries)
+    print("Energy Queries: ", energy_queries)
     for energy_source, energy_components in PowerSourceMap.items():
         for component in energy_components:
             query = energy_component_to_query(component)
+            if use_vm_metrics:
+                query = vm_energy_component_to_query(component)
+            else:
+                query = energy_component_to_query(component)
             df = query_results[query]
             df = df[df[SOURCE_COL] == energy_source]
             if len(df) == 0:
diff --git a/src/kepler_model/cmd/main.py b/src/kepler_model/cmd/main.py
index 11065dd7..0f5968fc 100644
--- a/src/kepler_model/cmd/main.py
+++ b/src/kepler_model/cmd/main.py
@@ -156,7 +156,7 @@ def query(args):
     if args.to_csv:
         save_query_results(data_path, args.output, response)
     # try validation if applicable
-    validate_df = get_validate_df(data_path, benchmark_filename, response)
+    validate_df = get_validate_df(data_path, benchmark_filename, response, use_vm_metrics=args.vm_train)
     summary_validation(validate_df)
     save_csv(path=data_path, name=args.output + "_validate_result", data=validate_df)
 
@@ -487,7 +487,9 @@ def train(args):
         print("cannot get pipeline")
         exit()
     for energy_source in energy_sources:
+        print("energy source: ", energy_source)
         energy_components = PowerSourceMap[energy_source]
+        print("energy components: ", energy_components)
         for feature_group in valid_feature_groups:
             success, abs_data, dyn_data = pipeline.process_multiple_query(
                 input_query_results_list, energy_components, energy_source, feature_group=feature_group.name, replace_node_type=node_type
diff --git a/src/kepler_model/train/extractor/extractor.py b/src/kepler_model/train/extractor/extractor.py
index 2fa72a7e..ddcfe0cc 100644
--- a/src/kepler_model/train/extractor/extractor.py
+++ b/src/kepler_model/train/extractor/extractor.py
@@ -168,7 +168,6 @@ def get_workload_feature_data(self, query_results, features, use_vm_metrics=Fals
                 print("no data in ", query)
                 return None
             aggr_query_data = query_results[query].copy()
-
             if all(col in aggr_query_data.columns for col in cols_to_use):
                 if use_vm_metrics:
                     aggr_query_data = aggr_query_data.loc[aggr_query_data["job"] == VM_JOB_NAME]
@@ -256,9 +255,9 @@ def get_power_data(self, query_results, energy_components, source, use_vm_metric
                 return None
             aggr_query_data = query_results[query].copy()
             if not use_vm_metrics:
-                aggr_query_data = aggr_query_data.loc[aggr_query_data["job"] != VM_JOB_NAME]
-            # filter source
-            aggr_query_data = aggr_query_data[aggr_query_data[SOURCE_COL] == source]
+                # aggr_query_data = aggr_query_data.loc[aggr_query_data["job"] != VM_JOB_NAME]
+                # filter source
+                aggr_query_data = aggr_query_data[aggr_query_data[SOURCE_COL] == source]
             if len(aggr_query_data) == 0:
                 return None
             if unit_col is not None: