-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add heuristics using stage spill metrics to skip apps (#1002)
* Add heuristics using stage spill metrics to skip apps Signed-off-by: Partho Sarthi <[email protected]> * Remove disk spill metrics Signed-off-by: Partho Sarthi <[email protected]> * Add skip reason Signed-off-by: Partho Sarthi <[email protected]> * Revert "Add skip reason" This reverts commit b774958. * Generate skip reason to intermediate output directory Signed-off-by: Partho Sarthi <[email protected]> * Change delimiter to semi colon and update reason column name Signed-off-by: Partho Sarthi <[email protected]> * Add function to convert size to human-readable format Signed-off-by: Partho Sarthi <[email protected]> --------- Signed-off-by: Partho Sarthi <[email protected]>
- Loading branch information
Showing
5 changed files
with
220 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
128 changes: 128 additions & 0 deletions
128
user_tools/src/spark_rapids_tools/tools/additional_heuristics.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# Copyright (c) 2024, NVIDIA CORPORATION. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
"""Implementation class for Additional Heuristics logic.""" | ||
|
||
import os | ||
import re | ||
from dataclasses import dataclass, field | ||
from logging import Logger | ||
|
||
import pandas as pd | ||
|
||
from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer | ||
from spark_rapids_pytools.common.utilities import ToolLogging | ||
from spark_rapids_tools.tools.model_xgboost import find_paths, RegexPattern | ||
from spark_rapids_tools.utils import Utilities | ||
|
||
|
||
@dataclass
class AdditionalHeuristics:
    """
    Encapsulates the logic to apply additional heuristics to skip applications.

    The heuristics read per-application CSV reports from the first RAPIDS profile
    directory found under `tools_output_dir`. The full result (including the reason)
    is written to `output_file`; the result without the reason column is merged
    into the applications DataFrame handed to `apply_heuristics`.
    """
    # Logger named after this class; created in __init__.
    logger: Logger = field(default=None, init=False)
    # Heuristics configuration (result columns, report file names, thresholds).
    props: 'JSONPropertiesContainer' = field(default=None, init=False)
    # Directory searched for the RAPIDS profile output.
    tools_output_dir: str = field(default=None, init=False)
    # CSV file that receives the heuristics result, including the reason column.
    output_file: str = field(default=None, init=False)

    def __init__(self, props: dict, tools_output_dir: str, output_file: str):
        """
        :param props: heuristics configuration as a dictionary.
        :param tools_output_dir: directory that contains the RAPIDS profile output.
        :param output_file: path of the CSV file the heuristics result is written to.
        """
        self.props = JSONPropertiesContainer(props, file_load=False)
        self.tools_output_dir = tools_output_dir
        self.output_file = output_file
        self.logger = ToolLogging.get_and_setup_logger(f'rapids.tools.{self.__class__.__name__}')

    def _apply_heuristics(self, app_ids: list) -> pd.DataFrame:
        """
        Apply additional heuristics to applications to determine if they can be accelerated on GPU.

        :param app_ids: application IDs to evaluate; each one is expected to be a
                        sub-directory of the profile directory.
        :return: DataFrame with the columns configured under 'resultCols', one row per
                 evaluated application. Empty when no profile directory is found, the
                 profile directory is empty, or no application IDs are given.
        """
        result_cols = self.props.get_value('resultCols')
        profile_list = find_paths(
            self.tools_output_dir,
            RegexPattern.rapids_profile.match,
            return_directories=True,
        )
        if len(profile_list) == 0:
            self.logger.warning('No RAPIDS profiles found in output directory: %s', self.tools_output_dir)
            return pd.DataFrame(columns=result_cols)

        # Only the first matching profile directory is used.
        profile_path = profile_list[0]
        result_arr = []
        # `len()` rather than truthiness: `app_ids` may be a numpy array (see `apply_heuristics`).
        if not os.listdir(profile_path) or len(app_ids) == 0:
            self.logger.warning('Skipping empty profile: %s', profile_path)
        else:
            for app_id in app_ids:
                app_id_path = os.path.join(profile_path, app_id)
                try:
                    # Apply heuristics and determine if the application should be skipped.
                    # Note: `should_skip` flag can be a combination of multiple heuristic checks.
                    should_skip, reason = self.heuristics_based_on_spills(app_id_path)
                except Exception as e:  # pylint: disable=broad-except
                    # Best effort: a failure for one app must not skip it or stop the others.
                    should_skip = False
                    reason = f'Cannot apply heuristics for qualification. Reason - {type(e).__name__}:{e}'
                    self.logger.error(reason)
                result_arr.append([app_id, should_skip, reason])

        return pd.DataFrame(result_arr, columns=result_cols)

    def _load_report_columns(self, app_id_path: str, report_key: str) -> pd.DataFrame:
        """
        Read the CSV report configured under ('spillBased', `report_key`) from the
        application directory and keep only the configured columns.
        """
        file_name = self.props.get_value('spillBased', report_key, 'fileName')
        columns = self.props.get_value('spillBased', report_key, 'columns')
        return pd.read_csv(os.path.join(app_id_path, file_name))[columns]

    @staticmethod
    def _find_disallowed_spill_stages(job_stage_agg_metrics: pd.DataFrame,
                                      sql_to_stage_info: pd.DataFrame,
                                      spill_threshold_bytes,
                                      allowed_execs) -> list:
        """
        Return the IDs of stages whose spill exceeds the threshold and is attributed
        to at least one SQL entry without an allowed Exec.

        :param job_stage_agg_metrics: frame with columns 'ID' and 'memoryBytesSpilled_sum'.
        :param sql_to_stage_info: frame with columns 'stageId' and 'SQL Nodes(IDs)'.
        :param spill_threshold_bytes: a stage counts only if it spilled more than this.
        :param allowed_execs: Exec names for which spills are tolerated.
        :return: unique stage IDs in order of first appearance; empty list if none.
        """
        # `na=False` keeps the mask boolean when 'ID' has missing values.
        is_stage = job_stage_agg_metrics['ID'].str.startswith('stage', na=False)
        exceeds_threshold = job_stage_agg_metrics['memoryBytesSpilled_sum'] > spill_threshold_bytes
        stages_with_spills = job_stage_agg_metrics[is_stage & exceeds_threshold].copy()
        stages_with_spills['stageId'] = (
            stages_with_spills['ID'].str.extract(r'(\d+)', expand=False).astype(int)
        )

        # Merge stages with spills with SQL-to-stage information
        merged_df = pd.merge(stages_with_spills, sql_to_stage_info, on='stageId', how='inner')

        # Note: Column 'SQL Nodes(IDs)' contains the Exec names. Non-string values
        # (e.g. missing entries) are treated as "not allowed".
        allowed_re = re.compile('|'.join(map(re.escape, allowed_execs)))
        caused_by_allowed = pd.Series(
            [isinstance(nodes, str) and bool(allowed_re.search(nodes))
             for nodes in merged_df['SQL Nodes(IDs)']],
            index=merged_df.index,
            dtype=bool,
        )
        # The merge yields one row per matching sql-to-stage entry, so a stage can
        # appear several times; report each stage only once.
        return merged_df.loc[~caused_by_allowed, 'stageId'].drop_duplicates().tolist()

    def heuristics_based_on_spills(self, app_id_path: str) -> (bool, str):
        """
        Apply heuristics based on spills to determine if the app can be accelerated on GPU.

        :param app_id_path: directory that holds the application's profile reports.
        :return: `(should_skip, reason)`. `reason` is an empty string when the
                 application is not skipped.
        """
        # Load stage aggregation metrics (this contains spill information)
        job_stage_agg_metrics = self._load_report_columns(app_id_path, 'jobStageAggMetrics')
        # Load sql-to-stage information (this contains Exec names)
        sql_to_stage_info = self._load_report_columns(app_id_path, 'sqlToStageInfo')

        spill_threshold_bytes = self.props.get_value('spillBased', 'spillThresholdBytes')
        allowed_execs = self.props.get_value('spillBased', 'allowedExecs')

        # Stages with significant spills caused by Execs other than the allowed ones
        stage_ids = self._find_disallowed_spill_stages(
            job_stage_agg_metrics, sql_to_stage_info, spill_threshold_bytes, allowed_execs)
        if not stage_ids:
            return False, ''

        # If there are any stages with spills caused by non-allowed Execs, skip the application
        stages_str = '; '.join(str(stage_id) for stage_id in stage_ids)
        spill_threshold_human_readable = Utilities.bytes_to_human_readable(spill_threshold_bytes)
        reason = f'Skipping due to spills in stages [{stages_str}] exceeding {spill_threshold_human_readable}'
        return True, reason

    def apply_heuristics(self, all_apps: pd.DataFrame) -> pd.DataFrame:
        """
        Evaluate the heuristics for every application in `all_apps` and merge the result.

        The full result is written to `self.output_file`. The 'Reason' column is
        dropped before the remaining columns are left-joined on 'App ID'.

        :param all_apps: applications DataFrame; must contain an 'App ID' column.
        :return: `all_apps` with the heuristics columns merged in, or `all_apps`
                 unchanged if any step fails (the error is logged, not raised).
        """
        try:
            heuristics_df = self._apply_heuristics(all_apps['App ID'].unique())
            # Save the heuristics results to a file and drop the reason column
            heuristics_df.to_csv(self.output_file, index=False)
            all_apps = pd.merge(all_apps, heuristics_df.drop(columns=['Reason']), on=['App ID'], how='left')
        except Exception as e:  # pylint: disable=broad-except
            self.logger.error('Error occurred while applying additional heuristics. '
                              'Reason - %s:%s', type(e).__name__, e)
        return all_apps
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters