-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
sync w/ internal repo; update models #1083
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -330,7 +330,9 @@ def infer_app_meta(eventlogs: List[str]) -> Mapping[str, Mapping]: | |
) | ||
toc_list.append(tmp) | ||
|
||
if toc_list: | ||
if not toc_list: | ||
raise ValueError(f'No CSV files found for: {ds_name}') | ||
else: | ||
toc = pd.concat(toc_list) | ||
raw_features = extract_raw_features(toc, node_level_supp, qualtool_filter) | ||
if raw_features.empty: | ||
|
@@ -415,7 +417,7 @@ def combine_tables(table_name: str) -> pd.DataFrame: | |
|
||
# normalize WholeStageCodegen labels | ||
ops_tbl.loc[ | ||
ops_tbl['nodeName'].str.startswith('WholeStageCodegen') == True, 'nodeName' | ||
ops_tbl['nodeName'].str.startswith('WholeStageCodegen'), 'nodeName' | ||
] = 'WholeStageCodegen' | ||
|
||
# format WholeStageCodegen for merging | ||
|
@@ -449,7 +451,7 @@ def combine_tables(table_name: str) -> pd.DataFrame: | |
] | ||
for op in dynamic_op_labels: | ||
sql_ops_counter.loc[ | ||
sql_ops_counter['nodeName'].str.startswith(op) == True, 'nodeName' | ||
sql_ops_counter['nodeName'].str.startswith(op), 'nodeName' | ||
] = op | ||
|
||
# count occurrences | ||
|
@@ -533,7 +535,7 @@ def combine_tables(table_name: str) -> pd.DataFrame: | |
|
||
# filter rows w/ sqlID | ||
sql_job_agg_tbl['hasSqlID'] = sql_job_agg_tbl['sqlID'] >= 0 | ||
full_tbl = sql_job_agg_tbl[sql_job_agg_tbl['hasSqlID'] == True] | ||
full_tbl = sql_job_agg_tbl[sql_job_agg_tbl['hasSqlID']] | ||
|
||
# add runType features from toc | ||
app_runtype = toc[['appId', 'runType']].drop_duplicates() | ||
|
@@ -929,21 +931,22 @@ def scan_tbl( | |
sql_to_stage, left_on='ID', right_on='stageId' | ||
) | ||
total_stage_time = ( | ||
stage_agg_tbl[['sqlID', 'ID', 'Duration']] | ||
stage_agg_tbl[['sqlID', 'Duration']] | ||
.groupby('sqlID') | ||
.agg('sum') | ||
.reset_index() | ||
) | ||
failed_stage_time = ( | ||
stage_agg_tbl[['sqlID', 'ID', 'Duration']] | ||
.merge(failed_stages, left_on='ID', right_on='stageId', how='inner') | ||
.merge(failed_stages, left_on='ID', right_on='stageId', how='inner')[ | ||
['sqlID', 'Duration'] | ||
] | ||
.groupby('sqlID') | ||
.agg('sum') | ||
.reset_index() | ||
.drop(columns=['sqlID']) | ||
) | ||
stage_times = total_stage_time.merge( | ||
failed_stage_time, on='ID', how='inner' | ||
failed_stage_time, on='sqlID', how='inner' | ||
) | ||
stage_times.info() | ||
sqls_to_drop = set( | ||
|
@@ -1074,10 +1077,14 @@ def load_qtool_execs(qtool_execs: List[str]) -> Optional[pd.DataFrame]: | |
if qtool_execs: | ||
exec_info = pd.concat([pd.read_csv(f) for f in qtool_execs]) | ||
node_level_supp = exec_info.copy() | ||
node_level_supp['Exec Is Supported'] = node_level_supp[ | ||
'Exec Is Supported' | ||
] | node_level_supp['Exec Name'].apply( | ||
lambda x: any([x.startswith(nm) for nm in unsupported_overrides]) | ||
node_level_supp['Exec Is Supported'] = ( | ||
node_level_supp['Exec Is Supported'] | ||
| node_level_supp['Exec Name'].apply( | ||
lambda x: any([x.startswith(nm) for nm in unsupported_overrides]) | ||
) | ||
| node_level_supp['Exec Name'].apply( | ||
lambda x: x.startswith('WholeStageCodegen') | ||
) | ||
Comment on lines +1080 to +1087
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought that change was rolled back. Just double checking. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Per @eordentlich's commit to internal repo, this is a workaround for #860. |
||
) | ||
node_level_supp = ( | ||
node_level_supp[['App ID', 'SQL ID', 'SQL Node Id', 'Exec Is Supported']] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Lets set the standard:
For the case when it is "not toc_list", do we want to raise an error, or return 1.0 after logging that files were missing?
CC: @eordentlich and @parthosa
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@amahussein this should be a very rare case. Basically, the user has requested to load CSV files for a dataset, and we have found none at all for the entire dataset. Previously, we would just skip this dataset (failing silently), so this code was added to raise the issue to the user's attention. Presumably, the legacy Speedup would also fail in this case (or would it just return 1.0?).