Skip to content

Commit

Permalink
SPMI: Improve speed significantly for large diffs (#76238)
Browse files Browse the repository at this point in the history
This starts communicating more information about diffs back from
superpmi and starts using it in the driver. The current information is
the base size, diff size, base instructions, diff instructions and
context size.

The driver uses the base size/diff size to pick a small number of
interesting contexts and only creates disassembly for these contexts.
jit-analyze is then also invoked on only these contexts, but the
contexts are picked so that they overlap with what jit-analyze would
display. Additionally, we also pick the top 20 smallest contexts (in
terms of context size) -- I frequently use the smallest contexts with
diffs to investigate my change.

The new behavior is only enabled when no custom metrics are specified.
If custom metrics are specified we fall back to producing all
disassembly files and leave it up to jit-analyze to extract and analyze
the metrics specified.

Also, the retainTopFilesOnly option is no longer necessary since our CI
pipeline will at most produce 100 .dasm files now. Another benefit is that
this means that all contexts mentioned in the jit-analyze output will now be
part of the artifacts.

The net result is that we can now get SPMI diffs for changes with even
hundreds of thousands of diffs in the same time as it takes to get diffs
for a small change.

Fix #76178
  • Loading branch information
jakobbotsch authored Sep 29, 2022
1 parent ebaba40 commit cc934c7
Show file tree
Hide file tree
Showing 13 changed files with 518 additions and 178 deletions.
27 changes: 27 additions & 0 deletions src/coreclr/inc/clr_std/vector
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,33 @@ namespace std
}
}

vector(const vector<T>&) = delete;
vector<T>& operator=(const vector<T>&) = delete;

vector(vector<T>&& v) noexcept
: m_size(v.m_size)
, m_capacity(v.m_capacity)
, m_pelements(v.m_pelements)
, m_isBufferOwner(v.m_isBufferOwner)
{
v.m_isBufferOwner = false;
}

vector<T>& operator=(vector<T>&& v) noexcept
{
if (m_isBufferOwner)
{
erase(m_pelements, 0, m_size);
delete [] (BYTE*)m_pelements;
}

m_size = v.m_size;
m_capacity = v.m_capacity;
m_pelements = v.m_pelements;
m_isBufferOwner = v.m_isBufferOwner;
v.m_isBufferOwner = false;
return *this;
}

size_t size() const
{
Expand Down
114 changes: 84 additions & 30 deletions src/coreclr/scripts/superpmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,6 @@ def add_core_root_arguments(parser, build_type_default, build_type_help):
asm_diff_parser.add_argument("--debuginfo", action="store_true", help="Include debug info after disassembly (sets COMPlus_JitDebugDump).")
asm_diff_parser.add_argument("-tag", help="Specify a word to add to the directory name where the asm diffs will be placed")
asm_diff_parser.add_argument("-metrics", action="append", help="Metrics option to pass to jit-analyze. Can be specified multiple times, or pass comma-separated values.")
asm_diff_parser.add_argument("-retainOnlyTopFiles", action="store_true", help="Retain only top .dasm files with largest improvements or regressions and delete remaining files.")
asm_diff_parser.add_argument("--diff_with_release", action="store_true", help="Specify if this is asmdiff using release binaries.")
asm_diff_parser.add_argument("--git_diff", action="store_true", help="Produce a '.diff' file from 'base' and 'diff' folders if there were any differences.")

Expand Down Expand Up @@ -501,6 +500,11 @@ def read_csv_metrics(path):

return dict

def read_csv_diffs(path):
with open(path) as csv_file:
reader = csv.DictReader(csv_file)
return list(reader)

def determine_clrjit_compiler_version(clrjit_path):
""" Obtain the version of the compiler that was used to compile the clrjit at the specified path.
Expand Down Expand Up @@ -1568,7 +1572,7 @@ def replay_with_asm_diffs(self):
logging.info("Running asm diffs of %s", mch_file)

fail_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_fail.mcl")
diff_mcl_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff.mcl")
diffs_info = os.path.join(temp_location, os.path.basename(mch_file) + "_diffs.csv")

base_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_base_metrics.csv")
diff_metrics_summary_file = os.path.join(temp_location, os.path.basename(mch_file) + "_diff_metrics.csv")
Expand All @@ -1577,7 +1581,7 @@ def replay_with_asm_diffs(self):
"-a", # Asm diffs
"-v", "ewmi", # display errors, warnings, missing, jit info
"-f", fail_mcl_file, # Failing mc List
"-diffMCList", diff_mcl_file, # Create all of the diffs in an mcl file
"-diffsInfo", diffs_info, # Information about diffs
"-r", os.path.join(temp_location, "repro"), # Repro name, create .mc repro files
"-baseMetricsSummary", base_metrics_summary_file, # Create summary of metrics we can use to get total code size impact
"-diffMetricsSummary", diff_metrics_summary_file,
Expand Down Expand Up @@ -1633,8 +1637,10 @@ def replay_with_asm_diffs(self):
repro_base_command_line = "{} {} {}".format(self.superpmi_path, " ".join(altjit_asm_diffs_flags), self.diff_jit_path)
save_repro_mc_files(temp_location, self.coreclr_args, artifacts_base_name, repro_base_command_line)

diffs = read_csv_diffs(diffs_info)

# This file had asm diffs; keep track of that.
has_diffs = is_nonzero_length_file(diff_mcl_file)
has_diffs = len(diffs) > 0
if has_diffs:
files_with_asm_diffs.append(mch_file)

Expand All @@ -1649,12 +1655,6 @@ def replay_with_asm_diffs(self):
if return_code == 0:
logging.warning("Warning: SuperPMI returned a zero exit code, but generated a non-zero-sized mcl file")

self.diff_mcl_contents = None
with open(diff_mcl_file) as file_handle:
mcl_lines = file_handle.readlines()
mcl_lines = [item.strip() for item in mcl_lines]
self.diff_mcl_contents = mcl_lines

asm_root_dir = create_unique_directory_name(self.coreclr_args.spmi_location, "asm.{}".format(artifacts_base_name))
base_asm_location = os.path.join(asm_root_dir, "base")
diff_asm_location = os.path.join(asm_root_dir, "diff")
Expand All @@ -1672,13 +1672,13 @@ def replay_with_asm_diffs(self):
text_differences = queue.Queue()
jit_dump_differences = queue.Queue()

async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
async def create_replay_artifacts(print_prefix, context_index, self, mch_file, env, jit_differences_queue, base_location, diff_location, extension):
""" Run superpmi over an MC to create JIT asm or JIT dumps for the method.
"""
# Setup flags to call SuperPMI for both the diff jit and the base jit

flags = [
"-c", item,
"-c", str(context_index),
"-v", "q" # only log from the jit.
]
flags += altjit_replay_flags
Expand All @@ -1690,7 +1690,7 @@ async def create_replay_artifacts(print_prefix, item, self, mch_file, env, jit_d

async def create_one_artifact(jit_path: str, location: str, flags) -> str:
command = [self.superpmi_path] + flags + [jit_path, mch_file]
item_path = os.path.join(location, "{}{}".format(item, extension))
item_path = os.path.join(location, "{}{}".format(context_index, extension))
modified_env = env.copy()
modified_env['COMPlus_JitStdOutFile'] = item_path
logging.debug("%sGenerating %s", print_prefix, item_path)
Expand All @@ -1706,22 +1706,20 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
diff_txt = await create_one_artifact(self.diff_jit_path, diff_location, flags + diff_option_flags_for_diff_artifact)

if base_txt != diff_txt:
jit_differences_queue.put_nowait(item)
jit_differences_queue.put_nowait(context_index)
################################################################################################ end of create_replay_artifacts()

diff_items = []
for item in self.diff_mcl_contents:
diff_items.append(item)

logging.info("Creating dasm files: %s %s", base_asm_location, diff_asm_location)
subproc_helper = AsyncSubprocessHelper(diff_items, verbose=True)
dasm_contexts = self.pick_contexts_to_disassemble(diffs)
subproc_helper = AsyncSubprocessHelper(dasm_contexts, verbose=True)
subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, asm_complus_vars_full_env, text_differences, base_asm_location, diff_asm_location, ".dasm")

if self.coreclr_args.diff_jit_dump:
logging.info("Creating JitDump files: %s %s", base_dump_location, diff_dump_location)
subproc_helper.run_to_completion(create_replay_artifacts, self, mch_file, jit_dump_complus_vars_full_env, jit_dump_differences, base_dump_location, diff_dump_location, ".txt")

logging.info("Differences found. To replay SuperPMI use:")
logging.info("Differences found. To replay SuperPMI use:".format(len(diffs)))

logging.info("")
for var, value in asm_complus_vars.items():
print_platform_specific_environment_vars(logging.INFO, self.coreclr_args, var, value)
Expand All @@ -1736,9 +1734,10 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
logging.info("%s %s -c ### %s %s", self.superpmi_path, " ".join(altjit_replay_flags), self.diff_jit_path, mch_file)
logging.info("")

logging.debug("Method numbers with binary differences:")
for item in self.diff_mcl_contents:
logging.debug(item)
smallest = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
logging.debug("Smallest {} contexts with binary differences:".format(len(smallest)))
for diff in smallest:
logging.debug(diff["Context"])
logging.debug("")

if base_metrics is not None and diff_metrics is not None:
Expand Down Expand Up @@ -1769,13 +1768,22 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
# It appears we have a built jit-analyze on the path, so try to run it.
jit_analyze_summary_file = os.path.join(asm_root_dir, "summary.md")
command = [ jit_analyze_path, "--md", jit_analyze_summary_file, "-r", "--base", base_asm_location, "--diff", diff_asm_location ]
if self.coreclr_args.retainOnlyTopFiles:
command += [ "--retainOnlyTopFiles" ]
if self.coreclr_args.metrics:
command += [ "--metrics", ",".join(self.coreclr_args.metrics) ]
elif base_bytes is not None and diff_bytes is not None:
command += [ "--override-total-base-metric", str(base_bytes), "--override-total-diff-metric", str(diff_bytes) ]

if len(dasm_contexts) < len(diffs):
# Avoid producing analysis output that assumes these are all contexts
# with diffs. When no custom metrics are specified we pick only a subset
# of interesting contexts to disassemble, to avoid spending too long.
# See pick_contexts_to_disassemble.
command += [ "--is-subset-of-diffs" ]
else:
# Ask jit-analyze to avoid producing analysis output that assumes these are
# all contexts (we never produce .dasm files for contexts without diffs).
command += [ "--is-diffs-only" ]

run_and_log(command, logging.INFO)
ran_jit_analyze = True

Expand All @@ -1800,6 +1808,14 @@ async def create_one_artifact(jit_path: str, location: str, flags) -> str:
else:
logging.warning("No textual differences. Is this an issue with coredistools?")

# If we are not specifying custom metrics then print a summary here, otherwise leave the summarization up to jit-analyze.
if self.coreclr_args.metrics is None:
num_improvements = sum(1 for r in diffs if int(r["Diff size"]) < int(r["Base size"]))
num_regressions = sum(1 for r in diffs if int(r["Diff size"]) > int(r["Base size"]))
logging.info("{} contexts with differences found ({} improvements, {} regressions)".format(len(diffs), num_improvements, num_regressions))
logging.info("")
logging.info("")

if self.coreclr_args.diff_jit_dump:
try:
current_jit_dump_diff = jit_dump_differences.get_nowait()
Expand Down Expand Up @@ -1994,6 +2010,49 @@ def write_pivot_section(row):
return result
################################################################################################ end of replay_with_asm_diffs()

def pick_contexts_to_disassemble(self, diffs):
""" Given information about diffs, pick the context numbers to create .dasm files for
Returns:
List of method context numbers to create disassembly for.
"""

# If there are non-default metrics then we need to disassemble
# everything so that jit-analyze can handle those.
if self.coreclr_args.metrics is not None:
contexts = diffs
else:
# In the default case we have size improvements/regressions
# available without needing to disassemble all, so pick a subset of
# interesting diffs to pass to jit-analyze.

# 20 smallest method contexts with diffs
smallest_contexts = sorted(diffs, key=lambda r: int(r["Context size"]))[:20]
# Order by byte-wise improvement, largest improvements first
by_diff_size = sorted(diffs, key=lambda r: int(r["Diff size"]) - int(r["Base size"]))
# 20 top improvements, byte-wise
top_improvements_bytes = by_diff_size[:20]
# 20 top regressions, byte-wise
top_regressions_bytes = by_diff_size[-20:]

# Order by percentage-wise size improvement, largest improvements first
def diff_pct(r):
base = int(r["Base size"])
if base == 0:
return 0
diff = int(r["Diff size"])
return (diff - base) / base

by_diff_size_pct = sorted(diffs, key=diff_pct)
top_improvements_pct = by_diff_size_pct[:20]
top_regressions_pct = by_diff_size_pct[-20:]

contexts = smallest_contexts + top_improvements_bytes + top_regressions_bytes + top_improvements_pct + top_regressions_pct

final_contexts_indices = list(set(int(r["Context"]) for r in contexts))
final_contexts_indices.sort()
return final_contexts_indices

################################################################################
# SuperPMI Replay/TP diff
################################################################################
Expand Down Expand Up @@ -3908,11 +3967,6 @@ def verify_base_diff_args():
lambda unused: True,
"Unable to set metrics.")

coreclr_args.verify(args,
"retainOnlyTopFiles",
lambda unused: True,
"Unable to set retainOnlyTopFiles.")

coreclr_args.verify(args,
"diff_with_release",
lambda unused: True,
Expand Down
3 changes: 1 addition & 2 deletions src/coreclr/scripts/superpmi_diffs.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,7 @@ def main(main_args):
"-spmi_location", spmi_location,
"-error_limit", "100",
"-log_level", "debug",
"-log_file", log_file,
"-retainOnlyTopFiles"])
"-log_file", log_file])

print("Running superpmi.py tpdiff")

Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/tools/superpmi/superpmi-shared/standardpch.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,15 @@

// Getting STL to work with PAL is difficult, so reimplement STL functionality to not require it.
#ifdef TARGET_UNIX
#include "clr_std/utility"
#include "clr_std/string"
#include "clr_std/algorithm"
#include "clr_std/vector"
#else // !TARGET_UNIX
#ifndef USE_STL
#define USE_STL
#endif // USE_STL
#include <utility>
#include <string>
#include <algorithm>
#include <vector>
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/tools/superpmi/superpmi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ set(SUPERPMI_SOURCES
neardiffer.cpp
parallelsuperpmi.cpp
superpmi.cpp
fileio.cpp
jithost.cpp
../superpmi-shared/callutils.cpp
../superpmi-shared/compileresult.cpp
Expand Down
8 changes: 4 additions & 4 deletions src/coreclr/tools/superpmi/superpmi/commandline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,15 +318,15 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)

o->mclFilename = argv[i];
}
else if ((_strnicmp(&argv[i][1], "diffMCList", 10) == 0))
else if ((_strnicmp(&argv[i][1], "diffsInfo", 9) == 0))
{
if (++i >= argc)
{
DumpHelp(argv[0]);
return false;
}

o->diffMCLFilename = argv[i];
o->diffsInfo = argv[i];
}
else if ((_strnicmp(&argv[i][1], "target", 6) == 0))
{
Expand Down Expand Up @@ -641,9 +641,9 @@ bool CommandLine::Parse(int argc, char* argv[], /* OUT */ Options* o)
DumpHelp(argv[0]);
return false;
}
if (o->diffMCLFilename != nullptr && !o->applyDiff)
if (o->diffsInfo != nullptr && !o->applyDiff)
{
LogError("-diffMCList specified without -applyDiff.");
LogError("-diffsInfo specified without -applyDiff.");
DumpHelp(argv[0]);
return false;
}
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/tools/superpmi/superpmi/commandline.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class CommandLine
, baseMetricsSummaryFile(nullptr)
, diffMetricsSummaryFile(nullptr)
, mclFilename(nullptr)
, diffMCLFilename(nullptr)
, diffsInfo(nullptr)
, targetArchitecture(nullptr)
, compileList(nullptr)
, offset(-1)
Expand Down Expand Up @@ -72,7 +72,7 @@ class CommandLine
char* baseMetricsSummaryFile;
char* diffMetricsSummaryFile;
char* mclFilename;
char* diffMCLFilename;
char* diffsInfo;
char* targetArchitecture;
char* compileList;
int offset;
Expand Down
Loading

0 comments on commit cc934c7

Please sign in to comment.