From ee5e878ad794dd38e778d37099373f3eda762d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20=C5=9Aliwak?= Date: Wed, 9 Feb 2022 18:05:38 +0100 Subject: [PATCH] benchmark_diff: A Python script for diffing summarized benchmarks from external tests --- scripts/externalTests/benchmark_diff.py | 212 ++++++++++++ .../summarized-benchmarks-branch.json | 100 ++++++ .../summarized-benchmarks-develop.json | 99 ++++++ .../test_externalTests_benchmark_diff.py | 309 ++++++++++++++++++ 4 files changed, 720 insertions(+) create mode 100755 scripts/externalTests/benchmark_diff.py create mode 100644 test/scripts/fixtures/summarized-benchmarks-branch.json create mode 100644 test/scripts/fixtures/summarized-benchmarks-develop.json create mode 100644 test/scripts/test_externalTests_benchmark_diff.py diff --git a/scripts/externalTests/benchmark_diff.py b/scripts/externalTests/benchmark_diff.py new file mode 100755 index 000000000000..0669e62ebad1 --- /dev/null +++ b/scripts/externalTests/benchmark_diff.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +from argparse import ArgumentParser +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union +import json +import sys + + +class DifferenceStyle(Enum): + ABSOLUTE = 'absolute' + RELATIVE = 'relative' + HUMANIZED = 'humanized' + + +DEFAULT_RELATIVE_PRECISION = 4 +DEFAULT_DIFFERENCE_STYLE = DifferenceStyle.ABSOLUTE + + +class ValidationError(Exception): + pass + + +class CommandLineError(ValidationError): + pass + + +class BenchmarkDiffer: + difference_style: DifferenceStyle + relative_precision: Optional[int] + + def __init__( + self, + difference_style: DifferenceStyle, + relative_precision: Optional[int], + ): + self.difference_style = difference_style + self.relative_precision = relative_precision + + def run(self, before: Any, after: Any) -> Optional[Union[dict, str, int, float]]: + if not isinstance(before, dict) or not isinstance(after, dict): + return 
self._diff_scalars(before, after) + + if before.get('version') != after.get('version'): + return self._humanize_diff('!V') + + diff = {} + for key in (set(before) | set(after)) - {'version'}: + value_diff = self.run(before.get(key), after.get(key)) + if value_diff not in [None, {}]: + diff[key] = value_diff + + return diff + + def _diff_scalars(self, before: Any, after: Any) -> Optional[Union[str, int, float, dict]]: + assert not isinstance(before, dict) or not isinstance(after, dict) + + if before is None and after is None: + return {} + if before is None: + return self._humanize_diff('!B') + if after is None: + return self._humanize_diff('!A') + if not isinstance(before, (int, float)) or not isinstance(after, (int, float)): + return self._humanize_diff('!T') + + number_diff = self._diff_numbers(before, after) + if self.difference_style != DifferenceStyle.HUMANIZED: + return number_diff + + return self._humanize_diff(number_diff) + + def _diff_numbers(self, value_before: Union[int, float], value_after: Union[int, float]) -> Union[str, int, float]: + diff: Union[str, int, float] + + if self.difference_style == DifferenceStyle.ABSOLUTE: + diff = value_after - value_before + if isinstance(diff, float) and diff.is_integer(): + diff = int(diff) + + return diff + + if value_before == 0: + if value_after > 0: + return '+INF' + elif value_after < 0: + return '-INF' + else: + return 0 + + diff = (value_after - value_before) / abs(value_before) + if self.relative_precision is not None: + rounded_diff = round(diff, self.relative_precision) + if rounded_diff == 0 and diff < 0: + diff = '-0' + elif rounded_diff == 0 and diff > 0: + diff = '+0' + else: + diff = rounded_diff + + if isinstance(diff, float) and diff.is_integer(): + diff = int(diff) + + return diff + + def _humanize_diff(self, diff: Union[str, int, float]) -> str: + if isinstance(diff, str) and diff.startswith('!'): + return diff + + value: Union[str, int, float] + if isinstance(diff, (int, float)): + value = diff 
* 100 + if isinstance(value, float) and self.relative_precision is not None: + # The multiplication can result in new significant digits appearing. We need to reround. + # NOTE: round() works fine with negative precision. + value = round(value, self.relative_precision - 2) + if isinstance(value, float) and value.is_integer(): + value = int(value) + prefix = '' + if diff < 0: + prefix = '' + elif diff > 0: + prefix = '+' + else: + value = diff + prefix = '' + + return f"{prefix}{value}%" + + +@dataclass(frozen=True) +class CommandLineOptions: + report_before: Path + report_after: Path + difference_style: DifferenceStyle + relative_precision: int + + +def process_commandline() -> CommandLineOptions: + script_description = ( + "Compares summarized benchmark reports and outputs JSON with the same structure but listing only differences." + ) + + parser = ArgumentParser(description=script_description) + parser.add_argument(dest='report_before', help="Path to a JSON file containing original benchmark results.") + parser.add_argument(dest='report_after', help="Path to a JSON file containing new benchmark results.") + parser.add_argument( + '--style', + dest='difference_style', + choices=[s.value for s in DifferenceStyle], + help=( + "How to present numeric differences: " + f"'{DifferenceStyle.ABSOLUTE.value}' subtracts original from new; " + f"'{DifferenceStyle.RELATIVE.value}' also divides by the original; " + f"'{DifferenceStyle.HUMANIZED.value}' is like relative but value is a percentage and " + "positive/negative changes are emphasized. " + f"(default: '{DEFAULT_DIFFERENCE_STYLE.value}')." + ) + ) + # NOTE: Negative values are valid for precision. round() handles them in a sensible way. + parser.add_argument( + '--precision', + dest='relative_precision', + type=int, + default=DEFAULT_RELATIVE_PRECISION, + help=( + "Number of decimal places to round relative differences to.
" + f"Note that with --style={DifferenceStyle.HUMANIZED.value} the rounding is applied " + "**before** converting the value to a percentage so you need to add 2. " + f"Has no effect when used together with --style={DifferenceStyle.ABSOLUTE.value}. " + f"(default: {DEFAULT_RELATIVE_PRECISION})" + ) + ) + + options = parser.parse_args() + + if options.difference_style is not None: + difference_style = DifferenceStyle(options.difference_style) + else: + difference_style = DEFAULT_DIFFERENCE_STYLE + + processed_options = CommandLineOptions( + report_before=Path(options.report_before), + report_after=Path(options.report_after), + difference_style=difference_style, + relative_precision=options.relative_precision, + ) + + return processed_options + + +def main(): + try: + options = process_commandline() + + differ = BenchmarkDiffer(options.difference_style, options.relative_precision) + diff = differ.run( + json.loads(options.report_before.read_text('utf-8')), + json.loads(options.report_after.read_text('utf-8')), + ) + + print(json.dumps(diff, indent=4, sort_keys=True)) + + return 0 + except CommandLineError as exception: + print(f"ERROR: {exception}", file=sys.stderr) + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/scripts/fixtures/summarized-benchmarks-branch.json b/test/scripts/fixtures/summarized-benchmarks-branch.json new file mode 100644 index 000000000000..0ba46b87d682 --- /dev/null +++ b/test/scripts/fixtures/summarized-benchmarks-branch.json @@ -0,0 +1,100 @@ +{ + "bleeps": { + "ir-optimize-evm+yul": { + "bytecode_size": 132868, + "deployment_gas": 0, + "method_gas": 39289198, + "version": "bb90cd0" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 137869, + "deployment_gas": 0, + "method_gas": 38863224, + "version": "bb90cd0" + } + }, + "colony": { + "legacy-no-optimize": { + "bytecode_size": 664190, + "deployment_gas": null, + "method_gas": null, + "version": "573399b" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 
363606, + "deployment_gas": null, + "method_gas": null, + "version": "573399b" + } + }, + "elementfi": { + "legacy-no-optimize": { + "bytecode_size": null, + "deployment_gas": 69200158, + "method_gas": null, + "version": "87f8b5e" + }, + "legacy-optimize-evm+yul": { + "deployment_gas": 40951128, + "version": "87f8b5e" + }, + "ir-optimize-evm-only": {}, + "ir-no-optimize": { + "deployment_gas": null, + "method_gas": 2777867251, + "version": "87f8b5e" + } + }, + "euler": { + "ir-no-optimize": { + "bytecode_size": 328540, + "deployment_gas": 61591870, + "method_gas": 3537419168, + "version": "2ef99fc" + }, + "legacy-no-optimize": { + "bytecode_size": 328540, + "deployment_gas": 62590688, + "method_gas": 3537419168, + "version": "2ef99fc" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 182190, + "deployment_gas": 35236828, + "method_gas": 2777867251, + "version": "2ef99fc" + }, + "legacy-optimize-evm-only": { + "bytecode_size": 205211, + "deployment_gas": 39459629, + "method_gas": 2978467272, + "version": "2ef99fc" + }, + "ir-optimize-evm-only": { + "bytecode_size": 205211, + "deployment_gas": 39459629, + "method_gas": 2978467272, + "version": "2ef99fc" + }, + "ir-optimize-evm+yul": { + "bytecode_size": 205211, + "deployment_gas": 39459629, + "method_gas": 2777867251 + } + }, + "gnosis": { + "ir-optimize-evm+yul": { + "bytecode_size": 56069, + "deployment_gas": null, + "method_gas": null, + "version": "ea09294" + } + }, + "zeppelin": { + "legacy-optimize-evm+yul": { + "bytecode_size": 510428, + "deployment_gas": 94501114, + "version": "af7ec04" + } + } +} diff --git a/test/scripts/fixtures/summarized-benchmarks-develop.json b/test/scripts/fixtures/summarized-benchmarks-develop.json new file mode 100644 index 000000000000..1961870f3f45 --- /dev/null +++ b/test/scripts/fixtures/summarized-benchmarks-develop.json @@ -0,0 +1,99 @@ +{ + "bleeps": { + "ir-optimize-evm+yul": { + "bytecode_size": 132165, + "deployment_gas": 0, + "method_gas": 39289935, + "version": 
"bb90cd0" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 137869, + "deployment_gas": 0, + "method_gas": 38863224, + "version": "bb90cd0" + } + }, + "colony": { + "ir-optimize-evm+yul": { + "bytecode_size": 363606, + "deployment_gas": null, + "method_gas": null, + "version": "573399b" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 363606, + "deployment_gas": null, + "method_gas": null, + "version": "573399b" + } + }, + "elementfi": { + "legacy-no-optimize": { + "bytecode_size": 890560, + "deployment_gas": null, + "method_gas": null, + "version": "87f8b5e" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 536668, + "version": "87f8b5e" + }, + "legacy-optimize-evm-only": {}, + "ir-no-optimize": { + "bytecode_size": null, + "method_gas": 2777867251, + "version": "87f8b5e" + } + }, + "euler": { + "ir-no-optimize": { + "bytecode_size": 323909, + "deployment_gas": 61591870, + "method_gas": 3452105184, + "version": "2ef99fc" + }, + "legacy-no-optimize": { + "bytecode_size": 323909, + "deployment_gas": 61591870, + "method_gas": 3452105184, + "version": "c23e8bd" + }, + "legacy-optimize-evm+yul": { + "bytecode_size": 182190, + "deployment_gas": 35236828, + "method_gas": 2777867251, + "version": "c23e8bd" + }, + "legacy-optimize-evm-only": { + "bytecode_size": 202106, + "deployment_gas": 38790600, + "method_gas": 2907368790, + "version": "v1.2.3" + }, + "ir-optimize-evm-only": { + "bytecode_size": 182190, + "deployment_gas": 35236828, + "method_gas": 2777867251 + }, + "ir-optimize-evm+yul": { + "bytecode_size": 182190, + "deployment_gas": 35236828, + "method_gas": 2777867251 + } + }, + "ens": { + "legacy-optimize-evm+yul": { + "bytecode_size": 156937, + "deployment_gas": 30073789, + "method_gas": 105365362, + "version": "v0.0.8" + } + }, + "zeppelin": { + "legacy-optimize-evm+yul": { + "bytecode_size": 510428, + "deployment_gas": 94501114, + "version": "af7ec04" + } + } +} diff --git a/test/scripts/test_externalTests_benchmark_diff.py 
b/test/scripts/test_externalTests_benchmark_diff.py new file mode 100644 index 000000000000..b4ccf07450d5 --- /dev/null +++ b/test/scripts/test_externalTests_benchmark_diff.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 + +import json +import unittest + +from unittest_helpers import FIXTURE_DIR, load_fixture + +# NOTE: This test file only works with scripts/ added to PYTHONPATH so pylint can't find the imports +# pragma pylint: disable=import-error +from externalTests.benchmark_diff import BenchmarkDiffer, DifferenceStyle +# pragma pylint: enable=import-error + +SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH = FIXTURE_DIR / 'summarized-benchmarks-develop.json' +SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH = FIXTURE_DIR / 'summarized-benchmarks-branch.json' + + +class TestBenchmarkDiff(unittest.TestCase): + def setUp(self): + self.maxDiff = 10000 + + def test_benchmark_diff(self): + report_before = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_DEVELOP_JSON_PATH)) + report_after = json.loads(load_fixture(SUMMARIZED_BENCHMARKS_BRANCH_JSON_PATH)) + expected_diff = { + "bleeps": { + "ir-optimize-evm+yul": { + # Numerical difference -> negative/positive/zero. + # Zeros are not skipped to differentiate them from missing values. + "bytecode_size": 132868 - 132165, + "deployment_gas": 0, + "method_gas": 39289198 - 39289935, + }, + "legacy-optimize-evm+yul": { + # No differences within preset -> zeros still present.
+ "bytecode_size": 0, + "deployment_gas": 0, + "method_gas": 0, + }, + }, + "colony": { + # Preset missing on one side -> replace dict with string + "ir-optimize-evm+yul": "!A", + "legacy-no-optimize": "!B", + "legacy-optimize-evm+yul": { + "bytecode_size": 0, + # Attribute missing on both sides -> skip + #"deployment_gas": + #"method_gas": + }, + }, + "elementfi": { + "legacy-no-optimize": { + # Attributes null on one side -> replace value with string + "bytecode_size": "!A", + "deployment_gas": "!B", + # Attribute null on both sides -> skip + #"method_gas": + }, + "legacy-optimize-evm+yul": { + # Attributes missing on one side -> replace value with string + "bytecode_size": "!A", + "deployment_gas": "!B", + # Attribute missing on both sides -> skip + #"method_gas": + }, + "ir-no-optimize": { + # Attributes missing on one side, null on the other -> skip + #"bytecode_size": + #"deployment_gas": + "method_gas": 0, + }, + # Empty preset missing on one side -> replace dict with string + "legacy-optimize-evm-only": "!A", + "ir-optimize-evm-only": "!B", + }, + "euler": { + # Matching versions -> show attributes, skip version + "ir-no-optimize": { + "bytecode_size": 328540 - 323909, + "deployment_gas": 0, + "method_gas": 3537419168 - 3452105184, + }, + # Different versions, different values -> replace whole preset with string + "legacy-no-optimize": "!V", + # Different versions, same values -> replace whole preset with string + "legacy-optimize-evm+yul": "!V", + # Different versions (not a commit hash), different values -> replace whole preset with string + "legacy-optimize-evm-only": "!V", + # Version missing on one side -> replace whole preset with string + "ir-optimize-evm-only": "!V", + # Version missing on both sides -> assume same version + "ir-optimize-evm+yul": { + "bytecode_size": 205211 - 182190, + "deployment_gas": 39459629 - 35236828, + "method_gas": 0, + }, + }, + "zeppelin": { + "legacy-optimize-evm+yul": { + # Whole project identical -> attributes still 
present, with zeros + "bytecode_size": 0, + "deployment_gas": 0, + # Field missing on both sides -> skip + #"method_gas": + } + }, + # Empty project missing on one side -> replace its dict with a string + "gnosis": "!B", + "ens": "!A", + } + differ = BenchmarkDiffer(DifferenceStyle.ABSOLUTE, None) + self.assertEqual(differ.run(report_before, report_after), expected_diff) + + +class TestBenchmarkDiffer(unittest.TestCase): + def setUp(self): + self.maxDiff = 10000 + + @staticmethod + def _nest(value, levels): + nested_value = value + for level in levels: + nested_value = {level: nested_value} + + return nested_value + + def _assert_single_value_diff_matches(self, differ, cases, nest_result=True, nestings=None): + if nestings is None: + nestings = [[], ['p'], ['p', 's'], ['p', 's', 'a']] + + for levels in nestings: + for (before, after, expected_diff) in cases: + self.assertEqual( + differ.run(self._nest(before, levels), self._nest(after, levels)), + self._nest(expected_diff, levels) if nest_result else expected_diff, + f'Wrong diff for {self._nest(before, levels)} vs {self._nest(after, levels)}' + ) + + def test_empty(self): + for style in DifferenceStyle: + differ = BenchmarkDiffer(style, None) + self._assert_single_value_diff_matches(differ, [({}, {}, {})], nest_result=False) + + def test_null(self): + for style in DifferenceStyle: + differ = BenchmarkDiffer(style, None) + self._assert_single_value_diff_matches(differ, [(None, None, {})], nest_result=False) + + def test_number_diff_absolute_json(self): + self._assert_single_value_diff_matches( + BenchmarkDiffer(DifferenceStyle.ABSOLUTE, 4), + [ + (2, 2, 0), + (2, 5, 3), + (5, 2, -3), + (2.0, 2.0, 0), + (2, 2.0, 0), + (2.0, 2, 0), + (2, 2.5, 2.5 - 2), + (2.5, 2, 2 - 2.5), + + (0, 0, 0), + (0, 2, 2), + (0, -2, -2), + + (-3, -1, 2), + (-1, -3, -2), + (2, 0, -2), + (-2, 0, 2), + + (1.00006, 1, 1 - 1.00006), + (1, 1.00006, 1.00006 - 1), + (1.00004, 1, 1 - 1.00004), + (1, 1.00004, 1.00004 - 1), + ], + ) + + def 
test_number_diff_json(self): + self._assert_single_value_diff_matches( + BenchmarkDiffer(DifferenceStyle.RELATIVE, 4), + [ + (2, 2, 0), + (2, 5, (5 - 2) / 2), + (5, 2, (2 - 5) / 5), + (2.0, 2.0, 0), + (2, 2.0, 0), + (2.0, 2, 0), + (2, 2.5, (2.5 - 2) / 2), + (2.5, 2, (2 - 2.5) / 2.5), + + (0, 0, 0), + (0, 2, '+INF'), + (0, -2, '-INF'), + + (-3, -1, 0.6667), + (-1, -3, -2), + (2, 0, -1), + (-2, 0, 1), + + (1.00006, 1, -0.0001), + (1, 1.00006, 0.0001), + (1.000004, 1, '-0'), + (1, 1.000004, '+0'), + ], + ) + + def test_number_diff_humanized_json(self): + self._assert_single_value_diff_matches( + BenchmarkDiffer(DifferenceStyle.HUMANIZED, 4), + [ + (2, 2, '0%'), + (2, 5, '+150%'), + (5, 2, '-60%'), + (2.0, 2.0, '0%'), + (2, 2.0, '0%'), + (2.0, 2, '0%'), + (2, 2.5, '+25%'), + (2.5, 2, '-20%'), + + (0, 0, '0%'), + (0, 2, '+INF%'), + (0, -2, '-INF%'), + + (-3, -1, '+66.67%'), + (-1, -3, '-200%'), + (2, 0, '-100%'), + (-2, 0, '+100%'), + + (1.00006, 1, '-0.01%'), + (1, 1.00006, '+0.01%'), + (1.000004, 1, '-0%'), + (1, 1.000004, '+0%'), + ], + ) + + def test_type_mismatch(self): + for style in DifferenceStyle: + self._assert_single_value_diff_matches( + BenchmarkDiffer(style, 4), + [ + (1, {}, '!T'), + ({}, 1, '!T'), + (1.5, {}, '!T'), + ({}, 1.5, '!T'), + ('1', {}, '!T'), + ({}, '1', '!T'), + (1, '1', '!T'), + ('1', 1, '!T'), + (1.5, '1', '!T'), + ('1', 1.5, '!T'), + ('1', '1', '!T'), + ], + ) + + def test_version_mismatch(self): + for style in DifferenceStyle: + self._assert_single_value_diff_matches( + BenchmarkDiffer(style, 4), + [ + ({'a': 123, 'version': 1}, {'a': 123, 'version': 2}, '!V'), + ({'a': 123, 'version': 2}, {'a': 123, 'version': 1}, '!V'), + ({'a': 123, 'version': 'a'}, {'a': 123, 'version': 'b'}, '!V'), + ({'a': 123, 'version': 'a'}, {'a': 123, 'version': 1}, '!V'), + + ({'a': 'a', 'version': 1}, {'a': 'a', 'version': 2}, '!V'), + ({'a': {}, 'version': 1}, {'a': {}, 'version': 2}, '!V'), + ({'s': {'a': 1}, 'version': 1}, {'s': {'a': 1}, 'version': 2}, 
'!V'), + + ({'a': 123, 'version': 1}, {'a': 456, 'version': 2}, '!V'), + ({'a': 'a', 'version': 1}, {'a': 'b', 'version': 2}, '!V'), + ({'s': {'a': 1}, 'version': 1}, {'s': {'a': 2}, 'version': 2}, '!V'), + ], + ) + + def test_missing(self): + for style in DifferenceStyle: + self._assert_single_value_diff_matches( + BenchmarkDiffer(style, None), + [ + (1, None, '!A'), + (None, 1, '!B'), + ('1', None, '!A'), + (None, '1', '!B'), + ({}, None, '!A'), + (None, {}, '!B'), + + ({'x': 1}, {}, {'x': '!A'}), + ({}, {'x': 1}, {'x': '!B'}), + ({'x': 1}, {'x': None}, {'x': '!A'}), + ({'x': None}, {'x': 1}, {'x': '!B'}), + ({'x': 1}, {'y': 1}, {'x': '!A', 'y': '!B'}), + + ({'x': {}}, {}, {'x': '!A'}), + ({}, {'x': {}}, {'x': '!B'}), + ({'p': {'x': {}}}, {}, {'p': '!A'}), + ({}, {'p': {'x': {}}}, {'p': '!B'}), + ], + ) + + def test_missing_vs_null(self): + for style in DifferenceStyle: + self._assert_single_value_diff_matches( + BenchmarkDiffer(style, None), + [ + ({'a': None}, {}, {}), + ({}, {'a': None}, {}), + ], + nest_result=False, + )