From b35b586f1053af248b1a8ce8293403924b08f8cc Mon Sep 17 00:00:00 2001
From: Zhigao Tong
Date: Thu, 30 Jun 2022 17:05:50 +0800
Subject: [PATCH] Add options to support `Profile Guided Optimization`

---
Notes (this section is ignored by `git am`):
 - Line structure of this patch was restored from a whitespace-collapsed copy;
   indentation of context lines is reconstructed and may need `--ignore-whitespace`.
 - perf-tpch.py and TiFlashBuildInfo.cpp were revised after export, so the
   `index` blob ids below no longer describe the resulting files.

 CMakeLists.txt                                |  52 +++++
 dbms/src/Common/TiFlashBuildInfo.cpp          |  17 ++
 format-diff.py                                |   3 -
 .../include/common/config_common.h.in         |   4 +
 release-centos7-llvm/env/prepare-sysroot.sh   |   1 +
 release-centos7-llvm/scripts/perf-tpch.py     | 220 ++++++++++++++++++
 6 files changed, 294 insertions(+), 3 deletions(-)
 create mode 100755 release-centos7-llvm/scripts/perf-tpch.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e14c205f18..98eca3e3ef4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -134,6 +134,48 @@ if (COMPILER_CLANG)
     endif ()
 endif ()
 
+option (ENABLE_LLVM_PROFILE_INSTR "Generate instrumented code to collect execution counts" OFF)
+option (ENABLE_LLVM_PGO "Enables flags for Profile Guided Optimization (PGO)" OFF)
+option (ENABLE_LLVM_PGO_USE_SAMPLE "Enables flags for Profile Guided Optimization (PGO) and use sampling profilers" OFF)
+set (USE_LLVM_FDO OFF CACHE BOOL "" FORCE)
+
+if (ENABLE_LLVM_PGO)
+    if (ENABLE_LLVM_PROFILE_INSTR)
+        message (FATAL_ERROR "`ENABLE_LLVM_PROFILE_INSTR` can not be used with `ENABLE_LLVM_PGO`")
+    endif ()
+    if (ENABLE_LLVM_PGO_USE_SAMPLE)
+
+        # Follow https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers
+        # Use https://github.com/google/autofdo
+
+        set (_LLVM_PGO_USE_SAMPLE_FLAGS "-gline-tables-only -fdebug-info-for-profiling -funique-internal-linkage-names")
+
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_LLVM_PGO_USE_SAMPLE_FLAGS}")
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${_LLVM_PGO_USE_SAMPLE_FLAGS}")
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--no-rosegment")
+        message (STATUS "Add flags `${_LLVM_PGO_USE_SAMPLE_FLAGS}` for profiling")
+
+        if (NOT "$ENV{TIFLASH_LLVM_PROFDATA}" STREQUAL "")
+            set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-sample-use=$ENV{TIFLASH_LLVM_PROFDATA}")
+            set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-sample-use=$ENV{TIFLASH_LLVM_PROFDATA}")
+            message (STATUS "Use sample profile data `$ENV{TIFLASH_LLVM_PROFDATA}` for profile-guided optimization")
+            set (USE_LLVM_FDO ON CACHE BOOL "" FORCE)
+        else ()
+            message (STATUS "NOT use sample profile data")
+        endif ()
+
+        unset (_LLVM_PGO_USE_SAMPLE_FLAGS)
+    else ()
+        if ("$ENV{TIFLASH_LLVM_PROFDATA}" STREQUAL "")
+            message (FATAL_ERROR "Please set env var `TIFLASH_LLVM_PROFDATA`")
+        endif ()
+
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-use=$ENV{TIFLASH_LLVM_PROFDATA} -Wno-profile-instr-unprofiled")
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-instr-use=$ENV{TIFLASH_LLVM_PROFDATA} -Wno-profile-instr-unprofiled")
+        message (STATUS "Use instrumentation data `$ENV{TIFLASH_LLVM_PROFDATA}` for profile-guided optimization")
+    endif ()
+endif ()
+
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     # clang: warning: argument unused during compilation: '-stdlib=libc++'
     # clang: warning: argument unused during compilation: '-specs=/usr/share/dpkg/no-pie-compile.specs' [-Wunused-command-line-argument]
@@ -448,6 +490,16 @@ if (TEST_LLVM_COVERAGE AND CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
     set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-instr-generate -fcoverage-mapping -DTIFLASH_LLVM_COVERAGE=1")
 endif ()
 
+# `ENABLE_LLVM_PROFILE_INSTR` will make executable binary generate profile data automatically. Make it only work at modules dbms and libs.
+if (ENABLE_LLVM_PROFILE_INSTR)
+    if (ENABLE_LLVM_PGO)
+        message (FATAL_ERROR "`ENABLE_LLVM_PROFILE_INSTR` can not be used with `ENABLE_LLVM_PGO`")
+    endif ()
+    message (STATUS "Using flag `-fprofile-instr-generate`. Generate instrumented code to collect execution counts into default.profraw file(overridden by '=' form of option or `LLVM_PROFILE_FILE` env var). Follow https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization.")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fprofile-instr-generate")
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-instr-generate")
+endif ()
+
 if (ARCH_AMD64)
     include(CheckCXXCompilerFlag)
     check_cxx_compiler_flag("-mvpclmulqdq -Werror -Wall -Wextra" TIFLASH_COMPILER_VPCLMULQDQ_SUPPORT)
diff --git a/dbms/src/Common/TiFlashBuildInfo.cpp b/dbms/src/Common/TiFlashBuildInfo.cpp
index ff46e04384d..f2353256b9b 100644
--- a/dbms/src/Common/TiFlashBuildInfo.cpp
+++ b/dbms/src/Common/TiFlashBuildInfo.cpp
@@ -101,6 +101,23 @@ std::string getEnabledFeatures()
 #if ENABLE_THINLTO
             "thinlto",
 #endif
+
+// Profile instrumentation
+#if ENABLE_LLVM_PROFILE_INSTR
+            "profile-instr",
+#endif
+
+// PGO (sample flags are only applied by CMake when `ENABLE_LLVM_PGO` is also ON)
+#if ENABLE_LLVM_PGO && ENABLE_LLVM_PGO_USE_SAMPLE
+            "pgo-sample",
+#elif ENABLE_LLVM_PGO
+            "pgo-instr",
+#endif
+
+// FDO
+#if USE_LLVM_FDO
+            "fdo",
+#endif
     };
     return fmt::format("{}", fmt::join(features.begin(), features.end(), " "));
 }
diff --git a/format-diff.py b/format-diff.py
index c8d12925fb3..cf4fe793dca 100755
--- a/format-diff.py
+++ b/format-diff.py
@@ -96,9 +96,6 @@ def main():
                 else:
                     print("Format check passed")
             else:
-                cmd = 'clang-format -i {}'.format(' '.join(files_to_format))
-                if subprocess.Popen(cmd, shell=True, cwd=tics_repo_path).wait():
-                    exit(-1)
                 print("Finish code format")
         else:
             print('No file to format')
diff --git a/libs/libcommon/include/common/config_common.h.in b/libs/libcommon/include/common/config_common.h.in
index 46f167ea683..0ec263216f4 100644
--- a/libs/libcommon/include/common/config_common.h.in
+++ b/libs/libcommon/include/common/config_common.h.in
@@ -14,3 +14,7 @@
 #cmakedefine01 USE_UNWIND
 #cmakedefine01 USE_LLVM_LIBUNWIND
 #cmakedefine01 ENABLE_THINLTO
+#cmakedefine01 ENABLE_LLVM_PGO
+#cmakedefine01 ENABLE_LLVM_PROFILE_INSTR
+#cmakedefine01 ENABLE_LLVM_PGO_USE_SAMPLE
+#cmakedefine01 USE_LLVM_FDO
diff --git a/release-centos7-llvm/env/prepare-sysroot.sh b/release-centos7-llvm/env/prepare-sysroot.sh
index e4132eae667..1134bb37ee7 100755
--- a/release-centos7-llvm/env/prepare-sysroot.sh
+++ b/release-centos7-llvm/env/prepare-sysroot.sh
@@ -37,6 +37,7 @@ function install_llvm() {
     mkdir -p llvm-project/build
     cd llvm-project/build
 
+    # TODO: enable `bolt` for >= 14.0.0. https://github.com/llvm/llvm-project/tree/main/bolt
     cmake -DCMAKE_BUILD_TYPE=Release \
         -GNinja \
         -DLLVM_ENABLE_PROJECTS="clang;lld;polly;clang-tools-extra" \
diff --git a/release-centos7-llvm/scripts/perf-tpch.py b/release-centos7-llvm/scripts/perf-tpch.py
new file mode 100755
index 00000000000..2af518ab2f3
--- /dev/null
+++ b/release-centos7-llvm/scripts/perf-tpch.py
@@ -0,0 +1,220 @@
+#!/usr/bin/python3
+# Copyright 2022 PingCAP, Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import signal
+import sys
+import time
+import logging
+import types
+import subprocess
+
+logger = None
+
+
+def get_tz_offset():
+    """Return the local UTC offset formatted as `+HH:MM` or `-HH:MM`."""
+    import datetime
+    now_stamp = time.time()
+    local_time = datetime.datetime.fromtimestamp(now_stamp)
+    utc_time = datetime.datetime.utcfromtimestamp(now_stamp)
+    offset = local_time - utc_time
+    total_seconds = offset.total_seconds()
+    flag = '+'
+    if total_seconds < 0:
+        flag = '-'
+        total_seconds = -total_seconds
+    mm, ss = divmod(total_seconds, 60)
+    hh, mm = divmod(mm, 60)
+    tz_offset = "%s%02d:%02d" % (flag, hh, mm)
+    return tz_offset
+
+
+def init_logger():
+    """Install a colorizing log-record factory and a stdout handler on the root logger."""
+    global logger
+
+    tz_offset = get_tz_offset()
+
+    orig_record_factory = logging.getLogRecordFactory()
+    log_colors = {
+        logging.DEBUG: "\033[1;34m",  # blue
+        logging.INFO: "\033[1;32m",  # green
+        logging.WARNING: "\033[1;35m",  # magenta
+        logging.ERROR: "\033[1;31m",  # red
+        logging.CRITICAL: "\033[1;41m",  # red background
+    }
+
+    def get_message(ori):
+        msg = str(ori.msg)
+        if ori.args:
+            msg = msg % ori.args
+        msg = "{}{}{}".format(log_colors[ori.levelno], msg, "\033[0m")
+        return msg
+
+    def record_factory(*args, **kwargs):
+        record = orig_record_factory(*args, **kwargs)
+        record.getMessage = types.MethodType(get_message, record)
+        return record
+
+    logging.setLogRecordFactory(record_factory)
+
+    root = logging.getLogger()
+    root.setLevel(logging.DEBUG)
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setLevel(logging.DEBUG)
+    handler.setFormatter(
+        fmt=logging.Formatter('[%(asctime)s.%(msecs)03d {}][%(levelname)s][%(message)s]'.format(tz_offset),
+                              datefmt='%Y/%m/%d %H:%M:%S'))
+    root.addHandler(handler)
+    logger = root
+
+
+init_logger()
+
+
+def wrap_run_time(func):
+    """Decorator that logs, at debug level, the wall-clock time spent in `func`."""
+    def wrap_func(*args, **kwargs):
+        bg = time.time()
+        r = func(*args, **kwargs)
+        logger.debug('Time cost {:.3f}s'.format(time.time() - bg))
+        return r
+
+    return wrap_func
+
+
+@wrap_run_time
+def run_cmd(cmd):
+    """Run `cmd` (an argv list) to completion and return `(stdout, stderr, returncode)`; output is bytes."""
+    logger.debug("RUN CMD:\n{}\n".format(' '.join(cmd)))
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+    stdout, stderr = proc.communicate()
+    # stderr.decode('utf-8')
+    return stdout, stderr, proc.returncode
+
+
+class Runner:
+    """Command-line driver: record perf samples while a workload runs and/or convert them to an llvm profile."""
+    def __init__(self):
+        usage = """
+1. compile TiFlash with cmake option `-DENABLE_LLVM_PGO=ON -DENABLE_LLVM_PGO_USE_SAMPLE=ON`
+2. compile https://github.com/google/autofdo and get binary `create_llvm_prof` for converting perf data to llvm profile data
+3. start TiFlash process and get `<pid>`
+4. prepare workload scripts file
+5. run `python3 perf-tpch.py --perf --pid <pid> --workload <workload-script-path> --convert-llvm --convert-tool <create_llvm_prof-path> --binary <tiflash-binary-path>`
+6. get llvm perf file(`tiflash.llvm.code.prof` by default)
+7. compile TiFlash with env `TIFLASH_LLVM_PROFDATA=<llvm-profile-data-path>` and cmake option `-DENABLE_LLVM_PGO=ON -DENABLE_LLVM_PGO_USE_SAMPLE=ON`
+8. re-run workload and compare result
+"""
+        parser = argparse.ArgumentParser(
+            description="Auto FDO tools", formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+            usage=usage)
+        parser.add_argument(
+            '--perf', help='run perf with workload', action='store_true')
+        parser.add_argument(
+            '--convert-llvm', help='convert linux perf data to llvm profile data', action='store_true')
+
+        parser.add_argument(
+            '--workload', help='absolute path of workload script', required=False)
+        parser.add_argument(
+            '--pid', help='pid of TiFlash process', required=False)
+        parser.add_argument(
+            '--output', help='output file of perf data', required=False)
+        parser.add_argument(
+            '--convert-tool', help='tool to convert linux perf data to llvm profile data',)
+        parser.add_argument(
+            '--input-perf-file', help='input linux perf data file path')
+        parser.add_argument(
+            '--binary', help='binary to run workload')
+        parser.add_argument(
+            '--output-llvm-prof', help='output llvm profile data path', default='tiflash.llvm.code.prof')
+        self.args = parser.parse_args()
+        self.linux_perf_data = None
+
+    def run(self):
+        """Run the perf step and/or the conversion step, as selected by `--perf` / `--convert-llvm`."""
+        if self.args.perf:
+            self.run_perf()
+        if self.args.convert_llvm:
+            self.convert_llvm_perf()
+
+    def convert_llvm_perf(self):
+        """Convert linux perf data to `--output-llvm-prof` with `--convert-tool`; prefers data recorded by `run_perf`."""
+        assert self.args.convert_tool
+        if self.linux_perf_data is None:
+            assert self.args.input_perf_file
+        else:
+            self.args.input_perf_file = self.linux_perf_data
+
+        assert self.args.output_llvm_prof
+
+        assert self.args.binary
+        logger.info('start to convert linux perf data `{}` to llvm profile data `{}`'.format(
+            self.args.input_perf_file, self.args.output_llvm_prof))
+        stdout, stderr, e = run_cmd([self.args.convert_tool, '--profile', '{}'.format(self.args.input_perf_file),
+                                     '--binary', "{}".format(self.args.binary),
+                                     '--out', '{}'.format(self.args.output_llvm_prof)])
+        logger.info(
+            'finish convert. stdout `{}`, stderr `{}`'.format(stdout.decode('utf-8'), stderr.decode('utf-8')))
+        assert e == 0
+
+    def run_perf(self):
+        """Attach `perf record` to `--pid`, run the `--workload` script, stop perf and remember the data file."""
+        assert self.args.pid
+        assert self.args.workload
+
+        pid = self.args.pid
+        output = 'tiflash.perf.data' if self.args.output is None else self.args.output
+        logger.info('using output file `{}`'.format(output))
+
+        def workload():
+            # git clone git@github.com:pingcap/go-tpc.git
+            # cd go-tpc
+            # make build
+            # bin/go-tpc tpch run --queries q1 --host {} -P {} --db {} --count 1
+            logger.info('start to run workload `{}`'.format(
+                self.args.workload))
+            stdout, stderr, err = run_cmd([self.args.workload])
+            logger.info('finish workload `{}`. stdout `{}`, stderr `{}`'.format(
+                self.args.workload, stdout.decode('utf-8'), stderr.decode('utf-8')))
+            assert err == 0
+        perf_cmd = ["perf", "record", "-p", "{}".format(
+            pid), "-e", "cycles:up", "-j", "any,u", "-a", "-o", "{}".format(output)]
+        logger.info("start perf with cmd `{}`".format(' '.join(perf_cmd)))
+        perf_proc = subprocess.Popen(
+            perf_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        # Always stop perf, even when the workload fails, so it is not left attached to TiFlash.
+        try:
+            workload()
+        finally:
+            perf_proc.send_signal(signal.SIGTERM)
+            stdout, stderr = perf_proc.communicate()
+        logger.info(
+            "stop perf. stdout `{}`, stderr `{}`".format(stdout.decode('utf-8'), stderr.decode('utf-8')))
+        _ = perf_proc.wait()
+        # check file exists
+        with open(output, 'r') as f:
+            f.close()
+        self.linux_perf_data = output
+
+
+def main():
+    Runner().run()
+
+
+if __name__ == '__main__':
+    main()