diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst index 2e1bb48af8c88e..ed8de888b3bc21 100644 --- a/Doc/howto/perf_profiling.rst +++ b/Doc/howto/perf_profiling.rst @@ -155,6 +155,9 @@ active since the start of the Python interpreter, you can use the `-Xperf` optio $ python -Xperf my_script.py +You can also set the :envvar:`PYTHONPERFSUPPORT` to a nonzero value to activate perf +profiling mode globally. + There is also support for dynamically activating and deactivating the perf profiling mode by using the APIs in the :mod:`sys` module: diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 5ecc882d818fce..24078ded8bb988 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -582,6 +582,8 @@ Miscellaneous options .. versionadded:: 3.11 The ``-X frozen_modules`` option. + .. versionadded:: 3.12 + The ``-X perf`` option. Options you shouldn't use diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py index c2aad85b652e35..f587995b008f68 100644 --- a/Lib/test/test_perf_profiler.py +++ b/Lib/test/test_perf_profiler.py @@ -58,7 +58,7 @@ def baz(): script = make_script(script_dir, "perftest", code) with subprocess.Popen( [sys.executable, "-Xperf", script], - universal_newlines=True, + text=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE, ) as process: diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c index 02206b2786c87f..2cbe3741f26fbc 100644 --- a/Objects/perf_trampoline.c +++ b/Objects/perf_trampoline.c @@ -284,12 +284,23 @@ new_code_arena(void) void *start = &_Py_trampoline_func_start; void *end = &_Py_trampoline_func_end; size_t code_size = end - start; + // TODO: Check the effect of alignment of the code chunks. Initial investigation + // showed that this has no effect on performance in x86-64 or aarch64 and the current + // version has the advantage that the unwinder in GDB can unwind across JIT-ed code. 
+ // + // We should check the values in the future and see if there is a + // measurable performance improvement by rounding trampolines up to 32-bit + // or 64-bit alignment. size_t n_copies = mem_size / code_size; for (size_t i = 0; i < n_copies; i++) { memcpy(memory + i * code_size, start, code_size * sizeof(char)); } // Some systems may prevent us from creating executable code on the fly. + // TODO: Call icache invalidation intrinsics if available: + // __builtin___clear_cache/__clear_cache (depending if clang/gcc). This is + // technically not necessary but we could be missing something so better be + // safe. int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); if (res == -1) { PyErr_SetFromErrno(PyExc_OSError);