From dc8fdf5fd5f572e87152883f15f35c354fa4b4bf Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 16 Aug 2023 16:26:43 -0700 Subject: [PATCH] gh-106581: Split `CALL_PY_EXACT_ARGS` into uops (#107760) * Split `CALL_PY_EXACT_ARGS` into uops This is only the first step for doing `CALL` in Tier 2. The next step involves tracing into the called code object and back. After that we'll have to do the remaining `CALL` specialization. Finally we'll have to deal with `KW_NAMES`. Note: this moves setting `frame->return_offset` directly in front of `DISPATCH_INLINED()`, to make it easier to move it into `_PUSH_FRAME`. --- Include/internal/pycore_opcode_metadata.h | 47 +++++++++- Lib/test/test_capi/test_misc.py | 17 ++++ Python/abstract_interp_cases.c.h | 28 ++++++ Python/bytecodes.c | 82 ++++++++++++++---- Python/ceval.c | 6 +- Python/ceval_macros.h | 5 ++ Python/executor.c | 2 + Python/executor_cases.c.h | 98 +++++++++++++++++++-- Python/generated_cases.c.h | 101 ++++++++++++++++------ Python/optimizer.c | 9 ++ Tools/cases_generator/flags.py | 2 +- Tools/cases_generator/generate_cases.py | 34 ++++++-- Tools/cases_generator/instructions.py | 37 +++----- Tools/cases_generator/stacking.py | 60 ++++++++----- 14 files changed, 412 insertions(+), 116 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 7dbd78b32a8957..afe8aa172b703b 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -52,10 +52,16 @@ #define _ITER_CHECK_RANGE 328 #define _IS_ITER_EXHAUSTED_RANGE 329 #define _ITER_NEXT_RANGE 330 -#define _POP_JUMP_IF_FALSE 331 -#define _POP_JUMP_IF_TRUE 332 -#define JUMP_TO_TOP 333 -#define INSERT 334 +#define _CHECK_PEP_523 331 +#define _CHECK_FUNCTION_EXACT_ARGS 332 +#define _CHECK_STACK_SPACE 333 +#define _INIT_CALL_PY_EXACT_ARGS 334 +#define _PUSH_FRAME 335 +#define _POP_JUMP_IF_FALSE 336 +#define _POP_JUMP_IF_TRUE 337 +#define JUMP_TO_TOP 338 +#define SAVE_CURRENT_IP 339 +#define INSERT 340 extern int _PyOpcode_num_popped(int opcode, int oparg, bool jump); #ifdef NEED_OPCODE_METADATA @@ -473,6 +479,16 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) { return oparg + 2; case CALL_BOUND_METHOD_EXACT_ARGS: return oparg + 2; + case _CHECK_PEP_523: + return 0; + case _CHECK_FUNCTION_EXACT_ARGS: + return oparg + 2; + case _CHECK_STACK_SPACE: + return oparg + 2; + case _INIT_CALL_PY_EXACT_ARGS: + return oparg + 2; + case _PUSH_FRAME: + return 1; case CALL_PY_EXACT_ARGS: return oparg + 2; case CALL_PY_WITH_DEFAULTS: @@ -561,6 +577,8 @@ int _PyOpcode_num_popped(int opcode, int oparg, bool jump) { return 0; case SAVE_IP: return 0; + case SAVE_CURRENT_IP: + return 0; case EXIT_TRACE: return 0; case INSERT: @@ -987,6 +1005,16 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { return 1; case CALL_BOUND_METHOD_EXACT_ARGS: return 1; + case _CHECK_PEP_523: + return 0; + case _CHECK_FUNCTION_EXACT_ARGS: + return oparg + 2; + case _CHECK_STACK_SPACE: + return oparg + 2; + case _INIT_CALL_PY_EXACT_ARGS: + return 1; + case _PUSH_FRAME: + return 1; case CALL_PY_EXACT_ARGS: return 1; case CALL_PY_WITH_DEFAULTS: @@ -1075,6 +1103,8 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { return 0; case SAVE_IP: return 0; + case SAVE_CURRENT_IP: + return 0; case EXIT_TRACE: return 0; case INSERT: @@ -1088,6 +1118,7 @@ int _PyOpcode_num_pushed(int opcode, int oparg, bool jump) { enum InstructionFormat { INSTR_FMT_IB, INSTR_FMT_IBC, + INSTR_FMT_IBC0, INSTR_FMT_IBC00, 
INSTR_FMT_IBC000, INSTR_FMT_IBC00000000, @@ -1132,6 +1163,7 @@ struct opcode_macro_expansion { #define OPARG_CACHE_4 4 #define OPARG_TOP 5 #define OPARG_BOTTOM 6 +#define OPARG_SAVE_IP 7 #define OPCODE_METADATA_FMT(OP) (_PyOpcode_opcode_metadata[(OP)].instr_format) #define SAME_OPCODE_METADATA(OP1, OP2) \ @@ -1474,6 +1506,7 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[OPCODE_MACRO_EXPAN [GET_YIELD_FROM_ITER] = { .nuops = 1, .uops = { { GET_YIELD_FROM_ITER, 0, 0 } } }, [WITH_EXCEPT_START] = { .nuops = 1, .uops = { { WITH_EXCEPT_START, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { PUSH_EXC_INFO, 0, 0 } } }, + [CALL_PY_EXACT_ARGS] = { .nuops = 7, .uops = { { _CHECK_PEP_523, 0, 0 }, { _CHECK_FUNCTION_EXACT_ARGS, 2, 1 }, { _CHECK_STACK_SPACE, 0, 0 }, { _INIT_CALL_PY_EXACT_ARGS, 0, 0 }, { SAVE_IP, 7, 3 }, { SAVE_CURRENT_IP, 0, 0 }, { _PUSH_FRAME, 0, 0 } } }, [CALL_NO_KW_TYPE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TYPE_1, 0, 0 } } }, [CALL_NO_KW_STR_1] = { .nuops = 1, .uops = { { CALL_NO_KW_STR_1, 0, 0 } } }, [CALL_NO_KW_TUPLE_1] = { .nuops = 1, .uops = { { CALL_NO_KW_TUPLE_1, 0, 0 } } }, @@ -1531,9 +1564,15 @@ const char * const _PyOpcode_uop_name[OPCODE_UOP_NAME_SIZE] = { [_ITER_CHECK_RANGE] = "_ITER_CHECK_RANGE", [_IS_ITER_EXHAUSTED_RANGE] = "_IS_ITER_EXHAUSTED_RANGE", [_ITER_NEXT_RANGE] = "_ITER_NEXT_RANGE", + [_CHECK_PEP_523] = "_CHECK_PEP_523", + [_CHECK_FUNCTION_EXACT_ARGS] = "_CHECK_FUNCTION_EXACT_ARGS", + [_CHECK_STACK_SPACE] = "_CHECK_STACK_SPACE", + [_INIT_CALL_PY_EXACT_ARGS] = "_INIT_CALL_PY_EXACT_ARGS", + [_PUSH_FRAME] = "_PUSH_FRAME", [_POP_JUMP_IF_FALSE] = "_POP_JUMP_IF_FALSE", [_POP_JUMP_IF_TRUE] = "_POP_JUMP_IF_TRUE", [JUMP_TO_TOP] = "JUMP_TO_TOP", + [SAVE_CURRENT_IP] = "SAVE_CURRENT_IP", [INSERT] = "INSERT", }; #endif // NEED_OPCODE_METADATA diff --git a/Lib/test/test_capi/test_misc.py b/Lib/test/test_capi/test_misc.py index c81212202d9ef2..3dfbfdc26e7416 100644 --- a/Lib/test/test_capi/test_misc.py +++ b/Lib/test/test_capi/test_misc.py @@ -2618,6 +2618,23 @@ def testfunc(it): with self.assertRaises(StopIteration): next(it) + def test_call_py_exact_args(self): + def testfunc(n): + def dummy(x): + return x+1 + for i in range(n): + dummy(i) + + opt = _testinternalcapi.get_uop_optimizer() + with temporary_optimizer(opt): + testfunc(10) + + ex = get_first_executor(testfunc) + self.assertIsNotNone(ex) + uops = {opname for opname, _, _ in ex} + self.assertIn("_PUSH_FRAME", uops) + + if __name__ == "__main__": unittest.main() diff --git a/Python/abstract_interp_cases.c.h b/Python/abstract_interp_cases.c.h index 6bfcf534646b1e..eef071119bcd84 100644 --- a/Python/abstract_interp_cases.c.h +++ b/Python/abstract_interp_cases.c.h @@ -612,6 +612,30 @@ break; } + case _CHECK_PEP_523: { + break; + } + + case _CHECK_FUNCTION_EXACT_ARGS: { + break; + } + + case _CHECK_STACK_SPACE: { + break; + } + + case _INIT_CALL_PY_EXACT_ARGS: { + STACK_SHRINK(oparg); + STACK_SHRINK(1); + PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1)), true); + break; + } + + case _PUSH_FRAME: { + PARTITIONNODE_OVERWRITE((_Py_PARTITIONNODE_t *)PARTITIONNODE_NULLROOT, PEEK(-(-1)), true); + break; + } + case CALL_NO_KW_TYPE_1: { STACK_SHRINK(oparg); STACK_SHRINK(1); @@ -751,6 +775,10 @@ break; } + case SAVE_CURRENT_IP: { + break; + } + case EXIT_TRACE: { break; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 2a5ad2c942fb3e..ae2923c65b3308 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -956,13 +956,13 @@ dummy_func( { PyGenObject *gen 
= (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -995,13 +995,13 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2587,7 +2587,6 @@ dummy_func( DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -2595,6 +2594,7 @@ dummy_func( SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -2949,32 +2949,72 @@ dummy_func( GO_TO_INSTRUCTION(CALL_PY_EXACT_ARGS); } - inst(CALL_PY_EXACT_ARGS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { - ASSERT_KWNAMES_IS_NULL(); + op(_CHECK_PEP_523, (--)) { DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; - } + } + + op(_CHECK_FUNCTION_EXACT_ARGS, (func_version/2, callable, self_or_null, unused[oparg] -- callable, self_or_null, unused[oparg])) { + ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(!PyFunction_Check(callable), CALL); PyFunctionObject *func = (PyFunctionObject *)callable; DEOPT_IF(func->func_version != func_version, CALL); PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + + op(_CHECK_STACK_SPACE, (callable, unused, unused[oparg] -- callable, unused, unused[oparg])) { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + } + + op(_INIT_CALL_PY_EXACT_ARGS, (callable, self_or_null, args[oparg] -- new_frame: _PyInterpreterFrame*)) { + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); for (int i = 0; i < argcount; i++) { new_frame->localsplus[i] = args[i]; } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); + } + + // The 'unused' output effect represents the return value + // (which will be pushed when the frame returns). + // It is needed so CALL_PY_EXACT_ARGS matches its family. 
+ op(_PUSH_FRAME, (new_frame: _PyInterpreterFrame* -- unused)) { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + assert(tstate->interp->eval_frame == NULL); + _PyFrame_SetStackPointer(frame, stack_pointer); + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + ERROR_IF(_Py_EnterRecursivePy(tstate), exit_unwind); + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif } + macro(CALL_PY_EXACT_ARGS) = + unused/1 + // Skip over the counter + _CHECK_PEP_523 + + _CHECK_FUNCTION_EXACT_ARGS + + _CHECK_STACK_SPACE + + _INIT_CALL_PY_EXACT_ARGS + + SAVE_IP + // Tier 2 only; special-cased oparg + SAVE_CURRENT_IP + // Sets frame->prev_instr + _PUSH_FRAME; + inst(CALL_PY_WITH_DEFAULTS, (unused/1, func_version/2, callable, self_or_null, args[oparg] -- unused)) { ASSERT_KWNAMES_IS_NULL(); DEOPT_IF(tstate->interp->eval_frame, CALL); @@ -3735,6 +3775,16 @@ dummy_func( frame->prev_instr = ip_offset + oparg; } + op(SAVE_CURRENT_IP, (--)) { + #if TIER_ONE + frame->prev_instr = next_instr - 1; + #endif + #if TIER_TWO + // Relies on a preceding SAVE_IP + frame->prev_instr--; + #endif + } + op(EXIT_TRACE, (--)) { frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); diff --git a/Python/ceval.c b/Python/ceval.c index b966399a342d08..26e741ed7c7547 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -602,11 +602,6 @@ int _Py_CheckRecursiveCallPy( return 0; } -static inline int _Py_EnterRecursivePy(PyThreadState *tstate) { - return (tstate->py_recursion_remaining-- <= 0) && - _Py_CheckRecursiveCallPy(tstate); -} - static inline void _Py_LeaveRecursiveCallPy(PyThreadState *tstate) { tstate->py_recursion_remaining++; @@ -770,6 +765,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #endif { +#define TIER_ONE 1 #include "generated_cases.c.h" /* INSTRUMENTED_LINE has to be here, rather than in bytecodes.c, diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 8dc8b754485856..5e2db1e0b394e6 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -364,3 +364,8 @@ static const convertion_func_ptr CONVERSION_FUNCTIONS[4] = { #else #define _Py_atomic_load_relaxed_int32(ATOMIC_VAL) _Py_atomic_load_relaxed(ATOMIC_VAL) #endif + +static inline int _Py_EnterRecursivePy(PyThreadState *tstate) { + return (tstate->py_recursion_remaining-- <= 0) && + _Py_CheckRecursiveCallPy(tstate); +} diff --git a/Python/executor.c b/Python/executor.c index 4a18618c0c6c0c..5a571e6da4673f 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -81,6 +81,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject OBJECT_STAT_INC(optimization_uops_executed); switch (opcode) { +#define TIER_TWO 2 #include "executor_cases.c.h" default: @@ -106,6 +107,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject pop_2_error: STACK_SHRINK(1); pop_1_error: +pop_1_exit_unwind: STACK_SHRINK(1); error: // On ERROR_IF we return NULL as the frame. 
diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 85d27777423abd..b3dd3133530562 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -103,7 +103,6 @@ } case TO_BOOL: { - static_assert(INLINE_CACHE_ENTRIES_TO_BOOL == 3, "incorrect cache size"); PyObject *value; PyObject *res; value = stack_pointer[-1]; @@ -363,7 +362,6 @@ } case BINARY_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *res; @@ -557,7 +555,6 @@ } case STORE_SUBSCR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_SUBSCR == 1, "incorrect cache size"); PyObject *sub; PyObject *container; PyObject *v; @@ -862,7 +859,6 @@ } case UNPACK_SEQUENCE: { - static_assert(INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE == 1, "incorrect cache size"); PyObject *seq; seq = stack_pointer[-1]; #if ENABLE_SPECIALIZATION @@ -950,7 +946,6 @@ } case STORE_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_STORE_ATTR == 4, "incorrect cache size"); PyObject *owner; PyObject *v; owner = stack_pointer[-1]; @@ -1061,7 +1056,6 @@ } case LOAD_GLOBAL: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size"); PyObject *res; PyObject *null = NULL; #if ENABLE_SPECIALIZATION @@ -1554,7 +1548,6 @@ } case LOAD_ATTR: { - static_assert(INLINE_CACHE_ENTRIES_LOAD_ATTR == 9, "incorrect cache size"); PyObject *owner; PyObject *attr; PyObject *self_or_null = NULL; @@ -1650,7 +1643,6 @@ } case COMPARE_OP: { - static_assert(INLINE_CACHE_ENTRIES_COMPARE_OP == 1, "incorrect cache size"); PyObject *right; PyObject *left; PyObject *res; @@ -2155,6 +2147,84 @@ break; } + case _CHECK_PEP_523: { + DEOPT_IF(tstate->interp->eval_frame, CALL); + break; + } + + case _CHECK_FUNCTION_EXACT_ARGS: { + PyObject *self_or_null; + PyObject *callable; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + uint32_t func_version = (uint32_t)operand; + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + break; + } + + case _CHECK_STACK_SPACE: { + PyObject *callable; + callable = stack_pointer[-2 - oparg]; + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); + break; + } + + case _INIT_CALL_PY_EXACT_ARGS: { + PyObject **args; + PyObject *self_or_null; + PyObject *callable; + _PyInterpreterFrame *new_frame; + args = stack_pointer - oparg; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } + STACK_SHRINK(oparg); + STACK_SHRINK(1); + stack_pointer[-1] = (PyObject *)new_frame; + break; + } + + case _PUSH_FRAME: { + _PyInterpreterFrame *new_frame; + new_frame = (_PyInterpreterFrame *)stack_pointer[-1]; + STACK_SHRINK(1); + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. 
+ frame->return_offset = 0; + assert(tstate->interp->eval_frame == NULL); + _PyFrame_SetStackPointer(frame, stack_pointer); + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + if (_Py_EnterRecursivePy(tstate)) goto pop_1_exit_unwind; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + break; + } + case CALL_NO_KW_TYPE_1: { PyObject **args; PyObject *null; @@ -2656,7 +2726,6 @@ } case BINARY_OP: { - static_assert(INLINE_CACHE_ENTRIES_BINARY_OP == 1, "incorrect cache size"); PyObject *rhs; PyObject *lhs; PyObject *res; @@ -2726,6 +2795,17 @@ break; } + case SAVE_CURRENT_IP: { + #if TIER_ONE + frame->prev_instr = next_instr - 1; + #endif + #if TIER_TWO + // Relies on a preceding SAVE_IP + frame->prev_instr--; + #endif + break; + } + case EXIT_TRACE: { frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 2661a39e047c4d..11d560a6e77adf 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -1191,13 +1191,13 @@ { PyGenObject *gen = (PyGenObject *)receiver; _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } if (Py_IsNone(v) && PyIter_Check(receiver)) { @@ -1237,13 +1237,13 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, SEND); STAT_INC(SEND, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; STACK_SHRINK(1); _PyFrame_StackPush(gen_frame, v); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; tstate->exc_info = &gen->gi_exc_state; SKIP_OVER(INLINE_CACHE_ENTRIES_SEND); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); } @@ -3343,7 +3343,6 @@ DEOPT_IF(gen->gi_frame_state >= FRAME_EXECUTING, FOR_ITER); STAT_INC(FOR_ITER, hit); _PyInterpreterFrame *gen_frame = (_PyInterpreterFrame *)gen->gi_iframe; - frame->return_offset = oparg; _PyFrame_StackPush(gen_frame, Py_None); gen->gi_frame_state = FRAME_EXECUTING; gen->gi_exc_state.previous_item = tstate->exc_info; @@ -3351,6 +3350,7 @@ SKIP_OVER(INLINE_CACHE_ENTRIES_FOR_ITER); assert(next_instr[oparg].op.code == END_FOR || next_instr[oparg].op.code == INSTRUMENTED_END_FOR); + frame->return_offset = oparg; DISPATCH_INLINED(gen_frame); STACK_GROW(1); } @@ -3764,38 +3764,83 @@ TARGET(CALL_PY_EXACT_ARGS) { PREDICTED(CALL_PY_EXACT_ARGS); - PyObject **args; PyObject *self_or_null; PyObject *callable; - args = stack_pointer - oparg; + PyObject **args; + _PyInterpreterFrame *new_frame; + // _CHECK_PEP_523 + { + DEOPT_IF(tstate->interp->eval_frame, CALL); + } + // _CHECK_FUNCTION_EXACT_ARGS self_or_null = stack_pointer[-1 - oparg]; callable = stack_pointer[-2 - oparg]; - uint32_t func_version = read_u32(&next_instr[1].cache); - ASSERT_KWNAMES_IS_NULL(); - DEOPT_IF(tstate->interp->eval_frame, CALL); - int argcount = oparg; - if (self_or_null != NULL) { - args--; - argcount++; + { + uint32_t func_version = 
read_u32(&next_instr[1].cache); + ASSERT_KWNAMES_IS_NULL(); + DEOPT_IF(!PyFunction_Check(callable), CALL); + PyFunctionObject *func = (PyFunctionObject *)callable; + DEOPT_IF(func->func_version != func_version, CALL); + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(code->co_argcount != oparg + (self_or_null != NULL), CALL); + } + // _CHECK_STACK_SPACE + callable = stack_pointer[-2 - oparg]; + { + PyFunctionObject *func = (PyFunctionObject *)callable; + PyCodeObject *code = (PyCodeObject *)func->func_code; + DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); } - DEOPT_IF(!PyFunction_Check(callable), CALL); - PyFunctionObject *func = (PyFunctionObject *)callable; - DEOPT_IF(func->func_version != func_version, CALL); - PyCodeObject *code = (PyCodeObject *)func->func_code; - DEOPT_IF(code->co_argcount != argcount, CALL); - DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize), CALL); - STAT_INC(CALL, hit); - _PyInterpreterFrame *new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); - for (int i = 0; i < argcount; i++) { - new_frame->localsplus[i] = args[i]; + // _INIT_CALL_PY_EXACT_ARGS + args = stack_pointer - oparg; + self_or_null = stack_pointer[-1 - oparg]; + callable = stack_pointer[-2 - oparg]; + { + int argcount = oparg; + if (self_or_null != NULL) { + args--; + argcount++; + } + STAT_INC(CALL, hit); + PyFunctionObject *func = (PyFunctionObject *)callable; + new_frame = _PyFrame_PushUnchecked(tstate, func, argcount); + for (int i = 0; i < argcount; i++) { + new_frame->localsplus[i] = args[i]; + } } - // Manipulate stack directly since we leave using DISPATCH_INLINED(). - STACK_SHRINK(oparg + 2); - SKIP_OVER(INLINE_CACHE_ENTRIES_CALL); - frame->return_offset = 0; - DISPATCH_INLINED(new_frame); + // SAVE_CURRENT_IP + next_instr += 3; + { + #if TIER_ONE + frame->prev_instr = next_instr - 1; + #endif + #if TIER_TWO + // Relies on a preceding SAVE_IP + frame->prev_instr--; + #endif + } + // _PUSH_FRAME STACK_SHRINK(oparg); - STACK_SHRINK(1); + STACK_SHRINK(2); + { + // Write it out explicitly because it's subtly different. + // Eventually this should be the only occurrence of this code. 
+ frame->return_offset = 0; + assert(tstate->interp->eval_frame == NULL); + _PyFrame_SetStackPointer(frame, stack_pointer); + new_frame->previous = frame; + CALL_STAT_INC(inlined_py_calls); + #if TIER_ONE + frame = cframe.current_frame = new_frame; + goto start_frame; + #endif + #if TIER_TWO + frame = tstate->cframe->current_frame = new_frame; + if (_Py_EnterRecursivePy(tstate)) goto pop_1_exit_unwind; + stack_pointer = _PyFrame_GetStackPointer(frame); + ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; + #endif + } } TARGET(CALL_PY_WITH_DEFAULTS) { diff --git a/Python/optimizer.c b/Python/optimizer.c index d3ac2424038ef9..559c4ae987263e 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -606,6 +606,10 @@ translate_bytecode_to_trace( case OPARG_BOTTOM: // Second half of super-instr oparg = orig_oparg & 0xF; break; + case OPARG_SAVE_IP: // op==SAVE_IP; oparg=next instr + oparg = INSTR_IP(instr + offset, code); + break; + default: fprintf(stderr, "opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n", @@ -615,6 +619,11 @@ translate_bytecode_to_trace( Py_FatalError("garbled expansion"); } ADD_TO_TRACE(expansion->uops[i].uop, oparg, operand); + if (expansion->uops[i].uop == _PUSH_FRAME) { + assert(i + 1 == nuops); + ADD_TO_TRACE(SAVE_IP, 0, 0); + goto done; + } } break; } diff --git a/Tools/cases_generator/flags.py b/Tools/cases_generator/flags.py index f7ebdeb0d65677..962f003b194dbd 100644 --- a/Tools/cases_generator/flags.py +++ b/Tools/cases_generator/flags.py @@ -92,7 +92,7 @@ def variable_used_unspecialized(node: parsing.Node, name: str) -> bool: if text == "#if": if ( i + 1 < len(node.tokens) - and node.tokens[i + 1].text == "ENABLE_SPECIALIZATION" + and node.tokens[i + 1].text in ("ENABLE_SPECIALIZATION", "TIER_ONE") ): skipping = True elif text in ("#else", "#endif"): diff --git a/Tools/cases_generator/generate_cases.py b/Tools/cases_generator/generate_cases.py index d991cb4690063d..f31b6658d6599e 100644 --- a/Tools/cases_generator/generate_cases.py +++ b/Tools/cases_generator/generate_cases.py @@ -25,6 +25,7 @@ PseudoInstruction, StackEffect, OverriddenInstructionPlaceHolder, + TIER_ONE, TIER_TWO, ) import parsing @@ -65,6 +66,7 @@ "OPARG_CACHE_4": 4, "OPARG_TOP": 5, "OPARG_BOTTOM": 6, + "OPARG_SAVE_IP": 7, } INSTR_FMT_PREFIX = "INSTR_FMT_" @@ -501,7 +503,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No if instr.kind == "inst" and instr.is_viable_uop(): # Construct a dummy Component -- input/output mappings are not used part = Component(instr, instr.active_caches) - self.write_macro_expansions(instr.name, [part]) + self.write_macro_expansions( + instr.name, [part], instr.cache_offset + ) elif instr.kind == "inst" and variable_used( instr.inst, "oparg1" ): @@ -511,7 +515,9 @@ def write_metadata(self, metadata_filename: str, pymetadata_filename: str) -> No self.write_super_expansions(instr.name) case parsing.Macro(): mac = self.macro_instrs[thing.name] - self.write_macro_expansions(mac.name, mac.parts) + self.write_macro_expansions( + mac.name, mac.parts, mac.cache_offset + ) case parsing.Pseudo(): pass case _: @@ -630,7 +636,9 @@ def add(name: str) -> None: if instr.kind == "op" and instr.is_viable_uop(): add(instr.name) - def write_macro_expansions(self, name: str, parts: MacroParts) -> None: + def write_macro_expansions( + self, name: str, parts: MacroParts, cache_offset: int + ) -> None: """Write the macro expansions for a macro-instruction.""" # TODO: Refactor to share code with write_cody(), is_viaible_uop(), 
etc. offset = 0 # Cache effect offset @@ -650,7 +658,10 @@ def write_macro_expansions(self, name: str, parts: MacroParts) -> None: ) return if not part.active_caches: - size, offset = OPARG_SIZES["OPARG_FULL"], 0 + if part.instr.name == "SAVE_IP": + size, offset = OPARG_SIZES["OPARG_SAVE_IP"], cache_offset + else: + size, offset = OPARG_SIZES["OPARG_FULL"], 0 else: # If this assert triggers, is_viable_uops() lied assert len(part.active_caches) == 1, (name, part.instr.name) @@ -753,7 +764,9 @@ def write_instructions( case parsing.Macro(): n_macros += 1 mac = self.macro_instrs[thing.name] - stacking.write_macro_instr(mac, self.out, self.families.get(mac.name)) + stacking.write_macro_instr( + mac, self.out, self.families.get(mac.name) + ) # self.write_macro(self.macro_instrs[thing.name]) case parsing.Pseudo(): pass @@ -789,7 +802,9 @@ def write_executor_instructions( n_instrs += 1 self.out.emit("") with self.out.block(f"case {thing.name}:"): - instr.write(self.out, tier=TIER_TWO) + stacking.write_single_instr( + instr, self.out, tier=TIER_TWO + ) if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit("break;") @@ -851,8 +866,13 @@ def write_instr(self, instr: Instruction) -> None: with self.out.block(f"TARGET({name})"): if instr.predicted: self.out.emit(f"PREDICTED({name});") - instr.write(self.out) + self.out.static_assert_family_size( + instr.name, instr.family, instr.cache_offset + ) + stacking.write_single_instr(instr, self.out, tier=TIER_ONE) if not instr.always_exits: + if instr.cache_offset: + self.out.emit(f"next_instr += {instr.cache_offset};") if instr.check_eval_breaker: self.out.emit("CHECK_EVAL_BREAKER();") self.out.emit(f"DISPATCH();") diff --git a/Tools/cases_generator/instructions.py b/Tools/cases_generator/instructions.py index a505df08fa265b..9143ae0db7be81 100644 --- a/Tools/cases_generator/instructions.py +++ b/Tools/cases_generator/instructions.py @@ -59,7 +59,7 @@ class Instruction: block_line: int # First line of block in original code # Computed by constructor - always_exits: bool + always_exits: str # If the block always exits, its last line; else "" has_deopt: bool cache_offset: int cache_effects: list[parsing.CacheEffect] @@ -120,13 +120,13 @@ def __init__(self, inst: parsing.InstDef): def is_viable_uop(self) -> bool: """Whether this instruction is viable as a uop.""" dprint: typing.Callable[..., None] = lambda *args, **kwargs: None - # if self.name.startswith("CALL"): - # dprint = print + if "FRAME" in self.name: + dprint = print if self.name == "EXIT_TRACE": return True # This has 'return frame' but it's okay if self.always_exits: - dprint(f"Skipping {self.name} because it always exits") + dprint(f"Skipping {self.name} because it always exits: {self.always_exits}") return False if len(self.active_caches) > 1: # print(f"Skipping {self.name} because it has >1 cache entries") @@ -140,23 +140,6 @@ def is_viable_uop(self) -> bool: res = False return res - def write(self, out: Formatter, tier: Tiers = TIER_ONE) -> None: - """Write one instruction, sans prologue and epilogue.""" - - # Write a static assertion that a family's cache size is correct - out.static_assert_family_size(self.name, self.family, self.cache_offset) - - # Write input stack effect variable declarations and initializations - stacking.write_single_instr(self, out, tier) - - # Skip the rest if the block always exits - if self.always_exits: - return - - # Write cache effect - if tier == TIER_ONE and self.cache_offset: - out.emit(f"next_instr += {self.cache_offset};") - def 
write_body( self, out: Formatter, @@ -341,16 +324,16 @@ def extract_block_text(block: parsing.Block) -> tuple[list[str], bool, int]: return blocklines, check_eval_breaker, block_line -def always_exits(lines: list[str]) -> bool: +def always_exits(lines: list[str]) -> str: """Determine whether a block always ends in a return/goto/etc.""" if not lines: - return False + return "" line = lines[-1].rstrip() # Indent must match exactly (TODO: Do something better) if line[:12] != " " * 12: - return False + return "" line = line[12:] - return line.startswith( + if line.startswith( ( "goto ", "return ", @@ -359,4 +342,6 @@ def always_exits(lines: list[str]) -> bool: "Py_UNREACHABLE()", "ERROR_IF(true, ", ) - ) + ): + return line + return "" diff --git a/Tools/cases_generator/stacking.py b/Tools/cases_generator/stacking.py index 31a21e026cb49c..8361eb99f88a7c 100644 --- a/Tools/cases_generator/stacking.py +++ b/Tools/cases_generator/stacking.py @@ -1,6 +1,7 @@ import dataclasses import typing +from flags import variable_used_unspecialized from formatting import ( Formatter, UNUSED, @@ -146,6 +147,8 @@ class EffectManager: # Track offsets from stack pointer min_offset: StackOffset final_offset: StackOffset + # Link to previous manager + pred: "EffectManager | None" = None def __init__( self, @@ -167,7 +170,8 @@ def __init__( self.pokes.append(StackItem(offset=self.final_offset.clone(), effect=eff)) self.final_offset.higher(eff) - if pred: + self.pred = pred + while pred: # Replace push(x) + pop(y) with copy(x, y). # Check that the sources and destinations are disjoint. sources: set[str] = set() @@ -192,6 +196,11 @@ def __init__( sources, destinations, ) + # See if we can get more copies of a earlier predecessor. + if self.peeks and not pred.pokes and not pred.peeks: + pred = pred.pred + else: + pred = None # Break def adjust_deeper(self, eff: StackEffect) -> None: for peek in self.peeks: @@ -295,6 +304,7 @@ def write_single_instr( [Component(instr, instr.active_caches)], out, tier, + 0, ) except AssertionError as err: raise AssertionError(f"Error writing instruction {instr.name}") from err @@ -303,37 +313,32 @@ def write_single_instr( def write_macro_instr( mac: MacroInstruction, out: Formatter, family: Family | None ) -> None: - parts = [part for part in mac.parts if isinstance(part, Component)] - - cache_adjust = 0 - for part in mac.parts: - match part: - case CacheEffect(size=size): - cache_adjust += size - case Component(instr=instr): - cache_adjust += instr.cache_offset - case _: - typing.assert_never(part) - + parts = [ + part + for part in mac.parts + if isinstance(part, Component) and part.instr.name != "SAVE_IP" + ] out.emit("") with out.block(f"TARGET({mac.name})"): if mac.predicted: out.emit(f"PREDICTED({mac.name});") - out.static_assert_family_size(mac.name, family, cache_adjust) + out.static_assert_family_size(mac.name, family, mac.cache_offset) try: - write_components(parts, out, TIER_ONE) + next_instr_is_set = write_components(parts, out, TIER_ONE, mac.cache_offset) except AssertionError as err: raise AssertionError(f"Error writing macro {mac.name}") from err - if cache_adjust: - out.emit(f"next_instr += {cache_adjust};") - out.emit("DISPATCH();") + if not parts[-1].instr.always_exits and not next_instr_is_set: + if mac.cache_offset: + out.emit(f"next_instr += {mac.cache_offset};") + out.emit("DISPATCH();") def write_components( parts: list[Component], out: Formatter, tier: Tiers, -) -> None: + cache_offset: int, +) -> bool: managers = get_managers(parts) all_vars: dict[str, 
StackEffect] = {} @@ -354,6 +359,7 @@ def write_components( for name, eff in all_vars.items(): out.declare(eff, None) + next_instr_is_set = False for mgr in managers: if len(parts) > 1: out.emit(f"// {mgr.instr.name}") @@ -374,13 +380,25 @@ def write_components( poke.as_stack_effect(lax=True), ) + if mgr.instr.name == "_PUSH_FRAME": + # Adjust stack to min_offset (input effects materialized) + out.stack_adjust(mgr.min_offset.deep, mgr.min_offset.high) + # Use clone() since adjust_inverse() mutates final_offset + mgr.adjust_inverse(mgr.final_offset.clone()) + + if mgr.instr.name == "SAVE_CURRENT_IP": + next_instr_is_set = True + if cache_offset: + out.emit(f"next_instr += {cache_offset};") + if len(parts) == 1: mgr.instr.write_body(out, 0, mgr.active_caches, tier) else: with out.block(""): mgr.instr.write_body(out, -4, mgr.active_caches, tier) - if mgr is managers[-1]: + if mgr is managers[-1] and not next_instr_is_set: + # TODO: Explain why this adjustment is needed. out.stack_adjust(mgr.final_offset.deep, mgr.final_offset.high) # Use clone() since adjust_inverse() mutates final_offset mgr.adjust_inverse(mgr.final_offset.clone()) @@ -392,6 +410,8 @@ def write_components( poke.effect, ) + return next_instr_is_set + def write_single_instr_for_abstract_interp( instr: Instruction, out: Formatter
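On the generator side, the stacking changes mean that a macro containing SAVE_CURRENT_IP adjusts next_instr before that uop's body runs (the body saves frame->prev_instr = next_instr - 1), and the usual trailing next_instr adjustment plus DISPATCH() is then suppressed. A simplified, hypothetical sketch of that decision (it ignores the always-exits check and uses stand-in names, not the real cases generator):

    def emit_macro(parts, cache_offset, emit):
        # 'parts' is the ordered list of uop names making up one macro instruction.
        next_instr_is_set = False
        for name in parts:
            if name == "SAVE_CURRENT_IP":
                # Advance past the cache entries *before* this body runs, because
                # the body records frame->prev_instr = next_instr - 1.
                next_instr_is_set = True
                if cache_offset:
                    emit(f"next_instr += {cache_offset};")
            emit(f"// body of {name}")
        if not next_instr_is_set:
            # Normal macros adjust next_instr at the end and then dispatch.
            if cache_offset:
                emit(f"next_instr += {cache_offset};")
            emit("DISPATCH();")

    emit_macro(["_CHECK_PEP_523", "_INIT_CALL_PY_EXACT_ARGS",
                "SAVE_CURRENT_IP", "_PUSH_FRAME"], 3, print)

This is why the generated TARGET(CALL_PY_EXACT_ARGS) above emits next_instr += 3; under the SAVE_CURRENT_IP comment and ends inside the _PUSH_FRAME block rather than with a trailing DISPATCH().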