From 7c75cbfcaa69d9995799254c7610adb1a0378679 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Sat, 6 May 2023 22:11:33 -0700 Subject: [PATCH] [wasm] Add Vector128 and PackedSimd support to the jiterpreter; add PackedSimd to the interpreter (#82773) * Add PackedSIMD support to the interpreter (off by default) * Add SIMD support to the jiterpreter * Add runtime options governing interpreter vector128 and packedsimd support * Add some R4 vector128 operations to the interpreter * Fix jiterpreter MINT_POPCNT_I8 implementation * Enable compiling the runtime with wasm simd support so that intrinsics can be used * Add browser-bench measurements for packing vector128 --- src/mono/CMakeLists.txt | 1 + src/mono/mono/mini/interp/interp-internals.h | 8 +- .../mono/mini/interp/interp-simd-intrins.def | 266 ++++++++---- src/mono/mono/mini/interp/interp-simd.c | 137 +++++- src/mono/mono/mini/interp/interp-simd.h | 6 + src/mono/mono/mini/interp/interp.c | 38 ++ src/mono/mono/mini/interp/mintops.h | 18 +- src/mono/mono/mini/interp/simd-methods.def | 20 + src/mono/mono/mini/interp/transform-simd.c | 266 +++++++++++- src/mono/mono/utils/options-def.h | 8 + src/mono/sample/wasm/browser-bench/Vector.cs | 21 + src/mono/wasm/runtime/CMakeLists.txt | 2 + src/mono/wasm/runtime/cwraps.ts | 4 + src/mono/wasm/runtime/genmintops.py | 62 ++- .../wasm/runtime/jiterpreter-interp-entry.ts | 9 +- src/mono/wasm/runtime/jiterpreter-jit-call.ts | 17 +- src/mono/wasm/runtime/jiterpreter-opcodes.ts | 246 +++++++++++ src/mono/wasm/runtime/jiterpreter-support.ts | 141 +++++-- .../runtime/jiterpreter-trace-generator.ts | 395 +++++++++++++++++- src/mono/wasm/runtime/jiterpreter.ts | 50 ++- .../runtime/wasm-simd-feature-detect.wasm | Bin 0 -> 39 bytes .../wasm/runtime/wasm-simd-feature-detect.wat | 6 + src/mono/wasm/wasm.proj | 8 +- 23 files changed, 1533 insertions(+), 196 deletions(-) create mode 100644 src/mono/wasm/runtime/wasm-simd-feature-detect.wasm create mode 100644 
src/mono/wasm/runtime/wasm-simd-feature-detect.wat diff --git a/src/mono/CMakeLists.txt b/src/mono/CMakeLists.txt index 92a0ac8ea82a5..5ff644b7cd57d 100644 --- a/src/mono/CMakeLists.txt +++ b/src/mono/CMakeLists.txt @@ -270,6 +270,7 @@ elseif(CLR_CMAKE_HOST_OS STREQUAL "emscripten") add_compile_options(-Wno-strict-prototypes) add_compile_options(-Wno-unused-but-set-variable) add_compile_options(-Wno-single-bit-bitfield-constant-conversion) + add_compile_options(-msimd128) set(DISABLE_EXECUTABLES 1) # FIXME: Is there a cmake option for this ? set(DISABLE_SHARED_LIBS 1) diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index a2bff18e2ef32..dc38222a8ff38 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -102,7 +102,7 @@ typedef enum { #define PROFILE_INTERP 0 -#if !HOST_BROWSER && __GNUC__ +#if __GNUC__ #define INTERP_ENABLE_SIMD #endif @@ -342,6 +342,12 @@ mono_jiterp_stackval_from_data (MonoType *type, stackval *result, const void *da gpointer mono_jiterp_frame_data_allocator_alloc (FrameDataAllocator *stack, InterpFrame *frame, int size); +gpointer +mono_jiterp_get_simd_intrinsic (int arity, int index); + +int +mono_jiterp_get_simd_opcode (int arity, int index); + #endif static inline int diff --git a/src/mono/mono/mini/interp/interp-simd-intrins.def b/src/mono/mono/mini/interp/interp-simd-intrins.def index 57bbba1717d7b..9ed37a34b1287 100644 --- a/src/mono/mono/mini/interp/interp-simd-intrins.def +++ b/src/mono/mono/mini/interp/interp-simd-intrins.def @@ -1,81 +1,185 @@ -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction) 
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, 
interp_v128_i4_op_right_shift) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, interp_v128_op_ones_complement) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, 
interp_v128_i8_create_scalar) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb) - -INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, interp_v128_conditional_select) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle) +// FIXME: SIMD causes compile errors on WASI +#ifdef HOST_BROWSER +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_P +#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_V +#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_I_V +#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, 
c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VV +#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PP(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VI +#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PP(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VVV +#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PPP(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#else // HOST_BROWSER +#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) +#endif // HOST_BROWSER + +// The third argument is the wasm opcode that corresponds to this simd intrinsic, if any. +// Specify 0 if there is no exact 1:1 mapping (the opcode can still be implemented manually in the jiterpreter.) 
+ +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition, 110) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition, 142) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition, 174) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_ADD, interp_v128_r4_op_addition, 228) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction, 113) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction, 145) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction, 177) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_SUB, interp_v128_r4_op_subtraction, 229) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and, 78) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or, 80) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality, 0) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or, 81) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply, 149) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply, 181) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_MULTIPLY, interp_v128_r4_op_multiply, 230) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_DIVISION, interp_v128_r4_op_division, 231) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation, 97) +INTERP_SIMD_INTRINSIC_P_P 
(INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation, 129) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation, 161) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift, 107) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift, 139) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift, 171) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift, 203) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift, 108) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift, 140) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, interp_v128_i4_op_right_shift, 172) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift, 109) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift, 141) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift, 173) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift, 205) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, interp_v128_op_ones_complement, 77) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower, 137) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper, 138) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow, 102) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than, 40) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than, 37) 
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than, 38) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than, 47) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals, 35) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals, 45) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals, 55) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals, 214) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, interp_v128_i8_create_scalar, 0) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb, 0) + +// wasm opcode is 0 because it has a different calling convention +INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, interp_v128_conditional_select, 0) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create, 0) + +INTERP_SIMD_INTRINSIC_P_PP 
(INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not, 79) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal, 52) + +// wasm only has a swizzle opcode for i8x16, none of the others +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 14) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle, 0) + +// Wasm PackedSimd (see PackedSimd.cs) +// We automatically generate C wrappers around clang's wasm simd intrinsics for each of these intrinsics +// The 2nd argument is the name of the clang intrinsic and the 3rd argument is the wasm opcode. + +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I8X16_SPLAT, wasm_v128_load8_splat, 0x07) +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I16X8_SPLAT, wasm_v128_load16_splat, 0x08) +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I32X4_SPLAT, wasm_v128_load32_splat, 0x09) +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I64X2_SPLAT, wasm_v128_load64_splat, 0x0a) +// FIXME: ExtractLane and ReplaceLane +// FIXME: Shuffle +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_SWIZZLE, wasm_i8x16_swizzle, 0x0e) +// FIXME: f32/f64 versions of add/subtract/multiply/negate are missing +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_ADD, wasm_i8x16_add, 0x6e) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_ADD, wasm_i16x8_add, 0x8e) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_ADD, wasm_i32x4_add, 0xae) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_ADD, wasm_i64x2_add, 0xce) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_SUBTRACT, wasm_i8x16_sub, 0x71) 
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_SUBTRACT, wasm_i16x8_sub, 0x91) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_SUBTRACT, wasm_i32x4_sub, 0xb1) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_SUBTRACT, wasm_i64x2_sub, 0xd1) +// There is no i8x16 mul opcode +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_MULTIPLY, _interp_wasm_simd_assert_not_reached, 0x0) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_MULTIPLY, wasm_i16x8_mul, 0x95) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_MULTIPLY, wasm_i32x4_mul, 0xb5) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_MULTIPLY, wasm_i64x2_mul, 0xd5) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_DOT_I16X8, wasm_i32x4_dot_i16x8, 0xba) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I8X16_NEGATE, wasm_i8x16_neg, 0x61) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I16X8_NEGATE, wasm_i16x8_neg, 0x81) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I32X4_NEGATE, wasm_i32x4_neg, 0xa1) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I64X2_NEGATE, wasm_i64x2_neg, 0xc1) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTLEFT, wasm_i8x16_shl, 0x6b) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTLEFT, wasm_i16x8_shl, 0x8b) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTLEFT, wasm_i32x4_shl, 0xab) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTLEFT, wasm_i64x2_shl, 0xcb) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTARITHMETIC, wasm_i8x16_shr, 0x6c) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTRIGHTARITHMETIC, wasm_i16x8_shr, 0x8c) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTRIGHTARITHMETIC, wasm_i32x4_shr, 0xac) 
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTRIGHTARITHMETIC, wasm_i64x2_shr, 0xcc) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTLOGICAL, wasm_u8x16_shr, 0x6d) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTRIGHTLOGICAL, wasm_u16x8_shr, 0x8d) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTRIGHTLOGICAL, wasm_u32x4_shr, 0xad) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTRIGHTLOGICAL, wasm_u64x2_shr, 0xcd) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_AND, wasm_v128_and, 0x4e) +// FIXME: NOT, OR, XOR +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I8X16_BITMASK, wasm_i8x16_bitmask, 0x64) +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I16X8_BITMASK, wasm_i16x8_bitmask, 0x84) +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I32X4_BITMASK, wasm_i32x4_bitmask, 0xa4) +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I64X2_BITMASK, wasm_i64x2_bitmask, 0xc4) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, wasm_i8x16_eq, 0x23) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPAREEQUAL, wasm_i16x8_eq, 0x2d) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPAREEQUAL, wasm_i32x4_eq, 0x37) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPAREEQUAL, wasm_i64x2_eq, 0xd6) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPAREEQUAL, wasm_f32x4_eq, 0x41) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPAREEQUAL, wasm_f64x2_eq, 0x47) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPARENOTEQUAL, wasm_i8x16_ne, 0x24) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPARENOTEQUAL, wasm_i16x8_ne, 0x2e) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPARENOTEQUAL, wasm_i32x4_ne, 0x38) 
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPARENOTEQUAL, wasm_i64x2_ne, 0xd7) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPARENOTEQUAL, wasm_f32x4_ne, 0x42) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPARENOTEQUAL, wasm_f64x2_ne, 0x48) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_S, wasm_i8x16_narrow_i16x8, 0x65) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_S, wasm_i16x8_narrow_i32x4, 0x85) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_U, wasm_u8x16_narrow_i16x8, 0x66) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_U, wasm_u16x8_narrow_i32x4, 0x86) diff --git a/src/mono/mono/mini/interp/interp-simd.c b/src/mono/mono/mini/interp/interp-simd.c index f67370e478d1a..09e90a997ac6e 100644 --- a/src/mono/mono/mini/interp/interp-simd.c +++ b/src/mono/mono/mini/interp/interp-simd.c @@ -2,6 +2,10 @@ #include "interp-internals.h" #include "interp-simd.h" +#if HOST_BROWSER +#include <wasm_simd128.h> +#endif + #ifdef INTERP_ENABLE_SIMD typedef gint64 v128_i8 __attribute__ ((vector_size (SIZEOF_V128))); @@ -12,6 +16,7 @@ typedef gint16 v128_i2 __attribute__ ((vector_size (SIZEOF_V128))); typedef guint16 v128_u2 __attribute__ ((vector_size (SIZEOF_V128))); typedef gint8 v128_i1 __attribute__ ((vector_size (SIZEOF_V128))); typedef guint8 v128_u1 __attribute__ ((vector_size (SIZEOF_V128))); +typedef float v128_r4 __attribute__ ((vector_size (SIZEOF_V128))); // get_AllBitsSet static void @@ -39,6 +44,12 @@ interp_v128_i4_op_addition (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 + *(v128_i4*)v2; } +static void +interp_v128_r4_op_addition (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 + *(v128_r4*)v2; +} + // op_Subtraction static void interp_v128_i1_op_subtraction (gpointer res, gpointer v1, gpointer v2) @@ -58,6 +69,12 @@ 
interp_v128_i4_op_subtraction (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 - *(v128_i4*)v2; } +static void +interp_v128_r4_op_subtraction (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 - *(v128_r4*)v2; +} + // op_BitwiseAnd static void interp_v128_op_bitwise_and (gpointer res, gpointer v1, gpointer v2) @@ -124,6 +141,18 @@ interp_v128_i4_op_multiply (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 * *(v128_i4*)v2; } +static void +interp_v128_r4_op_multiply (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 * *(v128_r4*)v2; +} + +static void +interp_v128_r4_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 / *(v128_r4*)v2; +} + // op_UnaryNegation static void interp_v128_i1_op_negation (gpointer res, gpointer v1) @@ -535,32 +564,122 @@ interp_v128_i8_shuffle (gpointer res, gpointer v1, gpointer v2) V128_SHUFFLE (gint64, guint64); } -#define INTERP_SIMD_INTRINSIC_P_P(a,b) -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) + +// For the wasm packed simd intrinsics we want to automatically generate the C implementations from +// their corresponding clang intrinsics. See also: +// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h +// In this context V means Vector128 and P means void* pointer. 
+#ifdef HOST_BROWSER + +static v128_t +_interp_wasm_simd_assert_not_reached (v128_t lhs, v128_t rhs) { + g_assert_not_reached (); +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \ + *((v128_t *)res) = c_intrinsic (v1); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \ + *((int32_t *)res) = c_intrinsic (*((v128_t *)v1)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((int *)v2)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2, gpointer v3) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2), *((v128_t *)v3)); \ +} + +#include "interp-simd-intrins.def" + +#undef INTERP_WASM_SIMD_INTRINSIC_V_P +#undef INTERP_WASM_SIMD_INTRINSIC_V_V +#undef INTERP_WASM_SIMD_INTRINSIC_I_V +#undef INTERP_WASM_SIMD_INTRINSIC_V_VV +#undef INTERP_WASM_SIMD_INTRINSIC_V_VI +#undef INTERP_WASM_SIMD_INTRINSIC_V_VVV + +// Now generate the wasm opcode tables for the intrinsics + +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) c, + +int interp_simd_p_p_wasm_opcode_table [] = { +#include "interp-simd-intrins.def" +}; + +#undef 
INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) + +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) c, + +int interp_simd_p_pp_wasm_opcode_table [] = { +#include "interp-simd-intrins.def" +}; + +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) + +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) c, + +int interp_simd_p_ppp_wasm_opcode_table [] = { +#include "interp-simd-intrins.def" +}; + +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) + +#endif // HOST_BROWSER #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) b, +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) b, PP_SIMD_Method interp_simd_p_p_table [] = { #include "interp-simd-intrins.def" }; #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) b, +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) b, PPP_SIMD_Method interp_simd_p_pp_table [] = { #include "interp-simd-intrins.def" }; #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) b, +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) b, PPPP_SIMD_Method interp_simd_p_ppp_table [] = { #include "interp-simd-intrins.def" }; #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) #endif // INTERP_ENABLE_SIMD diff --git a/src/mono/mono/mini/interp/interp-simd.h b/src/mono/mono/mini/interp/interp-simd.h index 3763c571069ba..e3306a251fc9f 100644 --- a/src/mono/mono/mini/interp/interp-simd.h +++ b/src/mono/mono/mini/interp/interp-simd.h @@ -11,6 +11,12 @@ extern PP_SIMD_Method interp_simd_p_p_table []; extern PPP_SIMD_Method interp_simd_p_pp_table []; extern PPPP_SIMD_Method 
interp_simd_p_ppp_table []; +#if HOST_BROWSER +extern int interp_simd_p_p_wasm_opcode_table []; +extern int interp_simd_p_pp_wasm_opcode_table []; +extern int interp_simd_p_ppp_wasm_opcode_table []; +#endif + #endif /* __MONO_MINI_INTERP_SIMD_H__ */ diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index f3802f7afe5e8..9e1e9e1e8561b 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -8907,4 +8907,42 @@ mono_jiterp_enum_hasflag (MonoClass *klass, gint32 *dest, stackval *sp1, stackva *dest = mono_interp_enum_hasflag (sp1, sp2, klass); } +EMSCRIPTEN_KEEPALIVE gpointer +mono_jiterp_get_simd_intrinsic (int arity, int index) +{ +#ifdef INTERP_ENABLE_SIMD + switch (arity) { + case 1: + return interp_simd_p_p_table [index]; + case 2: + return interp_simd_p_pp_table [index]; + case 3: + return interp_simd_p_ppp_table [index]; + default: + g_assert_not_reached(); + } +#else + g_assert_not_reached(); +#endif +} + +EMSCRIPTEN_KEEPALIVE int +mono_jiterp_get_simd_opcode (int arity, int index) +{ +#ifdef INTERP_ENABLE_SIMD + switch (arity) { + case 1: + return interp_simd_p_p_wasm_opcode_table [index]; + case 2: + return interp_simd_p_pp_wasm_opcode_table [index]; + case 3: + return interp_simd_p_ppp_wasm_opcode_table [index]; + default: + g_assert_not_reached(); + } +#else + g_assert_not_reached(); +#endif +} + #endif diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index 021a4399fe307..2849cec1778ff 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -41,35 +41,35 @@ typedef enum { /* SIMD opcodes, grouped by signature */ -#define INTERP_SIMD_INTRINSIC_P_P(a,b) -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_P -#define 
INTERP_SIMD_INTRINSIC_P_P(a,b) a, +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) a, typedef enum { #include "interp-simd-intrins.def" } MintSIMDOpsPP; #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) a, +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) a, typedef enum { #include "interp-simd-intrins.def" INTERP_SIMD_INTRINSIC_P_PP_LAST } MintSIMDOpsPPP; #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) a, +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) a, typedef enum { #include "interp-simd-intrins.def" INTERP_SIMD_INTRINSIC_P_PPP_LAST } MintSIMDOpsPPPP; #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) #if NO_UNALIGNED_ACCESS # if G_BYTE_ORDER == G_LITTLE_ENDIAN diff --git a/src/mono/mono/mini/interp/simd-methods.def b/src/mono/mono/mini/interp/simd-methods.def index 57b87d028de94..4eb76e178558b 100644 --- a/src/mono/mono/mini/interp/simd-methods.def +++ b/src/mono/mono/mini/interp/simd-methods.def @@ -1,12 +1,14 @@ SIMD_METHOD(get_Count) SIMD_METHOD(get_AllBitsSet) SIMD_METHOD(get_IsHardwareAccelerated) +SIMD_METHOD(get_IsSupported) SIMD_METHOD(get_Item) SIMD_METHOD(get_One) SIMD_METHOD(get_Zero) SIMD_METHOD(op_Addition) SIMD_METHOD(op_BitwiseAnd) SIMD_METHOD(op_BitwiseOr) +SIMD_METHOD(op_Division) SIMD_METHOD(op_Equality) SIMD_METHOD(op_ExclusiveOr) SIMD_METHOD(op_Explicit) @@ -24,6 +26,7 @@ SIMD_METHOD(ConditionalSelect) SIMD_METHOD(Create) SIMD_METHOD(CreateScalar) SIMD_METHOD(CreateScalarUnsafe) + SIMD_METHOD(Equals) SIMD_METHOD(ExtractMostSignificantBits) SIMD_METHOD(GreaterThan) @@ -36,3 +39,20 @@ SIMD_METHOD(ShiftRightLogical) SIMD_METHOD(Shuffle) SIMD_METHOD(WidenLower) SIMD_METHOD(WidenUpper) + +// PackedSimd 
+SIMD_METHOD(Splat) +SIMD_METHOD(ExtractLane) +SIMD_METHOD(ReplaceLane) +SIMD_METHOD(Swizzle) +SIMD_METHOD(Add) +SIMD_METHOD(Subtract) +SIMD_METHOD(Multiply) +SIMD_METHOD(Dot) +SIMD_METHOD(Negate) +SIMD_METHOD(And) +SIMD_METHOD(Bitmask) +SIMD_METHOD(CompareEqual) +SIMD_METHOD(CompareNotEqual) +SIMD_METHOD(ConvertNarrowingSignedSaturate) +SIMD_METHOD(ConvertNarrowingUnsignedSaturate) diff --git a/src/mono/mono/mini/interp/transform-simd.c b/src/mono/mono/mini/interp/transform-simd.c index a46f7555e14fe..bb7c2699ffbe6 100644 --- a/src/mono/mono/mini/interp/transform-simd.c +++ b/src/mono/mono/mini/interp/transform-simd.c @@ -2,6 +2,8 @@ * SIMD Intrinsics support for interpreter */ +#include "config.h" +#include #include // We use the same approach as jit/aot for identifying simd methods. @@ -61,7 +63,7 @@ static guint16 sri_vector128_methods [] = { SN_Shuffle, SN_WidenLower, SN_WidenUpper, - SN_get_IsHardwareAccelerated + SN_get_IsHardwareAccelerated, }; static guint16 sri_vector128_t_methods [] = { @@ -72,6 +74,7 @@ static guint16 sri_vector128_t_methods [] = { SN_op_Addition, SN_op_BitwiseAnd, SN_op_BitwiseOr, + SN_op_Division, SN_op_Equality, SN_op_ExclusiveOr, SN_op_Inequality, @@ -84,6 +87,60 @@ static guint16 sri_vector128_t_methods [] = { SN_op_UnsignedRightShift }; +static guint16 sri_packedsimd_methods [] = { + SN_ConvertNarrowingSignedSaturate, + SN_ConvertNarrowingUnsignedSaturate, + SN_Swizzle, + SN_get_IsHardwareAccelerated, + SN_get_IsSupported, +}; + +#if HOST_BROWSER + +/* + * maps from INTERP_SIMD_INTRINSIC_WASM_I8X16_xxx to the correct one for the return type, + * assuming that they are laid out sequentially like this: + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, wasm_i8x16_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPAREEQUAL, wasm_i16x8_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPAREEQUAL, wasm_i32x4_eq, 0x0) + * 
INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPAREEQUAL, wasm_i64x2_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPAREEQUAL, wasm_f32x4_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPAREEQUAL, wasm_f64x2_eq, 0x0) + * It is your responsibility to ensure that it's actually laid out this way! + */ + +static int sri_packedsimd_offset_from_atype [] = { + -1, // MONO_TYPE_END = 0x00, + -1, // MONO_TYPE_VOID = 0x01, + // FIXME: Should this be 2, for I4? + 0, // MONO_TYPE_BOOLEAN = 0x02, + 1, // MONO_TYPE_CHAR = 0x03, + 0, // MONO_TYPE_I1 = 0x04, + 0, // MONO_TYPE_U1 = 0x05, + 1, // MONO_TYPE_I2 = 0x06, + 1, // MONO_TYPE_U2 = 0x07, + 2, // MONO_TYPE_I4 = 0x08, + 2, // MONO_TYPE_U4 = 0x09, + 3, // MONO_TYPE_I8 = 0x0a, + 3, // MONO_TYPE_U8 = 0x0b, + 4, // MONO_TYPE_R4 = 0x0c, + 5, // MONO_TYPE_R8 = 0x0d, + -1, // MONO_TYPE_STRING = 0x0e, + 2, // MONO_TYPE_PTR = 0x0f, + -1, // MONO_TYPE_BYREF = 0x10, + -1, // MONO_TYPE_VALUETYPE = 0x11, + -1, // MONO_TYPE_CLASS = 0x12, + -1, // MONO_TYPE_VAR = 0x13, + -1, // MONO_TYPE_ARRAY = 0x14, + -1, // MONO_TYPE_GENERICINST= 0x15, + -1, // MONO_TYPE_TYPEDBYREF = 0x16, + 2, // MONO_TYPE_I = 0x18, + 2, // MONO_TYPE_U = 0x19, +}; + +static const int sri_packedsimd_offset_from_atype_length = sizeof(sri_packedsimd_offset_from_atype) / sizeof(sri_packedsimd_offset_from_atype[0]); +#endif // HOST_BROWSER + static gboolean emit_sri_vector128 (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) { @@ -373,26 +430,36 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur } break; case SN_op_LeftShift: - g_assert (scalar_arg == 1); + if (scalar_arg != 1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT; else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT; else if (arg_size == 4) simd_intrins = 
INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT; else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT; break; + case SN_op_Division: + if (scalar_arg != -1) + return FALSE; + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_R4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_R4_DIVISION; + break; case SN_op_Multiply: - g_assert (scalar_arg == -1); + if (scalar_arg != -1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY; else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY; else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY; + else if (atype == MONO_TYPE_R4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_R4_MULTIPLY; break; case SN_op_OnesComplement: simd_opcode = MINT_SIMD_INTRINS_P_P; simd_intrins = INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT; break; case SN_op_RightShift: - g_assert (scalar_arg == 1); + if (scalar_arg != 1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT; else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT; @@ -414,7 +481,8 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_NEGATION; break; case SN_op_UnsignedRightShift: - g_assert (scalar_arg == 1); + if (scalar_arg != 1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT; else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT; @@ -448,6 +516,182 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur return TRUE; } +#if HOST_BROWSER +static int 
+map_packedsimd_intrins_based_on_atype (MonoTypeEnum atype, int base_intrins, gboolean allow_float) +{ + int max_offset = allow_float ? 5 : 3; + if ((atype < 0) || (atype >= sri_packedsimd_offset_from_atype_length)) + return -1; + int offset = sri_packedsimd_offset_from_atype [atype]; + if ((offset < 0) || (offset > max_offset)) + return -1; + return base_intrins + offset; +} +#endif + +static gboolean +emit_sri_packedsimd (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) +{ + int id = lookup_intrins (sri_packedsimd_methods, sizeof (sri_packedsimd_methods), cmethod); + if (id == -1) + return FALSE; + + MonoClass *vector_klass = mono_class_from_mono_type_internal (csignature->ret); + int vector_size = -1; + + if ((id == SN_get_IsSupported) || (id == SN_get_IsHardwareAccelerated)) { +#if HOST_BROWSER + interp_add_ins (td, MINT_LDC_I4_1); +#else + interp_add_ins (td, MINT_LDC_I4_0); +#endif + goto opcode_added; + } + +#if HOST_BROWSER + gint16 simd_opcode = -1; + gint16 simd_intrins = -1; + if (!m_class_is_simd_type (vector_klass)) + vector_klass = mono_class_from_mono_type_internal (csignature->params [0]); + if (!m_class_is_simd_type (vector_klass)) + return FALSE; + + vector_size = mono_class_value_size (vector_klass, NULL); + g_assert (vector_size == SIZEOF_V128); + + MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0]; + if (!mono_type_is_primitive (arg_type)) + return FALSE; + MonoTypeEnum atype = arg_type->type; + if (atype == MONO_TYPE_BOOLEAN) + return FALSE; + + int scalar_arg = -1; + for (int i = 0; i < csignature->param_count; i++) { + if (csignature->params [i]->type != MONO_TYPE_GENERICINST) + scalar_arg = i; + } + + switch (id) { + case SN_Splat: { + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SPLAT, FALSE); + break; + } + case SN_Swizzle: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = 
INTERP_SIMD_INTRINSIC_WASM_I8X16_SWIZZLE; + break; + } + case SN_Add: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_ADD, FALSE); + break; + } + case SN_Subtract: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SUBTRACT, FALSE); + break; + } + case SN_Multiply: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_MULTIPLY, FALSE); + break; + } + case SN_Dot: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I32X4_DOT_I16X8; + break; + } + case SN_Negate: { + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_NEGATE, FALSE); + break; + } + case SN_ShiftLeft: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTLEFT, FALSE); + break; + } + case SN_ShiftRightArithmetic: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTARITHMETIC, FALSE); + break; + } + case SN_ShiftRightLogical: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTLOGICAL, FALSE); + break; + } + case SN_And: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_AND; + break; + } + case SN_Bitmask: { + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_BITMASK, FALSE); + break; + } + case SN_CompareEqual: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, 
INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, TRUE); + break; + } + case SN_CompareNotEqual: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPARENOTEQUAL, TRUE); + break; + } + case SN_ConvertNarrowingSignedSaturate: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_S; + else if (atype == MONO_TYPE_I2) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_S; + break; + } + case SN_ConvertNarrowingUnsignedSaturate: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_U1) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_U; + else if (atype == MONO_TYPE_U2) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_U; + break; + } + default: + return FALSE; + } + + if (simd_opcode == -1 || simd_intrins == -1) { + return FALSE; + } + + interp_add_ins (td, simd_opcode); + td->last_ins->data [0] = simd_intrins; +#else // HOST_BROWSER + return FALSE; +#endif // HOST_BROWSER + +opcode_added: + td->sp -= csignature->param_count; + for (int i = 0; i < csignature->param_count; i++) + td->last_ins->sregs [i] = td->sp [i].local; + + g_assert (csignature->ret->type != MONO_TYPE_VOID); + int ret_mt = mint_type (csignature->ret); + if (ret_mt == MINT_TYPE_VT) { + // For these intrinsics, if we return a VT then it is a V128 + push_type_vt (td, vector_klass, vector_size); + } else { + push_simple_type (td, stack_type [ret_mt]); + } + interp_ins_set_dreg (td->last_ins, td->sp [-1].local); + td->ip += 5; + return TRUE; +} + static gboolean interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) { @@ -461,11 +705,21 @@ interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodS class_ns = m_class_get_name_space (cmethod->klass); class_name = m_class_get_name (cmethod->klass); - if (!strcmp (class_ns, 
"System.Runtime.Intrinsics")) { + if (mono_opt_interp_simd_v128 && !strcmp (class_ns, "System.Runtime.Intrinsics")) { if (!strcmp (class_name, "Vector128")) return emit_sri_vector128 (td, cmethod, csignature); else if (!strcmp (class_name, "Vector128`1")) return emit_sri_vector128_t (td, cmethod, csignature); + } else if (mono_opt_interp_simd_packedsimd && !strcmp (class_ns, "System.Runtime.Intrinsics.Wasm")) { + if (!strcmp (class_name, "PackedSimd")) { + gboolean res = emit_sri_packedsimd (td, cmethod, csignature); +#if HOST_BROWSER + if (!res) + g_print ("MONO interpreter: Unsupported method: System.Runtime.Intrinsics.Wasm.PackedSimd.%s\n", cmethod->name); + g_assert (res); +#endif + return res; + } } return FALSE; } diff --git a/src/mono/mono/utils/options-def.h b/src/mono/mono/utils/options-def.h index 1a6f58a9fbda2..6d8715c2465ff 100644 --- a/src/mono/mono/utils/options-def.h +++ b/src/mono/mono/utils/options-def.h @@ -60,6 +60,12 @@ DEFINE_BOOL_READONLY(readonly_flag, "readonly-flag", FALSE, "Example") DEFINE_BOOL(wasm_exceptions, "wasm-exceptions", FALSE, "Enable codegen for WASM exceptions") DEFINE_BOOL(wasm_gc_safepoints, "wasm-gc-safepoints", FALSE, "Use GC safepoints on WASM") DEFINE_BOOL(aot_lazy_assembly_load, "aot-lazy-assembly-load", FALSE, "Load assemblies referenced by AOT images lazily") +#if HOST_BROWSER +DEFINE_BOOL(interp_simd_v128, "interp-simd-v128", FALSE, "Enable interpreter Vector128 support") +#else +DEFINE_BOOL(interp_simd_v128, "interp-simd-v128", TRUE, "Enable interpreter Vector128 support") +#endif +DEFINE_BOOL(interp_simd_packedsimd, "interp-simd-packedsimd", FALSE, "Enable interpreter WASM PackedSimd support") #if HOST_BROWSER @@ -110,6 +116,8 @@ DEFINE_BOOL(jiterpreter_use_constants, "jiterpreter-use-constants", FALSE, "Use DEFINE_BOOL(jiterpreter_eliminate_null_checks, "jiterpreter-eliminate-null-checks", TRUE, "Attempt to eliminate redundant null checks in traces") // enables performing backward branches without exiting traces 
DEFINE_BOOL(jiterpreter_backward_branches_enabled, "jiterpreter-backward-branches-enabled", TRUE, "Enable performing backward branches without exiting traces") +// Attempt to use WASM v128 opcodes to implement SIMD interpreter opcodes +DEFINE_BOOL(jiterpreter_enable_simd, "jiterpreter-simd-enabled", TRUE, "Attempt to use WebAssembly SIMD support") // When compiling a jit_call wrapper, bypass sharedvt wrappers if possible by inlining their // logic into the compiled wrapper and calling the target AOTed function with native call convention DEFINE_BOOL(jiterpreter_direct_jit_call, "jiterpreter-direct-jit-calls", TRUE, "Bypass gsharedvt wrappers when compiling JIT call wrappers") diff --git a/src/mono/sample/wasm/browser-bench/Vector.cs b/src/mono/sample/wasm/browser-bench/Vector.cs index 343332783555f..cb04d361fd4aa 100644 --- a/src/mono/sample/wasm/browser-bench/Vector.cs +++ b/src/mono/sample/wasm/browser-bench/Vector.cs @@ -15,6 +15,8 @@ public VectorTask() { measurements = new Measurement[] { new Create(), + new PackConstant(), + new Pack(), new Add(), new Multiply(), new DotInt(), @@ -56,6 +58,25 @@ class Create : VectorMeasurement public override void RunStep() => vector = Vector128.Create(0x123456); } + class PackConstant : VectorMeasurement + { + Vector128 vector; + + public override string Name => "Pack Vector128 (Constant)"; + + public override void RunStep() => vector = Vector128.Create(1, 2, 3, 4); + } + + class Pack : VectorMeasurement + { + Vector128 vector; + int a = 1, b = 2, c = 3, d = 4; + + public override string Name => "Pack Vector128"; + + public override void RunStep() => vector = Vector128.Create(a, b, c, d); + } + class Add : VectorMeasurement { Vector128 vector1, vector2, vector3; diff --git a/src/mono/wasm/runtime/CMakeLists.txt b/src/mono/wasm/runtime/CMakeLists.txt index 4d3781bb924f6..1a39d1520c873 100644 --- a/src/mono/wasm/runtime/CMakeLists.txt +++ b/src/mono/wasm/runtime/CMakeLists.txt @@ -39,6 +39,8 @@ set_target_properties(dotnet 
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${NATIVE_BIN_DIR}") set(ignoreMeWasmOptFlags "${CONFIGURATION_WASM_OPT_FLAGS}") +set(ignoreMeWasmOptAdditionalFlags "${WASM_OPT_ADDITIONAL_FLAGS}") +set(ignoreMeEmsdkPath "${EMSDK_PATH}") if(CMAKE_BUILD_TYPE STREQUAL "Release") add_custom_command(TARGET dotnet diff --git a/src/mono/wasm/runtime/cwraps.ts b/src/mono/wasm/runtime/cwraps.ts index 6c6912fd6b6c9..3b0a98261edf4 100644 --- a/src/mono/wasm/runtime/cwraps.ts +++ b/src/mono/wasm/runtime/cwraps.ts @@ -125,6 +125,8 @@ const fn_signatures: SigLine[] = [ [true, "mono_jiterp_boost_back_branch_target", "void", ["number"]], [true, "mono_jiterp_is_imethod_var_address_taken", "number", ["number", "number"]], [true, "mono_jiterp_get_opcode_value_table_entry", "number", ["number"]], + [true, "mono_jiterp_get_simd_intrinsic", "number", ["number", "number"]], + [true, "mono_jiterp_get_simd_opcode", "number", ["number", "number"]], ...legacy_interop_cwraps ]; @@ -246,6 +248,8 @@ export interface t_Cwraps { mono_jiterp_boost_back_branch_target(destination: number): void; mono_jiterp_is_imethod_var_address_taken(imethod: VoidPtr, offsetBytes: number): number; mono_jiterp_get_opcode_value_table_entry(opcode: number): number; + mono_jiterp_get_simd_intrinsic(arity: number, index: number): VoidPtr; + mono_jiterp_get_simd_opcode(arity: number, index: number): number; } const wrapped_c_functions: t_Cwraps = {}; diff --git a/src/mono/wasm/runtime/genmintops.py b/src/mono/wasm/runtime/genmintops.py index 510b1db22d32f..de7f6e53ac9ab 100755 --- a/src/mono/wasm/runtime/genmintops.py +++ b/src/mono/wasm/runtime/genmintops.py @@ -8,20 +8,25 @@ import os import re -if len (sys.argv) != 3: - print ("Usage: genmintops.py ") +if len (sys.argv) != 4: + print ("Usage: genmintops.py ") exit (1) src_header_path = sys.argv [1] -output_ts_path = sys.argv [2] +simd_header_path = sys.argv [2] +output_ts_path = sys.argv [3] src = open(src_header_path, 'r') +simd_src = open(simd_header_path, 'r') tab = " " 
header_lines = src.read().splitlines() +# strip preprocessing directives +simd_header_lines = (l for l in simd_src.read().splitlines() if not l.startswith("#")) # strip preprocessing directives and add indentation for tslint/eslint header = "\n".join((tab + l) for l in header_lines if not l.startswith("#")) src.close() +simd_src.close() opdef_regex = r'\s(IR)?OPDEF\((\w+),\s*(.+?),\s*(MintOp\w+)\)' enum_values = re.sub( @@ -31,11 +36,36 @@ opdef_regex, lambda m : f"[MintOpcode.{m.group(2)}]: [{m.group(3)}, MintOpArgType.{m.group(4)}],", header ) +simd_values_1 = [] +simd_values_2 = [] +simd_values_3 = [] +simd_disp = { + "INTERP_SIMD_INTRINSIC_P_P": simd_values_1, + "INTERP_SIMD_INTRINSIC_P_PP": simd_values_2, + "INTERP_SIMD_INTRINSIC_P_PPP": simd_values_3, + "INTERP_WASM_SIMD_INTRINSIC_V_P": simd_values_1, + "INTERP_WASM_SIMD_INTRINSIC_V_V": simd_values_1, + "INTERP_WASM_SIMD_INTRINSIC_I_V": simd_values_1, + "INTERP_WASM_SIMD_INTRINSIC_V_VV": simd_values_2, + "INTERP_WASM_SIMD_INTRINSIC_V_VI": simd_values_2, + "INTERP_WASM_SIMD_INTRINSIC_V_VVV": simd_values_3, +} + +for line in simd_header_lines: + idx1 = line.index("(") if "(" in line else None + idx2 = line.index(",") if "," in line else None + if (idx1 and idx2): + key = line[0:idx1].strip() + simd_disp[key].append(line[(idx1 + 1):idx2].strip().replace("INTERP_SIMD_INTRINSIC_", "")) + +splitter = ",\n " +splitter2 = ",\n " + generated = f""" // Generated by genmintops.py from mintops.def. // Do not manually edit this file. 
-import {{ OpcodeInfoTable, MintOpArgType }} from "./jiterpreter-opcodes"; +import {{ OpcodeInfoTable, MintOpArgType, SimdInfoTable }} from "./jiterpreter-opcodes"; export const enum MintOpcode {{ {enum_values} @@ -46,6 +76,30 @@ export const OpcodeInfo : OpcodeInfoTable = {{ {metadata_table} }}; + +export const enum SimdIntrinsic2 {{ + {splitter.join(simd_values_1)} +}} + +export const enum SimdIntrinsic3 {{ + {splitter.join(simd_values_2)} +}} + +export const enum SimdIntrinsic4 {{ + {splitter.join(simd_values_3)} +}} + +export const SimdInfo : SimdInfoTable = {{ + 2: [ + {splitter2.join(repr(x) for x in simd_values_1)} + ], + 3: [ + {splitter2.join(repr(x) for x in simd_values_2)} + ], + 4: [ + {splitter2.join(repr(x) for x in simd_values_3)} + ], +}}; """ os.makedirs(os.path.dirname(output_ts_path), exist_ok=True) diff --git a/src/mono/wasm/runtime/jiterpreter-interp-entry.ts b/src/mono/wasm/runtime/jiterpreter-interp-entry.ts index aad65e7ad4841..14651ad27e970 100644 --- a/src/mono/wasm/runtime/jiterpreter-interp-entry.ts +++ b/src/mono/wasm/runtime/jiterpreter-interp-entry.ts @@ -289,7 +289,7 @@ function flush_wasm_entry_trampoline_jit_queue() { // Emit function imports for (let i = 0; i < trampImports.length; i++) { mono_assert(trampImports[i], () => `trace #${i} missing`); - builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, false, trampImports[i][2]); + builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, trampImports[i][2]); } builder._generateImportSection(); @@ -343,12 +343,9 @@ function flush_wasm_entry_trampoline_jit_queue() { console.log(`jit queue generated ${buffer.length} byte(s) of wasm`); counters.bytesGenerated += buffer.length; const traceModule = new WebAssembly.Module(buffer); + const wasmImports = builder.getWasmImports(); - const traceInstance = new WebAssembly.Instance(traceModule, { - i: builder.getImportedFunctionTable(), - c: builder.getConstants(), - m: { h: 
(Module).asm.memory }, - }); + const traceInstance = new WebAssembly.Instance(traceModule, wasmImports); // Now that we've jitted the trampolines, go through and fix up the function pointers // to point to the new jitted trampolines instead of the default implementations diff --git a/src/mono/wasm/runtime/jiterpreter-jit-call.ts b/src/mono/wasm/runtime/jiterpreter-jit-call.ts index 3f016adc00510..b1bf6a52a75a5 100644 --- a/src/mono/wasm/runtime/jiterpreter-jit-call.ts +++ b/src/mono/wasm/runtime/jiterpreter-jit-call.ts @@ -11,7 +11,7 @@ import { WasmOpcode } from "./jiterpreter-opcodes"; import { WasmValtype, WasmBuilder, addWasmFunctionPointer as addWasmFunctionPointer, _now, elapsedTimes, counters, getWasmFunctionTable, applyOptions, - recordFailure, getOptions + recordFailure, getOptions, bytesFromHex } from "./jiterpreter-support"; import cwraps from "./cwraps"; @@ -157,7 +157,7 @@ class TrampolineInfo { } // this is cached replacements for Module.getWasmTableEntry(); -// we could add and +// we could add and // if we need to export the original function getWasmTableEntry(index: number) { let result = fnCache[index]; @@ -236,9 +236,7 @@ function getIsWasmEhSupported(): boolean { // Probe whether the current environment can handle wasm exceptions try { // Load and compile the wasm version of do_jit_call_indirect. 
This serves as a way to probe for wasm EH - const bytes = new Uint8Array(doJitCall16.length / 2); - for (let i = 0; i < doJitCall16.length; i += 2) - bytes[i / 2] = parseInt(doJitCall16.substring(i, i + 2), 16); + const bytes = bytesFromHex(doJitCall16); counters.bytesGenerated += bytes.length; doJitCallModule = new WebAssembly.Module(bytes); @@ -396,7 +394,7 @@ export function mono_interp_flush_jitcall_queue(): void { // Emit function imports for (let i = 0; i < trampImports.length; i++) - builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, false, trampImports[i][2]); + builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, trampImports[i][2]); builder._generateImportSection(); // Function section @@ -444,12 +442,9 @@ export function mono_interp_flush_jitcall_queue(): void { console.log(`do_jit_call queue flush generated ${buffer.length} byte(s) of wasm`); counters.bytesGenerated += buffer.length; const traceModule = new WebAssembly.Module(buffer); + const wasmImports = builder.getWasmImports(); - const traceInstance = new WebAssembly.Instance(traceModule, { - i: builder.getImportedFunctionTable(), - c: builder.getConstants(), - m: { h: (Module).asm.memory } - }); + const traceInstance = new WebAssembly.Instance(traceModule, wasmImports); for (let i = 0; i < jitQueue.length; i++) { const info = jitQueue[i]; diff --git a/src/mono/wasm/runtime/jiterpreter-opcodes.ts b/src/mono/wasm/runtime/jiterpreter-opcodes.ts index ccefa7b41327c..3d347bdea7452 100644 --- a/src/mono/wasm/runtime/jiterpreter-opcodes.ts +++ b/src/mono/wasm/runtime/jiterpreter-opcodes.ts @@ -30,6 +30,12 @@ export type OpcodeInfoTable = { [key: number]: [name: string, length_u16: number, dregs: number, sregs: number, optype: MintOpArgType]; } +export type SimdInfoSubtable = Array + +export type SimdInfoTable = { + [argument_count: number] : SimdInfoSubtable +} + // Keep this in sync with the wasm spec (but I don't think any changes will impact 
it), // Note that prefix opcodes aren't in this enum, since making them write properly is awkward. @@ -229,5 +235,245 @@ export const enum WasmOpcode { i64_extend_32_s, PREFIX_sat = 0xfc, + PREFIX_simd = 0xfd, PREFIX_atomic = 0xfe } + +export const enum WasmSimdOpcode { + v128_load = 0x00, + v128_load8x8_s = 0x01, + v128_load8x8_u = 0x02, + v128_load16x4_s = 0x03, + v128_load16x4_u = 0x04, + v128_load32x2_s = 0x05, + v128_load32x2_u = 0x06, + v128_load8_splat = 0x07, + v128_load16_splat = 0x08, + v128_load32_splat = 0x09, + v128_load64_splat = 0x0a, + v128_store = 0x0b, + v128_const = 0x0c, + i8x16_shuffle = 0x0d, + i8x16_swizzle = 0x0e, + i8x16_splat = 0x0f, + i16x8_splat = 0x10, + i32x4_splat = 0x11, + i64x2_splat = 0x12, + f32x4_splat = 0x13, + f64x2_splat = 0x14, + i8x16_extract_lane_s = 0x15, + i8x16_extract_lane_u = 0x16, + i8x16_replace_lane = 0x17, + i16x8_extract_lane_s = 0x18, + i16x8_extract_lane_u = 0x19, + i16x8_replace_lane = 0x1a, + i32x4_extract_lane = 0x1b, + i32x4_replace_lane = 0x1c, + i64x2_extract_lane = 0x1d, + i64x2_replace_lane = 0x1e, + f32x4_extract_lane = 0x1f, + f32x4_replace_lane = 0x20, + f64x2_extract_lane = 0x21, + f64x2_replace_lane = 0x22, + i8x16_eq = 0x23, + i8x16_ne = 0x24, + i8x16_lt_s = 0x25, + i8x16_lt_u = 0x26, + i8x16_gt_s = 0x27, + i8x16_gt_u = 0x28, + i8x16_le_s = 0x29, + i8x16_le_u = 0x2a, + i8x16_ge_s = 0x2b, + i8x16_ge_u = 0x2c, + i16x8_eq = 0x2d, + i16x8_ne = 0x2e, + i16x8_lt_s = 0x2f, + i16x8_lt_u = 0x30, + i16x8_gt_s = 0x31, + i16x8_gt_u = 0x32, + i16x8_le_s = 0x33, + i16x8_le_u = 0x34, + i16x8_ge_s = 0x35, + i16x8_ge_u = 0x36, + i32x4_eq = 0x37, + i32x4_ne = 0x38, + i32x4_lt_s = 0x39, + i32x4_lt_u = 0x3a, + i32x4_gt_s = 0x3b, + i32x4_gt_u = 0x3c, + i32x4_le_s = 0x3d, + i32x4_le_u = 0x3e, + i32x4_ge_s = 0x3f, + i32x4_ge_u = 0x40, + f32x4_eq = 0x41, + f32x4_ne = 0x42, + f32x4_lt = 0x43, + f32x4_gt = 0x44, + f32x4_le = 0x45, + f32x4_ge = 0x46, + f64x2_eq = 0x47, + f64x2_ne = 0x48, + f64x2_lt = 0x49, + f64x2_gt = 0x4a, 
+ f64x2_le = 0x4b, + f64x2_ge = 0x4c, + v128_not = 0x4d, + v128_and = 0x4e, + v128_andnot = 0x4f, + v128_or = 0x50, + v128_xor = 0x51, + v128_bitselect = 0x52, + i8x16_abs = 0x60, + i8x16_neg = 0x61, + i8x16_all_true = 0x63, + i8x16_bitmask = 0x64, + i8x16_narrow_i16x8_s = 0x65, + i8x16_narrow_i16x8_u = 0x66, + i8x16_shl = 0x6b, + i8x16_shr_s = 0x6c, + i8x16_shr_u = 0x6d, + i8x16_add = 0x6e, + i8x16_add_sat_s = 0x6f, + i8x16_add_sat_u = 0x70, + i8x16_sub = 0x71, + i8x16_sub_sat_s = 0x72, + i8x16_sub_sat_u = 0x73, + i8x16_min_s = 0x76, + i8x16_min_u = 0x77, + i8x16_max_s = 0x78, + i8x16_max_u = 0x79, + i8x16_avgr_u = 0x7b, + i16x8_abs = 0x80, + i16x8_neg = 0x81, + i16x8_all_true = 0x83, + i16x8_bitmask = 0x84, + i16x8_narrow_i32x4_s = 0x85, + i16x8_narrow_i32x4_u = 0x86, + i16x8_extend_low_i8x16_s = 0x87, + i16x8_extend_high_i8x16_s = 0x88, + i16x8_extend_low_i8x16_u = 0x89, + i16x8_extend_high_i8x16_u = 0x8a, + i16x8_shl = 0x8b, + i16x8_shr_s = 0x8c, + i16x8_shr_u = 0x8d, + i16x8_add = 0x8e, + i16x8_add_sat_s = 0x8f, + i16x8_add_sat_u = 0x90, + i16x8_sub = 0x91, + i16x8_sub_sat_s = 0x92, + i16x8_sub_sat_u = 0x93, + i16x8_mul = 0x95, + i16x8_min_s = 0x96, + i16x8_min_u = 0x97, + i16x8_max_s = 0x98, + i16x8_max_u = 0x99, + i16x8_avgr_u = 0x9b, + i32x4_abs = 0xa0, + i32x4_neg = 0xa1, + i32x4_all_true = 0xa3, + i32x4_bitmask = 0xa4, + i32x4_extend_low_i16x8_s = 0xa7, + i32x4_extend_high_i16x8_s = 0xa8, + i32x4_extend_low_i16x8_u = 0xa9, + i32x4_extend_high_i16x8_u = 0xaa, + i32x4_shl = 0xab, + i32x4_shr_s = 0xac, + i32x4_shr_u = 0xad, + i32x4_add = 0xae, + i32x4_sub = 0xb1, + i32x4_mul = 0xb5, + i32x4_min_s = 0xb6, + i32x4_min_u = 0xb7, + i32x4_max_s = 0xb8, + i32x4_max_u = 0xb9, + i32x4_dot_i16x8_s = 0xba, + i64x2_abs = 0xc0, + i64x2_neg = 0xc1, + i64x2_bitmask = 0xc4, + i64x2_extend_low_i32x4_s = 0xc7, + i64x2_extend_high_i32x4_s = 0xc8, + i64x2_extend_low_i32x4_u = 0xc9, + i64x2_extend_high_i32x4_u = 0xca, + i64x2_shl = 0xcb, + i64x2_shr_s = 0xcc, + i64x2_shr_u = 
0xcd, + i64x2_add = 0xce, + i64x2_sub = 0xd1, + i64x2_mul = 0xd5, + f32x4_ceil = 0x67, + f32x4_floor = 0x68, + f32x4_trunc = 0x69, + f32x4_nearest = 0x6a, + f64x2_ceil = 0x74, + f64x2_floor = 0x75, + f64x2_trunc = 0x7a, + f64x2_nearest = 0x94, + f32x4_abs = 0xe0, + f32x4_neg = 0xe1, + f32x4_sqrt = 0xe3, + f32x4_add = 0xe4, + f32x4_sub = 0xe5, + f32x4_mul = 0xe6, + f32x4_div = 0xe7, + f32x4_min = 0xe8, + f32x4_max = 0xe9, + f32x4_pmin = 0xea, + f32x4_pmax = 0xeb, + f64x2_abs = 0xec, + f64x2_neg = 0xed, + f64x2_sqrt = 0xef, + f64x2_add = 0xf0, + f64x2_sub = 0xf1, + f64x2_mul = 0xf2, + f64x2_div = 0xf3, + f64x2_min = 0xf4, + f64x2_max = 0xf5, + f64x2_pmin = 0xf6, + f64x2_pmax = 0xf7, + i32x4_trunc_sat_f32x4_s = 0xf8, + i32x4_trunc_sat_f32x4_u = 0xf9, + f32x4_convert_i32x4_s = 0xfa, + f32x4_convert_i32x4_u = 0xfb, + v128_load32_zero = 0x5c, + v128_load64_zero = 0x5d, + i16x8_extmul_low_i8x16_s = 0x9c, + i16x8_extmul_high_i8x16_s = 0x9d, + i16x8_extmul_low_i8x16_u = 0x9e, + i16x8_extmul_high_i8x16_u = 0x9f, + i32x4_extmul_low_i16x8_s = 0xbc, + i32x4_extmul_high_i16x8_s = 0xbd, + i32x4_extmul_low_i16x8_u = 0xbe, + i32x4_extmul_high_i16x8_u = 0xbf, + i64x2_extmul_low_i32x4_s = 0xdc, + i64x2_extmul_high_i32x4_s = 0xdd, + i64x2_extmul_low_i32x4_u = 0xde, + i64x2_extmul_high_i32x4_u = 0xdf, + i16x8_q15mulr_sat_s = 0x82, + v128_any_true = 0x53, + v128_load8_lane = 0x54, + v128_load16_lane = 0x55, + v128_load32_lane = 0x56, + v128_load64_lane = 0x57, + v128_store8_lane = 0x58, + v128_store16_lane = 0x59, + v128_store32_lane = 0x5a, + v128_store64_lane = 0x5b, + i64x2_eq = 0xd6, + i64x2_ne = 0xd7, + i64x2_lt_s = 0xd8, + i64x2_gt_s = 0xd9, + i64x2_le_s = 0xda, + i64x2_ge_s = 0xdb, + i64x2_all_true = 0xc3, + f64x2_convert_low_i32x4_s = 0xfe, + f64x2_convert_low_i32x4_u = 0xff, + i32x4_trunc_sat_f64x2_s_zero = 0xfc, + i32x4_trunc_sat_f64x2_u_zero = 0xfd, + f32x4_demote_f64x2_zero = 0x5e, + f64x2_promote_low_f32x4 = 0x5f, + i8x16_popcnt = 0x62, + i16x8_extadd_pairwise_i8x16_s = 
0x7c, + i16x8_extadd_pairwise_i8x16_u = 0x7d, + i32x4_extadd_pairwise_i16x8_s = 0x7e, + i32x4_extadd_pairwise_i16x8_u = 0x7f, +} diff --git a/src/mono/wasm/runtime/jiterpreter-support.ts b/src/mono/wasm/runtime/jiterpreter-support.ts index 010e67496b19b..306757573a59e 100644 --- a/src/mono/wasm/runtime/jiterpreter-support.ts +++ b/src/mono/wasm/runtime/jiterpreter-support.ts @@ -4,7 +4,7 @@ import { mono_assert } from "./types"; import { NativePointer, ManagedPointer, VoidPtr } from "./types/emscripten"; import { Module, runtimeHelpers } from "./globals"; -import { WasmOpcode } from "./jiterpreter-opcodes"; +import { WasmOpcode, WasmSimdOpcode } from "./jiterpreter-opcodes"; import { MintOpcode } from "./mintops"; import cwraps from "./cwraps"; @@ -118,7 +118,6 @@ type ImportedFunctionInfo = { typeIndex: number; module: string; name: string; - assumeUsed: boolean; func: Function; } @@ -166,6 +165,7 @@ export class WasmBuilder { nextConstantSlot = 0; compressImportNames = false; + lockImports = false; constructor(constantSlotCount: number) { this.stack = [new BlobBuilder()]; @@ -178,6 +178,7 @@ export class WasmBuilder { this.stackSize = 1; this.inSection = false; this.inFunction = false; + this.lockImports = false; this.locals.clear(); this.functionTypeCount = this.permanentFunctionTypeCount; @@ -186,13 +187,12 @@ export class WasmBuilder { this.functionTypesByIndex = Object.create(this.permanentFunctionTypesByIndex); this.nextImportIndex = 0; - this.importedFunctionCount = this.permanentImportedFunctionCount; + this.importedFunctionCount = 0; this.importedFunctions = Object.create(this.permanentImportedFunctions); for (const k in this.importedFunctions) { const f = this.importedFunctions[k]; - if (!f.assumeUsed) - f.index = undefined; + f.index = undefined; } this.functions.length = 0; @@ -235,15 +235,45 @@ export class WasmBuilder { return current.getArrayView(false).slice(0, current.size); } + getWasmImports () : WebAssembly.Imports { + const result : any = { + 
c: this.getConstants(), + m: { h: (Module).asm.memory }, + f: { f: getWasmFunctionTable() }, + }; + + const importsToEmit = this.getImportsToEmit(); + + for (let i = 0; i < importsToEmit.length; i++) { + const ifi = importsToEmit[i]; + if (typeof (ifi.func) !== "function") + throw new Error(`Import '${ifi.name}' not found or not a function`); + + const mangledName = this.getCompressedName(ifi); + let subTable = result[ifi.module]; + if (!subTable) { + subTable = result[ifi.module] = {}; + } + subTable[mangledName] = ifi.func; + } + + return result; + } + // HACK: Approximate amount of space we need to generate the full module at present // FIXME: This does not take into account any other functions already generated if they weren't // emitted into the module immediately - get bytesGeneratedSoFar() { + get bytesGeneratedSoFar () { + const importSize = this.compressImportNames + // mod (2 bytes) name (2-3 bytes) type (1 byte) typeidx (1-2 bytes) + ? 8 + // we keep the uncompressed import names somewhat short, generally, so +12 bytes is about right + : 20; + return this.stack[0].size + // HACK: A random constant for section headers and padding 32 + - // mod (2 bytes) name (2-3 bytes) type (1 byte) typeidx (1-2 bytes) - (this.importedFunctionCount * 8) + + (this.importedFunctionCount * importSize) + // type index for each function (this.functions.length * 2) + // export entry for each export @@ -264,7 +294,13 @@ export class WasmBuilder { return this.current.appendU8(value); } - appendU32(value: number) { + appendSimd (value: WasmSimdOpcode) { + this.current.appendU8(WasmOpcode.PREFIX_simd); + // Yes that's right. We're using LEB128 to encode 8-bit opcodes. Why? 
I don't know + return this.current.appendULeb(value); + } + + appendU32 (value: number) { return this.current.appendU32(value); } @@ -424,8 +460,8 @@ export class WasmBuilder { return imports; } - getCompressedName(ifi: ImportedFunctionInfo) { - if (!this.compressImportNames || typeof (ifi.index) !== "number") + getCompressedName (ifi: ImportedFunctionInfo) { + if (!this.compressImportNames || typeof(ifi.index) !== "number") return ifi.name; let result = compressedNameCache[ifi.index!]; @@ -434,23 +470,31 @@ export class WasmBuilder { return result; } - _generateImportSection() { - const importsToEmit = []; + getImportsToEmit () { + const result = []; for (const k in this.importedFunctions) { - const f = this.importedFunctions[k]; - if (f.index !== undefined) - importsToEmit.push(f); + const v = this.importedFunctions[k]; + if (typeof (v.index) !== "number") + continue; + result.push(v); } - importsToEmit.sort((lhs, rhs) => lhs.index! - rhs.index!); + result.sort((lhs, rhs) => lhs.index! 
- rhs.index!); + // console.log("result=[" + result.map(f => `#${f.index} ${f.module}.${f.name}`) + "]"); + return result; + } + + _generateImportSection () { + const importsToEmit = this.getImportsToEmit(); + this.lockImports = true; // Import section this.beginSection(2); - this.appendULeb(1 + importsToEmit.length + this.constantSlots.length); + this.appendULeb(2 + importsToEmit.length + this.constantSlots.length); - // console.log(`referenced ${importsToEmit.length}/${allImports.length} import(s)`); + // console.log(`referenced ${importsToEmit.length} import(s)`); for (let i = 0; i < importsToEmit.length; i++) { const ifi = importsToEmit[i]; - // console.log(` #${ifi.index} ${ifi.module}.${ifi.name} = ${ifi.friendlyName}`); + // console.log(` #${ifi.index} ${ifi.module}.${ifi.name} = ${ifi.func}`); this.appendName(ifi.module); this.appendName(this.getCompressedName(ifi)); this.appendU8(0x0); // function @@ -472,14 +516,26 @@ export class WasmBuilder { this.appendU8(0x00); // Minimum size is in 64k pages, not bytes this.appendULeb(0x01); + + this.appendName("f"); + this.appendName("f"); + // tabletype + this.appendU8(0x01); + // funcref + this.appendU8(0x70); + // limits = { min=0x01, max=infinity } + this.appendU8(0x00); + this.appendULeb(0x01); } defineImportedFunction( module: string, name: string, functionTypeName: string, - assumeUsed: boolean, permanent: boolean, func: Function | number - ): ImportedFunctionInfo { - if (permanent && (this.importedFunctionCount > this.permanentImportedFunctionCount)) - throw new Error("New permanent imports cannot be defined after non-permanent ones"); + permanent: boolean, func: Function | number + ) : ImportedFunctionInfo { + if (this.lockImports) + throw new Error("Import section already generated"); + if (permanent && (this.importedFunctionCount > 0)) + throw new Error("New permanent imports cannot be defined after any indexes have been assigned"); const type = this.functionTypes[functionTypeName]; if (!type) throw new 
Error("No function type named " + functionTypeName); @@ -487,23 +543,15 @@ export class WasmBuilder { throw new Error("A permanent import must have a permanent function type"); const typeIndex = type[0]; const table = permanent ? this.permanentImportedFunctions : this.importedFunctions; - const index = assumeUsed - ? ( - permanent - ? this.permanentImportedFunctionCount++ - : this.importedFunctionCount++ - ) - : undefined; if (typeof (func) === "number") func = getWasmFunctionTable().get(func); if (typeof (func) !== "function") throw new Error(`Value passed for imported function ${name} was not a function or valid function pointer`); const result = table[name] = { - index, + index: undefined, typeIndex, module, name, - assumeUsed, func }; return result; @@ -581,11 +629,21 @@ export class WasmBuilder { this.endSection(); } - callImport(name: string) { + call_indirect (functionTypeName: string, tableIndex: number) { + const type = this.functionTypes[functionTypeName]; + if (!type) + throw new Error("No function type named " + functionTypeName); + const typeIndex = type[0]; + this.appendU8(WasmOpcode.call_indirect); + this.appendULeb(typeIndex); + this.appendULeb(tableIndex); + } + + callImport (name: string) { const func = this.importedFunctions[name]; if (!func) throw new Error("No imported function named " + name); - if (func.index === undefined) + if (typeof (func.index) !== "number") func.index = this.importedFunctionCount++; this.appendU8(WasmOpcode.call); this.appendULeb(func.index); @@ -1325,6 +1383,9 @@ export const elapsedTimes = { compilation: 0 }; +export const simdFallbackCounters : { [name: string] : number } = { +}; + export const counters = { traceCandidates: 0, tracesCompiled: 0, @@ -1336,6 +1397,7 @@ export const counters = { nullChecksEliminated: 0, backBranchesEmitted: 0, backBranchesNotEmitted: 0, + simdFallback: simdFallbackCounters, }; export const _now = (globalThis.performance && globalThis.performance.now) @@ -1636,6 +1698,13 @@ export 
function importDef(name: string, fn: Function): [string, string, Function return [name, name, fn]; } +export function bytesFromHex (hex: string) : Uint8Array { + const bytes = new Uint8Array(hex.length / 2); + for (let i = 0; i < hex.length; i += 2) + bytes[i / 2] = parseInt(hex.substring(i, i + 2), 16); + return bytes; +} + export type JiterpreterOptions = { enableAll?: boolean; enableTraces: boolean; @@ -1644,6 +1713,7 @@ export type JiterpreterOptions = { enableBackwardBranches: boolean; enableCallResume: boolean; enableWasmEh: boolean; + enableSimd: boolean; // For locations where the jiterpreter heuristic says we will be unable to generate // a trace, insert an entry point opcode anyway. This enables collecting accurate // stats for options like estimateHeat, but raises overhead. @@ -1685,6 +1755,7 @@ const optionNames: { [jsName: string]: string } = { "enableBackwardBranches": "jiterpreter-backward-branch-entries-enabled", "enableCallResume": "jiterpreter-call-resume-enabled", "enableWasmEh": "jiterpreter-wasm-eh-enabled", + "enableSimd": "jiterpreter-simd-enabled", "enableStats": "jiterpreter-stats-enabled", "disableHeuristic": "jiterpreter-disable-heuristic", "estimateHeat": "jiterpreter-estimate-heat", diff --git a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts index 0f80f0661ee97..cdce074e9845b 100644 --- a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts @@ -2,24 +2,28 @@ // The .NET Foundation licenses this file to you under the MIT license. 
import { mono_assert, MonoMethod } from "./types"; +import { Module } from "./globals"; import { NativePointer } from "./types/emscripten"; import { getU16, getI16, getU32_unaligned, getI32_unaligned, getF32_unaligned, getF64_unaligned, } from "./memory"; -import { WasmOpcode } from "./jiterpreter-opcodes"; -import { MintOpcode, OpcodeInfo } from "./mintops"; +import { WasmOpcode, WasmSimdOpcode } from "./jiterpreter-opcodes"; +import { + MintOpcode, OpcodeInfo, SimdInfo, + SimdIntrinsic2, SimdIntrinsic3, SimdIntrinsic4 +} from "./mintops"; import cwraps from "./cwraps"; import { MintOpcodePtr, WasmValtype, WasmBuilder, append_memset_dest, append_bailout, append_exit, append_memmove_dest_src, try_append_memset_fast, - try_append_memmove_fast, counters, + try_append_memmove_fast, counters, bytesFromHex, getMemberOffset, JiterpMember, BailoutReason, getOpcodeTableValue } from "./jiterpreter-support"; import { - sizeOfDataItem, + sizeOfDataItem, sizeOfV128, sizeOfStackval, disabledOpcodes, countCallTargets, callTargetCounts, trapTraceErrors, @@ -158,7 +162,7 @@ export function generateWasmBody ( ) : number { const abort = 0; let isFirstInstruction = true, isConditionallyExecuted = false, - firstOpcodeInBlock = true; + firstOpcodeInBlock = true, containsSimd = false; let result = 0, prologueOpcodeCounter = 0, conditionalOpcodeCounter = 0; @@ -203,9 +207,20 @@ export function generateWasmBody ( let opcode = getU16(ip); const info = OpcodeInfo[opcode]; + const isSimdIntrins = (opcode >= MintOpcode.MINT_SIMD_INTRINS_P_P) && + (opcode <= MintOpcode.MINT_SIMD_INTRINS_P_PPP); + const simdIntrinsArgCount = isSimdIntrins + ? opcode - MintOpcode.MINT_SIMD_INTRINS_P_P + 2 + : 0; + const simdIntrinsIndex = isSimdIntrins + ? getArgU16(ip, 1 + simdIntrinsArgCount) + : 0; + mono_assert(info, () => `invalid opcode ${opcode}`); - const opname = info[0]; + const opname = isSimdIntrins + ? 
SimdInfo[simdIntrinsArgCount][simdIntrinsIndex] + : info[0]; const _ip = ip; const isBackBranchTarget = builder.options.noExitBackwardBranches && is_backward_branch_target(ip, startOfBody, backwardBranchTable), @@ -1293,6 +1308,14 @@ export function generateWasmBody ( append_exit(builder, ip, exitOpcodeCounter, BailoutReason.ComplexBranch); } else ip = abort; + } else if ( + (opcode >= MintOpcode.MINT_SIMD_V128_LDC) && + (opcode <= MintOpcode.MINT_SIMD_INTRINS_P_PPP) + ) { + if (!emit_simd(builder, ip, opcode, opname, simdIntrinsArgCount, simdIntrinsIndex)) + ip = abort; + else + containsSimd = true; } else if (opcodeValue === 0) { // This means it was explicitly marked as no-value in the opcode value table // so we can just skip over it. This is done for things like nops. @@ -1376,6 +1399,11 @@ export function generateWasmBody ( // console.log(`estimated size: ${builder.size + builder.cfg.overheadBytes + builder.bytesGeneratedSoFar}`); + // HACK: Traces containing simd will be *much* shorter than non-simd traces, + // which will cause both the heuristic and our length requirement outside + // to reject them. For now, just add a big constant to the length + if (containsSimd) + result += 10240; return result; } @@ -1404,12 +1432,16 @@ function append_branch_target_block (builder: WasmBuilder, ip: MintOpcodePtr, is builder.cfg.startBranchBlock(ip, isBackBranchTarget); } -function append_ldloc (builder: WasmBuilder, offset: number, opcode: WasmOpcode) { +function append_ldloc (builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) { builder.local("pLocals"); - builder.appendU8(opcode); + builder.appendU8(opcodeOrPrefix); + if (simdOpcode !== undefined) { + // This looks wrong but I assure you it's correct. 
+ builder.appendULeb(simdOpcode); + } // stackval is 8 bytes, but pLocals might not be 8 byte aligned so we use 4 // wasm spec prohibits alignment higher than natural alignment, just to be annoying - const alignment = (opcode > WasmOpcode.f64_load) ? 0 : 2; + const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_load) ? 0 : 2; builder.appendMemarg(offset, alignment); } @@ -1418,11 +1450,15 @@ function append_ldloc (builder: WasmBuilder, offset: number, opcode: WasmOpcode) // where the offset+alignment pair is referred to as a 'memarg' by the spec. // The actual store operation is equivalent to `pBase[offset] = value` (alignment has no // observable impact on behavior, other than causing compilation failures if out of range) -function append_stloc_tail (builder: WasmBuilder, offset: number, opcode: WasmOpcode) { - builder.appendU8(opcode); +function append_stloc_tail (builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) { + builder.appendU8(opcodeOrPrefix); + if (simdOpcode !== undefined) { + // This looks wrong but I assure you it's correct. + builder.appendULeb(simdOpcode); + } // stackval is 8 bytes, but pLocals might not be 8 byte aligned so we use 4 // wasm spec prohibits alignment higher than natural alignment, just to be annoying - const alignment = (opcode > WasmOpcode.f64_store) ? 0 : 2; + const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_store) ? 0 : 2; builder.appendMemarg(offset, alignment); invalidate_local(offset); } @@ -1759,13 +1795,13 @@ function emit_fieldop ( case MintOpcode.MINT_STFLD_O: { /* * Writing a ref-type field has to call an import to perform the write barrier anyway, - * and technically it should use a different kind of barrier from copy_pointer. So + * and technically it should use a different kind of barrier from copy_ptr. 
So * we define a special import that is responsible for performing the whole stfld_o * operation with as little trace-side overhead as possible * Previously the pseudocode looked like: * cknull_ptr = *(MonoObject *)&locals[objectOffset]; * if (!cknull_ptr) bailout; - * copy_pointer(cknull_ptr + fieldOffset, *(MonoObject *)&locals[localOffset]) + * copy_ptr(cknull_ptr + fieldOffset, *(MonoObject *)&locals[localOffset]) * The null check optimization also allows us to safely omit the bailout check * if we know that the target object isn't null. Even if the target object were * somehow null in this case (bad! shouldn't be possible!) it won't be a crash @@ -1938,7 +1974,7 @@ function emit_sfieldop ( // src append_ldloca(builder, localOffset, 0); // FIXME: Use mono_gc_wbarrier_set_field_internal - builder.callImport("copy_pointer"); + builder.callImport("copy_ptr"); return true; case MintOpcode.MINT_LDSFLD_VT: { const sizeBytes = getArgU16(ip, 4); @@ -2048,7 +2084,7 @@ const unopTable : { [opcode: number]: OpRec3 | undefined } = { [MintOpcode.MINT_POPCNT_I4]: [WasmOpcode.i32_popcnt, WasmOpcode.i32_load, WasmOpcode.i32_store], [MintOpcode.MINT_CLZ_I8]: [WasmOpcode.i64_clz, WasmOpcode.i64_load, WasmOpcode.i64_store], [MintOpcode.MINT_CTZ_I8]: [WasmOpcode.i64_ctz, WasmOpcode.i64_load, WasmOpcode.i64_store], - [MintOpcode.MINT_POPCNT_I8]: [WasmOpcode.i64_popcnt, WasmOpcode.i32_load, WasmOpcode.i32_store], + [MintOpcode.MINT_POPCNT_I8]: [WasmOpcode.i64_popcnt, WasmOpcode.i64_load, WasmOpcode.i64_store], }; // HACK: Generating correct wasm for these is non-trivial so we hand them off to C. 
@@ -3023,7 +3059,7 @@ function emit_indirectop (builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintO builder.local("cknull_ptr"); // Load address of value so that copy_managed_pointer can grab it append_ldloca(builder, valueVarIndex, 0); - builder.callImport("copy_pointer"); + builder.callImport("copy_ptr"); } else { // Pre-load address for the store operation builder.local("cknull_ptr"); @@ -3237,6 +3273,331 @@ function emit_arrayop (builder: WasmBuilder, frame: NativePointer, ip: MintOpcod return true; } +const vec128Test = + "0061736d0100000001040160000003020100070801047465737400000a090107004100fd111a0b"; +let wasmSimdSupported : boolean | undefined; + +function getIsWasmSimdSupported () : boolean { + if (wasmSimdSupported !== undefined) + return wasmSimdSupported; + + // Probe whether the current environment can handle wasm v128 opcodes. + try { + // Load and compile a test module that uses i32x4.splat. See wasm-simd-feature-detect.wat/wasm + const bytes = bytesFromHex(vec128Test); + counters.bytesGenerated += bytes.length; + new WebAssembly.Module(bytes); + wasmSimdSupported = true; + } catch (exc) { + console.log("MONO_WASM: Disabling WASM SIMD support due to JIT failure", exc); + wasmSimdSupported = false; + } + + return wasmSimdSupported; +} + +function get_import_name ( + builder: WasmBuilder, typeName: string, + functionPtr: number +) : string { + const name = `${typeName}_${functionPtr.toString(16)}`; + if (typeof (builder.importedFunctions[name]) !== "object") + builder.defineImportedFunction("s", name, typeName, false, functionPtr); + + return name; +} + +const simdCreateSizes = { + [MintOpcode.MINT_SIMD_V128_I1_CREATE]: 1, + [MintOpcode.MINT_SIMD_V128_I2_CREATE]: 2, + [MintOpcode.MINT_SIMD_V128_I4_CREATE]: 4, + [MintOpcode.MINT_SIMD_V128_I8_CREATE]: 8, +}; + +const simdCreateLoadOps = { + [MintOpcode.MINT_SIMD_V128_I1_CREATE]: WasmOpcode.i32_load8_s, + [MintOpcode.MINT_SIMD_V128_I2_CREATE]: WasmOpcode.i32_load16_s, + 
[MintOpcode.MINT_SIMD_V128_I4_CREATE]: WasmOpcode.i32_load, + [MintOpcode.MINT_SIMD_V128_I8_CREATE]: WasmOpcode.i64_load, +}; + +const simdCreateStoreOps = { + [MintOpcode.MINT_SIMD_V128_I1_CREATE]: WasmOpcode.i32_store8, + [MintOpcode.MINT_SIMD_V128_I2_CREATE]: WasmOpcode.i32_store16, + [MintOpcode.MINT_SIMD_V128_I4_CREATE]: WasmOpcode.i32_store, + [MintOpcode.MINT_SIMD_V128_I8_CREATE]: WasmOpcode.i64_store, +}; + +function emit_simd ( + builder: WasmBuilder, ip: MintOpcodePtr, + opcode: MintOpcode, opname: string, + argCount: number, index: number +) : boolean { + // First, if compiling an intrinsic attempt to emit the special vectorized implementation + // We only do this if SIMD is enabled since we'll be using the v128 opcodes. + if (builder.options.enableSimd && getIsWasmSimdSupported()) { + switch (argCount) { + case 2: + if (emit_simd_2(builder, ip, index)) + return true; + break; + case 3: + if (emit_simd_3(builder, ip, index)) + return true; + break; + case 4: + if (emit_simd_4(builder, ip, index)) + return true; + break; + } + } + + // Fall back to a mix of non-vectorized wasm and the interpreter's implementation of the opcodes + switch (opcode) { + case MintOpcode.MINT_SIMD_V128_LDC: { + if (builder.options.enableSimd && getIsWasmSimdSupported()) { + builder.local("pLocals"); + builder.appendSimd(WasmSimdOpcode.v128_const); + const view = Module.HEAPU8.slice(ip + 4, ip + 4 + sizeOfV128); + builder.appendBytes(view); + append_simd_store(builder, ip); + } else { + // dest + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src (ip + 2) + builder.ptr_const(ip + 4); + append_memmove_dest_src(builder, sizeOfV128); + } + return true; + } + case MintOpcode.MINT_SIMD_V128_I1_CREATE: + case MintOpcode.MINT_SIMD_V128_I2_CREATE: + case MintOpcode.MINT_SIMD_V128_I4_CREATE: + case MintOpcode.MINT_SIMD_V128_I8_CREATE: { + // These opcodes pack a series of locals into a vector + const elementSize = simdCreateSizes[opcode], + numElements = sizeOfV128 / 
elementSize, + destOffset = getArgU16(ip, 1), + srcOffset = getArgU16(ip, 2), + loadOp = simdCreateLoadOps[opcode], + storeOp = simdCreateStoreOps[opcode]; + for (let i = 0; i < numElements; i++) { + builder.local("pLocals"); + // load element from stack slot + append_ldloc(builder, srcOffset + (i * sizeOfStackval), loadOp); + // then store to destination element + append_stloc_tail(builder, destOffset + (i * elementSize), storeOp); + } + return true; + } + case MintOpcode.MINT_SIMD_INTRINS_P_P: { + counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1; + // res + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src + append_ldloca(builder, getArgU16(ip, 2), 0); + const importName = get_import_name(builder, "simd_p_p", cwraps.mono_jiterp_get_simd_intrinsic(1, index)); + builder.callImport(importName); + return true; + } + case MintOpcode.MINT_SIMD_INTRINS_P_PP: { + counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1; + // res + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src + append_ldloca(builder, getArgU16(ip, 2), 0); + append_ldloca(builder, getArgU16(ip, 3), 0); + const importName = get_import_name(builder, "simd_p_pp", cwraps.mono_jiterp_get_simd_intrinsic(2, index)); + builder.callImport(importName); + return true; + } + case MintOpcode.MINT_SIMD_INTRINS_P_PPP: { + counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1; + // res + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src + append_ldloca(builder, getArgU16(ip, 2), 0); + append_ldloca(builder, getArgU16(ip, 3), 0); + append_ldloca(builder, getArgU16(ip, 4), 0); + const importName = get_import_name(builder, "simd_p_ppp", cwraps.mono_jiterp_get_simd_intrinsic(3, index)); + builder.callImport(importName); + return true; + } + default: + console.log(`MONO_WASM: jiterpreter emit_simd failed for ${opname}`); + return false; + } +} + +function append_simd_store (builder: WasmBuilder, ip: MintOpcodePtr) { + 
append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store); +} + +function append_simd_2_load (builder: WasmBuilder, ip: MintOpcodePtr, loadOp?: WasmSimdOpcode) { + builder.local("pLocals"); + // This || is harmless since v128_load is 0 + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, loadOp || WasmSimdOpcode.v128_load); +} + +function append_simd_3_load (builder: WasmBuilder, ip: MintOpcodePtr) { + builder.local("pLocals"); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + // FIXME: Can rhs be a scalar? We handle shifts separately already + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); +} + +function append_simd_4_load (builder: WasmBuilder, ip: MintOpcodePtr) { + builder.local("pLocals"); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 4), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); +} + +const simdShiftTable = new Set([ + SimdIntrinsic3.V128_I1_LEFT_SHIFT, + SimdIntrinsic3.V128_I2_LEFT_SHIFT, + SimdIntrinsic3.V128_I4_LEFT_SHIFT, + SimdIntrinsic3.V128_I8_LEFT_SHIFT, + + SimdIntrinsic3.V128_I1_RIGHT_SHIFT, + SimdIntrinsic3.V128_I2_RIGHT_SHIFT, + SimdIntrinsic3.V128_I4_RIGHT_SHIFT, + + SimdIntrinsic3.V128_I1_URIGHT_SHIFT, + SimdIntrinsic3.V128_I2_URIGHT_SHIFT, + SimdIntrinsic3.V128_I4_URIGHT_SHIFT, + SimdIntrinsic3.V128_I8_URIGHT_SHIFT, +]); + +function append_stloc_simd_zero (builder: WasmBuilder, offset: number) { + builder.local("pLocals"); + builder.appendSimd(WasmSimdOpcode.v128_const); + builder.appendBytes(new Uint8Array(sizeOfV128)); + append_stloc_tail(builder, offset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store); +} + +function emit_simd_2 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic2) : boolean { + const 
simple = cwraps.mono_jiterp_get_simd_opcode(1, index); + if (simple) { + append_simd_2_load(builder, ip); + builder.appendSimd(simple); + append_simd_store(builder, ip); + return true; + } + + switch (index) { + case SimdIntrinsic2.V128_I1_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i32_load8_s); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store8); + return true; + case SimdIntrinsic2.V128_I2_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i32_load16_s); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store16); + return true; + case SimdIntrinsic2.V128_I4_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i32_load); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store); + return true; + case SimdIntrinsic2.V128_I8_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i64_load); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i64_store); + return true; + + case SimdIntrinsic2.V128_I1_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load8_splat); + append_simd_store(builder, ip); + return true; + case SimdIntrinsic2.V128_I2_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load16_splat); + append_simd_store(builder, ip); + return true; + case SimdIntrinsic2.V128_I4_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load32_splat); + append_simd_store(builder, ip); + return true; + case SimdIntrinsic2.V128_I8_CREATE: + 
append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load64_splat); + append_simd_store(builder, ip); + return true; + + default: + return false; + } +} + +function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic3) : boolean { + const simple = cwraps.mono_jiterp_get_simd_opcode(2, index); + if (simple) { + const isShift = simdShiftTable.has(index); + if (isShift) { + builder.local("pLocals"); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.i32_load); + builder.appendSimd(simple); + append_simd_store(builder, ip); + } else { + append_simd_3_load(builder, ip); + builder.appendSimd(simple); + append_simd_store(builder, ip); + } + return true; + } + + switch (index) { + case SimdIntrinsic3.V128_BITWISE_EQUALITY: + case SimdIntrinsic3.V128_BITWISE_INEQUALITY: + append_simd_3_load(builder, ip); + // FIXME: i64x2_ne and i64x2_any_true? + builder.appendSimd(WasmSimdOpcode.i64x2_eq); + builder.appendSimd(WasmSimdOpcode.i64x2_all_true); + if (index === SimdIntrinsic3.V128_BITWISE_INEQUALITY) + builder.appendU8(WasmOpcode.i32_eqz); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store); + return true; + default: + return false; + } + + return false; +} + +function emit_simd_4 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic4) : boolean { + const simple = cwraps.mono_jiterp_get_simd_opcode(3, index); + if (simple) { + append_simd_4_load(builder, ip); + builder.appendSimd(simple); + append_simd_store(builder, ip); + return true; + } + + switch (index) { + case SimdIntrinsic4.V128_CONDITIONAL_SELECT: + builder.local("pLocals"); + // Wasm spec: result = ior𝑁(iand𝑁(𝑖1, 𝑖3), iand𝑁(𝑖2, inot𝑁(𝑖3))) + // Our opcode: *arg0 = (*arg2 & *arg1) | (*arg3 & ~*arg1) + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 4), WasmOpcode.PREFIX_simd, 
WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + builder.appendSimd(WasmSimdOpcode.v128_bitselect); + append_simd_store(builder, ip); + return true; + default: + return false; + } +} + function append_safepoint (builder: WasmBuilder, ip: MintOpcodePtr) { // Check whether a safepoint is required builder.ptr_const(cwraps.mono_jiterp_get_polling_required_address()); diff --git a/src/mono/wasm/runtime/jiterpreter.ts b/src/mono/wasm/runtime/jiterpreter.ts index 1d355272ae85c..7c2e66bde25cb 100644 --- a/src/mono/wasm/runtime/jiterpreter.ts +++ b/src/mono/wasm/runtime/jiterpreter.ts @@ -12,7 +12,8 @@ import { MintOpcode, OpcodeInfo } from "./mintops"; import cwraps from "./cwraps"; import { MintOpcodePtr, WasmValtype, WasmBuilder, addWasmFunctionPointer, - _now, elapsedTimes, counters, getRawCwrap, importDef, + _now, elapsedTimes, + counters, getRawCwrap, importDef, JiterpreterOptions, getOptions, recordFailure, JiterpMember, getMemberOffset, BailoutReasonNames, BailoutReason @@ -138,6 +139,8 @@ export const traceInfo: { [key: string]: TraceInfo } = {}; export const sizeOfDataItem = 4, sizeOfObjectHeader = 8, + sizeOfV128 = 16, + sizeOfStackval = 8, // While stats are enabled, dump concise stats every N traces so that it's clear a long-running // task isn't frozen if it's jitting lots of traces autoDumpInterval = 500; @@ -261,7 +264,7 @@ function getTraceImports() { traceImports = [ importDef("bailout", recordBailout), - importDef("copy_pointer", getRawCwrap("mono_wasm_copy_managed_pointer")), + importDef("copy_ptr", getRawCwrap("mono_wasm_copy_managed_pointer")), importDef("entry", getRawCwrap("mono_jiterp_increase_entry_count")), importDef("value_copy", getRawCwrap("mono_jiterp_value_copy")), importDef("gettype", getRawCwrap("mono_jiterp_gettype_ref")), @@ -376,8 +379,7 @@ function initialize_builder(builder: WasmBuilder) { WasmValtype.i32, true ); builder.defineType( - "copy_pointer", - { + 
"copy_ptr", { "dest": WasmValtype.i32, "src": WasmValtype.i32 }, @@ -693,13 +695,34 @@ function initialize_builder(builder: WasmBuilder) { }, WasmValtype.i32, true ); + builder.defineType( + "simd_p_p", { + "arg0": WasmValtype.i32, + "arg1": WasmValtype.i32, + }, WasmValtype.void, true + ); + builder.defineType( + "simd_p_pp", { + "arg0": WasmValtype.i32, + "arg1": WasmValtype.i32, + "arg2": WasmValtype.i32, + }, WasmValtype.void, true + ); + builder.defineType( + "simd_p_ppp", { + "arg0": WasmValtype.i32, + "arg1": WasmValtype.i32, + "arg2": WasmValtype.i32, + "arg3": WasmValtype.i32, + }, WasmValtype.void, true + ); const traceImports = getTraceImports(); // Pre-define function imports as persistent for (let i = 0; i < traceImports.length; i++) { mono_assert(traceImports[i], () => `trace #${i} missing`); - builder.defineImportedFunction("i", traceImports[i][0], traceImports[i][1], false, true, traceImports[i][2]); + builder.defineImportedFunction("i", traceImports[i][0], traceImports[i][1], true, traceImports[i][2]); } } @@ -836,17 +859,15 @@ function generate_wasm( if (trace > 0) console.log(`${((builder.base)).toString(16)} ${methodFullName || traceName} generated ${buffer.length} byte(s) of wasm`); counters.bytesGenerated += buffer.length; + if (buffer.length >= maxModuleSize) { console.warn(`MONO_WASM: Jiterpreter generated too much code (${buffer.length} bytes) for trace ${traceName}. 
Please report this issue.`); return 0; } - const traceModule = new WebAssembly.Module(buffer); - const traceInstance = new WebAssembly.Instance(traceModule, { - i: builder.getImportedFunctionTable(), - c: builder.getConstants(), - m: { h: (Module).asm.memory }, - }); + const traceModule = new WebAssembly.Module(buffer); + const wasmImports = builder.getWasmImports(); + const traceInstance = new WebAssembly.Instance(traceModule, wasmImports); // Get the exported trace function const fn = traceInstance.exports[traceName]; @@ -907,7 +928,7 @@ function generate_wasm( console.log(builder.traceBuf[i]); } - console.log(`// MONO_WASM: ${methodFullName || methodName}:${traceOffset.toString(16)} generated, blob follows //`); + console.log(`// MONO_WASM: ${methodFullName || traceName} generated, blob follows //`); let s = "", j = 0; try { // We may have thrown an uncaught exception while inside a block, @@ -1194,7 +1215,10 @@ export function jiterpreter_dump_stats(b?: boolean, concise?: boolean) { console.log(`// ${keys[i]}: ${abortCounts[keys[i]]} abort(s)`); } - if ((typeof (globalThis.setTimeout) === "function") && (b !== undefined)) + for (const k in counters.simdFallback) + console.log(`// simd ${k}: ${counters.simdFallback[k]} fallback insn(s)`); + + if ((typeof(globalThis.setTimeout) === "function") && (b !== undefined)) setTimeout( () => jiterpreter_dump_stats(b), 15000 diff --git a/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm b/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm new file mode 100644 index 0000000000000000000000000000000000000000..5d7c49d0bcbda0301cd143711052253b05534c49 GIT binary patch literal 39 ucmZQbEY4+QU|?WmVN76PU}j=uVCP_DDM>9ZVPN3mWMpS>WcVv6#SH*n1qBiS literal 0 HcmV?d00001 diff --git a/src/mono/wasm/runtime/wasm-simd-feature-detect.wat b/src/mono/wasm/runtime/wasm-simd-feature-detect.wat new file mode 100644 index 0000000000000..8cd56adf584e1 --- /dev/null +++ b/src/mono/wasm/runtime/wasm-simd-feature-detect.wat @@ -0,0 +1,6 @@ +(module 
+ (func $test (export "test") + (i32x4.splat (i32.const 0)) + drop + ) +) diff --git a/src/mono/wasm/wasm.proj b/src/mono/wasm/wasm.proj index cb76cb0d19e02..5bb5fb38c0ec0 100644 --- a/src/mono/wasm/wasm.proj +++ b/src/mono/wasm/wasm.proj @@ -25,7 +25,7 @@ $([MSBuild]::NormalizeDirectory('$(PkgMicrosoft_NETCore_Runtime_ICU_Transport)', 'runtimes', 'browser-wasm', 'native', 'lib')) $([MSBuild]::NormalizeDirectory('$(PkgMicrosoft_NETCore_Runtime_ICU_Transport)', 'runtimes', 'browser-wasm-threads', 'native', 'lib')) - false + true true false emcc @@ -279,7 +279,7 @@ <_EmccLinkFlags Include="-s INITIAL_MEMORY=$(EmccInitialHeapSize)" /> <_EmccLinkFlags Include="-s STACK_SIZE=$(EmccStackSize)" /> - <_EmccCommonFlags Condition="'$(WasmEnableSIMD)' == 'true'" Include="-msimd128" /> + <_EmccCommonFlags Include="-msimd128" /> <_EmccCommonFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-s USE_PTHREADS=1" /> <_EmccLinkFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-Wno-pthreads-mem-growth" /> <_EmccLinkFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-s PTHREAD_POOL_SIZE=0" /> @@ -539,9 +539,9 @@ - +