diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c index b8e4f4cfc..677df1720 100644 --- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c +++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c @@ -87,6 +87,33 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip if(!vex.l) YMM0(gd); break; + case 0x04: + INST_NAME("PMADDUBSW Gx, Vx, Ex"); + nextop = F8; + q0 = fpu_get_scratch(dyn, ninst); + q1 = fpu_get_scratch(dyn, ninst); + for(int l=0; l<1+vex.l; ++l) { + if(!l) { GETGX_empty_VXEX(v0, v2, v1, 0); } else { GETGY_empty_VYEY(v0, v2, v1); } + if(v0==v1 || v0==v2) { + if(!l) d0 = fpu_get_scratch(dyn, ninst); + } else + d0 = v0; + UXTL_8(q0, v2); // this is unsigned, so 0 extended + SXTL_8(q1, v1); // this is signed + VMULQ_16(q0, q0, q1); + SADDLPQ_16(q1, q0); + UXTL2_8(q0, v2); // this is unsigned + SQXTN_16(d0, q1); // SQXTN reset the vector so need to grab the high part first + SXTL2_8(q1, v1); // this is signed + VMULQ_16(q0, q0, q1); + SADDLPQ_16(q0, q0); + SQXTN2_16(d0, q0); + if(v0!=d0) + VMOVQ(v0, d0); + } + if(!vex.l) YMM0(gd); + break; + case 0x08: INST_NAME("VPSIGNB Gx, Vx, Ex"); nextop = F8; @@ -957,18 +984,18 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip v2 = sse_get_reg(dyn, ninst, x1, vex.v, 1); v1 = sse_get_reg(dyn, ninst, x1, eb2, 0); } else { - v0 = ymm_get_reg(dyn, ninst, x1, gd, 1, vex.v, eb2, -1); - v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 1, gd, eb2, -1); - v1 = ymm_get_reg(dyn, ninst, x1, eb2, 0, gd, vex.v, -1); + v0 = ymm_get_reg(dyn, ninst, x1, gd, 1, vex.v, (!rex.w)?eb2:-1, -1); + v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 1, gd, (!rex.w)?eb2:-1, -1); + if(!rex.w) v1 = ymm_get_reg(dyn, ninst, x1, eb2, 0, gd, vex.v, -1); } // prepare mask if(rex.w) VSSHRQ_64(v2, v2, 63); else VSSHRQ_32(v2, v2, 31); // prescale the values - if(wb1) VSHLQ_32(q1, v1, wb1); else q1 = v1; + if(wb1) { if(!l || !rex.w) VSHLQ_32(q1, v1, wb1); } else q1 = v1; // slow gather, not much choice here... if(rex.w) for(int i=0; i<2; ++i) { VMOVQDto(x4, v2, i); TBZ(x4, 0, 4+4*4); - SMOVQSto(x4, q1, i); + SMOVQSto(x4, q1, i+l*2); ADDx_REG(x4, x4, ed); VLD1_64(v0, i, x4); VMOVQDfrom(v2, i, xZR);