diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp index 129d05a4f4..8466533166 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp @@ -402,6 +402,24 @@ DEF_OP(CondAddNZCV) { } } +DEF_OP(CondSubNZCV) { + auto Op = IROp->C(); + const auto OpSize = IROp->Size; + + LOGMAN_THROW_AA_FMT(OpSize == IR::i32Bit || OpSize == IR::i64Bit, "Unsupported {} size: {}", __func__, OpSize); + const auto EmitSize = OpSize == IR::i64Bit ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + + ARMEmitter::StatusFlags Flags = (ARMEmitter::StatusFlags)Op->FalseNZCV; + uint64_t Const = 0; + auto Src1 = GetZeroableReg(Op->Src1); + + if (IsInlineConstant(Op->Src2, &Const)) { + ccmp(EmitSize, Src1, Const, Flags, MapSelectCC(Op->Cond)); + } else { + ccmp(EmitSize, Src1, GetReg(Op->Src2.ID()), Flags, MapSelectCC(Op->Cond)); + } +} + DEF_OP(Neg) { auto Op = IROp->C(); const uint8_t OpSize = IROp->Size; diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 2214f4c923..c3a6895906 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3617,75 +3617,90 @@ void OpDispatchBuilder::CMPSOp(OpcodeArgs) { bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; - // read DF once - auto PtrDir = LoadDir(Size); + // If rcx = 0, skip the whole loop. + OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + auto OuterJump = CondJump(Counter, {COND_EQ}); - auto JumpStart = Jump(); - // Make sure to start a new block after ending this one - auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetJumpTarget(JumpStart, LoopStart); - SetCurrentCodeBlock(LoopStart); + auto BeforeLoop = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetFalseJumpTarget(OuterJump, BeforeLoop); + SetCurrentCodeBlock(BeforeLoop); StartNewBlock(); - OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + ForeachDirection([this, Op, Size, REPE](int PtrDir) { + IRPair InnerJump; + auto JumpIntoLoop = Jump(); - // Can we end the block? - auto CondJump_ = CondJump(Counter, {COND_EQ}); - IRPair InternalCondJump; - - auto LoopTail = CreateNewCodeBlockAfter(LoopStart); - SetFalseJumpTarget(CondJump_, LoopTail); - SetCurrentCodeBlock(LoopTail); - StartNewBlock(); + // Setup for the loop + auto LoopHeader = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetCurrentCodeBlock(LoopHeader); + StartNewBlock(); + SetJumpTarget(JumpIntoLoop, LoopHeader); - // Working loop - { - OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); - OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); + // Working loop + { + OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); + OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); - // Only ES prefix - Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); - // Default DS prefix - Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); + // Only ES prefix + Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); + // Default DS prefix + Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); - auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); - auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size); + auto Src1 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); + auto Src2 = _LoadMem(GPRClass, Size, Dest_RSI, Size); - GenerateFlags_SUB(Op, Src2, Src1); + // We'll calculate PF/AF after the loop, so use them as temporaries here. + _StoreRegister(Src1, false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + _StoreRegister(Src2, false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); - // Calculate flags early. - CalculateDeferredFlags(); + OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); - OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); + // Decrement counter + TailCounter = _SubWithFlags(OpSize::i64Bit, TailCounter, _Constant(1)); - // Decrement counter - TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); + // Store the counter since we don't have phis + StoreGPRRegister(X86State::REG_RCX, TailCounter); - // Store the counter since we don't have phis - StoreGPRRegister(X86State::REG_RCX, TailCounter); + // Offset the pointer + Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, _Constant(PtrDir * Size)); + StoreGPRRegister(X86State::REG_RDI, Dest_RDI); - // Offset the pointer - Dest_RDI = _Add(OpSize::i64Bit, Dest_RDI, PtrDir); - StoreGPRRegister(X86State::REG_RDI, Dest_RDI); + // Offset second pointer + Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, _Constant(PtrDir * Size)); + StoreGPRRegister(X86State::REG_RSI, Dest_RSI); - // Offset second pointer - Dest_RSI = _Add(OpSize::i64Bit, Dest_RSI, PtrDir); - StoreGPRRegister(X86State::REG_RSI, Dest_RSI); + // If TailCounter != 0, compare sources. + // If TailCounter == 0, set ZF iff that would break. + _CondSubNZCV(OpSize::i64Bit, Src2, Src1, {COND_NEQ}, REPE ? 0 : (1 << 2) /* Z */); + CachedNZCV = nullptr; + NZCVDirty = false; + InnerJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); - CalculateDeferredFlags(); - InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); + // Jump back to the start if we have more work to do + SetTrueJumpTarget(InnerJump, LoopHeader); + } - // Jump back to the start if we have more work to do - SetTrueJumpTarget(InternalCondJump, LoopStart); - } + // Make sure to start a new block after ending this one + auto LoopEnd = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetFalseJumpTarget(InnerJump, LoopEnd); + SetCurrentCodeBlock(LoopEnd); + StartNewBlock(); + }); // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); - SetTrueJumpTarget(CondJump_, LoopEnd); - - SetFalseJumpTarget(InternalCondJump, LoopEnd); + { + // Grab the sources from the last iteration so we can set flags. + auto Src1 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, pf_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + auto Src2 = _LoadRegister(false, offsetof(FEXCore::Core::CPUState, af_raw), GPRClass, GPRFixedClass, CTX->GetGPRSize()); + GenerateFlags_SUB(Op, Src2, Src1); + CalculateDeferredFlags(); + } + auto Jump_ = Jump(); - SetCurrentCodeBlock(LoopEnd); + auto Exit = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetJumpTarget(Jump_, Exit); + SetTrueJumpTarget(OuterJump, Exit); + SetCurrentCodeBlock(Exit); StartNewBlock(); } } @@ -3716,65 +3731,64 @@ void OpDispatchBuilder::LODSOp(OpcodeArgs) { // Calculate flags early. because end of block CalculateDeferredFlags(); - // XXX: Theoretically LODS could be optimized to - // RSI += {-}(RCX * Size) - // RAX = [RSI - Size] - // But this might violate the case of an application scanning pages for read permission and catching the fault - // May or may not matter - - // Read DF once - auto PtrDir = LoadDir(Size); + ForeachDirection([this, Op, Size](int PtrDir) { + // XXX: Theoretically LODS could be optimized to + // RSI += {-}(RCX * Size) + // RAX = [RSI - Size] + // But this might violate the case of an application scanning pages for read permission and catching the fault + // May or may not matter - auto JumpStart = Jump(); - // Make sure to start a new block after ending this one - auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetJumpTarget(JumpStart, LoopStart); - SetCurrentCodeBlock(LoopStart); - StartNewBlock(); + auto JumpStart = Jump(); + // Make sure to start a new block after ending this one + auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetJumpTarget(JumpStart, LoopStart); + SetCurrentCodeBlock(LoopStart); + StartNewBlock(); - OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); - // Can we end the block? + // Can we end the block? - // We leave if RCX = 0 - auto CondJump_ = CondJump(Counter, {COND_EQ}); + // We leave if RCX = 0 + auto CondJump_ = CondJump(Counter, {COND_EQ}); - auto LoopTail = CreateNewCodeBlockAfter(LoopStart); - SetFalseJumpTarget(CondJump_, LoopTail); - SetCurrentCodeBlock(LoopTail); - StartNewBlock(); + auto LoopTail = CreateNewCodeBlockAfter(LoopStart); + SetFalseJumpTarget(CondJump_, LoopTail); + SetCurrentCodeBlock(LoopTail); + StartNewBlock(); - // Working loop - { - OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); + // Working loop + { + OrderedNode *Dest_RSI = LoadGPRRegister(X86State::REG_RSI); - Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); + Dest_RSI = AppendSegmentOffset(Dest_RSI, Op->Flags, FEXCore::X86Tables::DecodeFlags::FLAG_DS_PREFIX); - auto Src = _LoadMemAutoTSO(GPRClass, Size, Dest_RSI, Size); + auto Src = _LoadMemAutoTSO(GPRClass, Size, Dest_RSI, Size); - StoreResult(GPRClass, Op, Src, -1); + StoreResult(GPRClass, Op, Src, -1); - OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); - OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI); + OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *TailDest_RSI = LoadGPRRegister(X86State::REG_RSI); - // Decrement counter - TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); + // Decrement counter + TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); - // Store the counter since we don't have phis - StoreGPRRegister(X86State::REG_RCX, TailCounter); + // Store the counter since we don't have phis + StoreGPRRegister(X86State::REG_RCX, TailCounter); - // Offset the pointer - TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, PtrDir); - StoreGPRRegister(X86State::REG_RSI, TailDest_RSI); + // Offset the pointer + TailDest_RSI = _Add(OpSize::i64Bit, TailDest_RSI, _Constant(PtrDir * Size)); + StoreGPRRegister(X86State::REG_RSI, TailDest_RSI); - // Jump back to the start, we have more work to do - Jump(LoopStart); - } - // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); - SetTrueJumpTarget(CondJump_, LoopEnd); - SetCurrentCodeBlock(LoopEnd); - StartNewBlock(); + // Jump back to the start, we have more work to do + Jump(LoopStart); + } + // Make sure to start a new block after ending this one + auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); + SetTrueJumpTarget(CondJump_, LoopEnd); + SetCurrentCodeBlock(LoopEnd); + StartNewBlock(); + }); } } @@ -3805,71 +3819,70 @@ void OpDispatchBuilder::SCASOp(OpcodeArgs) { // Calculate flags early. because end of block CalculateDeferredFlags(); - bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; - - // read DF once - auto PtrDir = LoadDir(Size); + ForeachDirection([this, Op, Size](int Dir){ + bool REPE = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_REP_PREFIX; - auto JumpStart = Jump(); - // Make sure to start a new block after ending this one - auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); - SetJumpTarget(JumpStart, LoopStart); - SetCurrentCodeBlock(LoopStart); - StartNewBlock(); + auto JumpStart = Jump(); + // Make sure to start a new block after ending this one + auto LoopStart = CreateNewCodeBlockAfter(GetCurrentBlock()); + SetJumpTarget(JumpStart, LoopStart); + SetCurrentCodeBlock(LoopStart); + StartNewBlock(); - OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *Counter = LoadGPRRegister(X86State::REG_RCX); - // Can we end the block? - // We leave if RCX = 0 - auto CondJump_ = CondJump(Counter, {COND_EQ}); - IRPair InternalCondJump; + // Can we end the block? + // We leave if RCX = 0 + auto CondJump_ = CondJump(Counter, {COND_EQ}); + IRPair InternalCondJump; - auto LoopTail = CreateNewCodeBlockAfter(LoopStart); - SetFalseJumpTarget(CondJump_, LoopTail); - SetCurrentCodeBlock(LoopTail); - StartNewBlock(); + auto LoopTail = CreateNewCodeBlockAfter(LoopStart); + SetFalseJumpTarget(CondJump_, LoopTail); + SetCurrentCodeBlock(LoopTail); + StartNewBlock(); - // Working loop - { - OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); + // Working loop + { + OrderedNode *Dest_RDI = LoadGPRRegister(X86State::REG_RDI); - Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); + Dest_RDI = AppendSegmentOffset(Dest_RDI, 0, FEXCore::X86Tables::DecodeFlags::FLAG_ES_PREFIX, true); - auto Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); - auto Src2 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); + auto Src1 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true}); + auto Src2 = _LoadMemAutoTSO(GPRClass, Size, Dest_RDI, Size); - GenerateFlags_SUB(Op, Src1, Src2); + GenerateFlags_SUB(Op, Src1, Src2); - // Calculate flags early. - CalculateDeferredFlags(); + // Calculate flags early. + CalculateDeferredFlags(); - OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); - OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI); + OrderedNode *TailCounter = LoadGPRRegister(X86State::REG_RCX); + OrderedNode *TailDest_RDI = LoadGPRRegister(X86State::REG_RDI); - // Decrement counter - TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); + // Decrement counter + TailCounter = _Sub(OpSize::i64Bit, TailCounter, _Constant(1)); - // Store the counter since we don't have phis - StoreGPRRegister(X86State::REG_RCX, TailCounter); + // Store the counter since we don't have phis + StoreGPRRegister(X86State::REG_RCX, TailCounter); - // Offset the pointer - TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, PtrDir); - StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); + // Offset the pointer + TailDest_RDI = _Add(OpSize::i64Bit, TailDest_RDI, _Constant(Dir * Size)); + StoreGPRRegister(X86State::REG_RDI, TailDest_RDI); - CalculateDeferredFlags(); - InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); + CalculateDeferredFlags(); + InternalCondJump = CondJumpNZCV({REPE ? COND_EQ : COND_NEQ}); - // Jump back to the start if we have more work to do - SetTrueJumpTarget(InternalCondJump, LoopStart); - } - // Make sure to start a new block after ending this one - auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); - SetTrueJumpTarget(CondJump_, LoopEnd); + // Jump back to the start if we have more work to do + SetTrueJumpTarget(InternalCondJump, LoopStart); + } + // Make sure to start a new block after ending this one + auto LoopEnd = CreateNewCodeBlockAfter(LoopTail); + SetTrueJumpTarget(CondJump_, LoopEnd); - SetFalseJumpTarget(InternalCondJump, LoopEnd); + SetFalseJumpTarget(InternalCondJump, LoopEnd); - SetCurrentCodeBlock(LoopEnd); - StartNewBlock(); + SetCurrentCodeBlock(LoopEnd); + StartNewBlock(); + }); } } diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h index 9c58e50725..6313220b3e 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h @@ -228,6 +228,32 @@ friend class FEXCore::IR::PassManager; return CanHaveSideEffects; } + template + void ForeachDirection(F&& Routine) { + // Otherwise, prepare to branch. + auto Zero = _Constant(0); + + // If the shift is zero, do not touch the flags. + auto ForwardBlock = CreateNewCodeBlockAfter(GetCurrentBlock()); + auto BackwardBlock = CreateNewCodeBlockAfter(ForwardBlock); + auto ExitBlock = CreateNewCodeBlockAfter(BackwardBlock); + + auto DF = GetRFLAG(X86State::RFLAG_DF_RAW_LOC); + CondJump(DF, Zero, ForwardBlock, BackwardBlock, {COND_EQ}); + + for (auto D = 0; D < 2; ++D) { + SetCurrentCodeBlock(D ? BackwardBlock : ForwardBlock); + StartNewBlock(); + { + Routine(D ? -1 : 1); + Jump(ExitBlock); + } + } + + SetCurrentCodeBlock(ExitBlock); + StartNewBlock(); + } + OpDispatchBuilder(FEXCore::Context::ContextImpl *ctx); OpDispatchBuilder(FEXCore::Utils::IntrusivePooledAllocator &Allocator); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index abf9b8586d..0f40cdb12f 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -1035,6 +1035,14 @@ "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" ] }, + "CondSubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2, CondClass:$Cond, u8:$FalseNZCV": { + "Desc": ["If condition is true, set NZCV per difference of GPRs, else force NZCV to a constant."], + "HasSideEffects": true, + "DestSize": "Size", + "EmitValidation": [ + "Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit" + ] + }, "GPR = AdcWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Adds and set NZCV for the sum of two GPRs and carry-in given as NZCV"], "HasSideEffects": true, diff --git a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp index 192ae6520c..60464625da 100644 --- a/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp +++ b/FEXCore/Source/Interface/IR/Passes/ConstProp.cpp @@ -1139,6 +1139,7 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR) break; } case OP_CONDADDNZCV: + case OP_CONDSUBNZCV: { auto Op = IROp->C(); diff --git a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp index 2df6270d8d..ac57749dfa 100644 --- a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp @@ -228,6 +228,7 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp) return {.Read = FlagsForCondClassType(Op->Cond)}; } + case OP_CONDSUBNZCV: case OP_CONDADDNZCV: { auto Op = IROp->CW(); return { diff --git a/unittests/InstructionCountCI/FlagM/Primary.json b/unittests/InstructionCountCI/FlagM/Primary.json index 743376c771..3fa39e3ba8 100644 --- a/unittests/InstructionCountCI/FlagM/Primary.json +++ b/unittests/InstructionCountCI/FlagM/Primary.json @@ -1950,153 +1950,251 @@ ] }, "repz cmpsb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 26, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x30", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "cfinv" ] }, "repz cmpsw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", - "cbz x5, #+0x30", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "cfinv" ] }, "repz cmpsd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", - "cbz x5, #+0x28", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x24" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "cfinv" ] }, "repz cmpsq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", - "cbz x5, #+0x28", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x24" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "cfinv" ] }, "repnz cmpsb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 26, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x30", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "cfinv" ] }, "repnz cmpsw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", - "cbz x5, #+0x30", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "cfinv" ] }, "repnz cmpsd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", - "cbz x5, #+0x28", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x24" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "cfinv" ] }, "repnz cmpsq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 24, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x60", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", - "cbz x5, #+0x28", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "cfinv", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x24" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "cfinv" ] }, "test al, 1": { @@ -2212,136 +2310,234 @@ ] }, "repz scasb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 25, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", + "cbz x5, #+0x28", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "b.eq #-0x24", + "b #+0x2c", "cbz x5, #+0x28", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x1 (1)", "b.eq #-0x24" ] }, "repz scasw": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x2 (2)", + "b.eq #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", "b.eq #-0x24" ] }, "repz scasd": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", + "cbz x5, #+0x20", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "b.eq #-0x1c", + "b #+0x24", "cbz x5, #+0x20", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x4 (4)", "b.eq #-0x1c" ] }, "repz scasq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", "cbz x5, #+0x20", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x8 (8)", + "b.eq #-0x1c", + "b #+0x24", + "cbz x5, #+0x20", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", "b.eq #-0x1c" ] }, "repnz scasb": { - "ExpectedInstructionCount": 11, + "ExpectedInstructionCount": 25, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x1 (1)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", "b.ne #-0x24" ] }, "repnz scasw": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", + "cbz x5, #+0x28", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "b.ne #-0x24", + "b #+0x2c", "cbz x5, #+0x28", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x2 (2)", "b.ne #-0x24" ] }, "repnz scasd": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", "cbz x5, #+0x20", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x4 (4)", + "b.ne #-0x1c", + "b #+0x24", + "cbz x5, #+0x20", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", "b.ne #-0x1c" ] }, "repnz scasq": { - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 21, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x28", "cbz x5, #+0x20", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "cfinv", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "b.ne #-0x1c", + "b #+0x24", + "cbz x5, #+0x20", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", "cfinv", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x8 (8)", "b.ne #-0x1c" ] }, diff --git a/unittests/InstructionCountCI/Primary.json b/unittests/InstructionCountCI/Primary.json index 5ee00256f5..7590652008 100644 --- a/unittests/InstructionCountCI/Primary.json +++ b/unittests/InstructionCountCI/Primary.json @@ -3295,169 +3295,267 @@ ] }, "repz cmpsb": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 28, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x38", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x34" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repz cmpsw": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 28, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", - "cbz x5, #+0x38", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x34" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repz cmpsd": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", - "cbz x5, #+0x30", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repz cmpsq": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", - "cbz x5, #+0x30", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.eq #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", + "ccmp x27, x26, #nzcv, ne", + "b.eq #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsb": { - "ExpectedInstructionCount": 15, + "ExpectedInstructionCount": 28, "Comment": "0xa6", "ExpectedArm64ASM": [ + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", - "cbz x5, #+0x38", - "ldrb w21, [x11]", - "ldrb w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #24", - "cmp w0, w21, lsl #24", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x34" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x1 (1)", + "add x10, x10, #0x1 (1)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrb w26, [x11]", + "ldrb w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", + "sub x10, x10, #0x1 (1)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #24", + "cmp w0, w26, lsl #24", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsw": { - "ExpectedInstructionCount": 16, + "ExpectedInstructionCount": 28, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x70", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", - "cbz x5, #+0x38", - "ldrh w21, [x11]", - "ldrh w22, [x10]", - "eor w27, w22, w21", - "lsl w0, w22, #16", - "cmp w0, w21, lsl #16", - "sub w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x34" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x2 (2)", + "add x10, x10, #0x2 (2)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldrh w26, [x11]", + "ldrh w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", + "sub x10, x10, #0x2 (2)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "lsl w0, w20, #16", + "cmp w0, w26, lsl #16", + "sub w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsd": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", - "cbz x5, #+0x30", - "ldr w21, [x11]", - "ldr w22, [x10]", - "eor w27, w22, w21", - "subs w26, w22, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "add x10, x10, #0x4 (4)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr w26, [x11]", + "ldr w27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x4 (4)", + "sub x10, x10, #0x4 (4)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs w26, w20, w26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "repnz cmpsq": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 26, "Comment": "0xa7", "ExpectedArm64ASM": [ + "cbz x5, #+0x68", "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", - "cbz x5, #+0x30", - "ldr x21, [x11]", - "ldr x22, [x10]", - "eor w27, w22, w21", - "subs x26, x22, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", - "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", - "add x10, x10, x20", - "b.ne #-0x2c" + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x24", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "add x10, x10, #0x8 (8)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "b #+0x20", + "ldr x26, [x11]", + "ldr x27, [x10]", + "subs x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", + "sub x10, x10, #0x8 (8)", + "ccmp x27, x26, #nZcv, ne", + "b.ne #-0x18", + "mov x20, x27", + "eor w27, w20, w26", + "subs x26, x20, x26", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20" ] }, "test al, 1": { @@ -3842,55 +3940,90 @@ ] }, "rep lodsb": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 17, "Comment": "0xac", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x20", "cbz x5, #+0x18", - "ldrb w21, [x10]", - "bfxil x4, x21, #0, #8", + "ldrb w20, [x10]", + "bfxil x4, x20, #0, #8", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x1 (1)", + "b #-0x14", + "b #+0x1c", + "cbz x5, #+0x18", + "ldrb w20, [x10]", + "bfxil x4, x20, #0, #8", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x1 (1)", "b #-0x14" ] }, "rep lodsw": { - "ExpectedInstructionCount": 8, + "ExpectedInstructionCount": 17, "Comment": "0xad", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x20", "cbz x5, #+0x18", - "ldrh w21, [x10]", - "bfxil x4, x21, #0, #16", + "ldrh w20, [x10]", + "bfxil x4, x20, #0, #16", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x2 (2)", + "b #-0x14", + "b #+0x1c", + "cbz x5, #+0x18", + "ldrh w20, [x10]", + "bfxil x4, x20, #0, #16", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x2 (2)", "b #-0x14" ] }, "rep lodsd": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 15, "Comment": "0xad", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x1c", "cbz x5, #+0x14", "ldr w4, [x10]", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x4 (4)", + "b #-0x10", + "b #+0x18", + "cbz x5, #+0x14", + "ldr w4, [x10]", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x4 (4)", "b #-0x10" ] }, "rep lodsq": { - "ExpectedInstructionCount": 7, + "ExpectedInstructionCount": 15, "Comment": "0xad", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x1c", "cbz x5, #+0x14", "ldr x4, [x10]", "sub x5, x5, #0x1 (1)", - "add x10, x10, x20", + "add x10, x10, #0x8 (8)", + "b #-0x10", + "b #+0x18", + "cbz x5, #+0x14", + "ldr x4, [x10]", + "sub x5, x5, #0x1 (1)", + "sub x10, x10, #0x8 (8)", "b #-0x10" ] }, @@ -3955,152 +4088,266 @@ ] }, "repz scasb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 29, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, #+0x30", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x1 (1)", + "b.eq #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", "b.eq #-0x2c" ] }, "repz scasw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 29, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, #+0x30", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x2 (2)", + "b.eq #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", "b.eq #-0x2c" ] }, "repz scasd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "b.eq #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x4 (4)", "b.eq #-0x24" ] }, "repz scasq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x8 (8)", + "b.eq #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x8 (8)", "b.eq #-0x24" ] }, "repnz scasb": { - "ExpectedInstructionCount": 13, + "ExpectedInstructionCount": 29, "Comment": "0xae", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, #+0x30", - "ldrb w21, [x11]", - "eor w27, w4, w21", + "ldrb w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #24", - "cmp w0, w21, lsl #24", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x1 (1)", + "b.ne #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrb w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #24", + "cmp w0, w20, lsl #24", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x1 (1)", "b.ne #-0x2c" ] }, "repnz scasw": { - "ExpectedInstructionCount": 14, + "ExpectedInstructionCount": 29, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #1", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x38", "cbz x5, #+0x30", - "ldrh w21, [x11]", - "eor w27, w4, w21", + "ldrh w20, [x11]", + "eor w27, w4, w20", "lsl w0, w4, #16", - "cmp w0, w21, lsl #16", - "sub w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "add x11, x11, #0x2 (2)", + "b.ne #-0x2c", + "b #+0x34", + "cbz x5, #+0x30", + "ldrh w20, [x11]", + "eor w27, w4, w20", + "lsl w0, w4, #16", + "cmp w0, w20, lsl #16", + "sub w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "sub x11, x11, #0x2 (2)", "b.ne #-0x2c" ] }, "repnz scasd": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #2", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr w21, [x11]", - "eor w27, w4, w21", - "subs w26, w4, w21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x4 (4)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr w20, [x11]", + "eor w27, w4, w20", + "subs w26, w4, w20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x4 (4)", "b.ne #-0x24" ] }, "repnz scasq": { - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 25, "Comment": "0xaf", "ExpectedArm64ASM": [ "ldrsb x20, [x28, #714]", - "lsl x20, x20, #3", + "lsr x20, x20, #63", + "cbz x20, #+0x8", + "b #+0x30", "cbz x5, #+0x28", - "ldr x21, [x11]", - "eor w27, w4, w21", - "subs x26, x4, x21", - "mrs x21, nzcv", - "eor w21, w21, #0x20000000", - "msr nzcv, x21", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", + "sub x5, x5, #0x1 (1)", + "add x11, x11, #0x8 (8)", + "b.ne #-0x24", + "b #+0x2c", + "cbz x5, #+0x28", + "ldr x20, [x11]", + "eor w27, w4, w20", + "subs x26, x4, x20", + "mrs x20, nzcv", + "eor w20, w20, #0x20000000", + "msr nzcv, x20", "sub x5, x5, #0x1 (1)", - "add x11, x11, x20", + "sub x11, x11, #0x8 (8)", "b.ne #-0x24" ] },