a32_jitstate: Optimize runtime location descriptor calculation

Calculation is now one unaligned 64-bit load.
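The trick: A32JitState now places the PC slot (Reg[15]), the cpsr_et byte, a padding byte and the fpcr_mode field in eight consecutive bytes, and the location descriptor's bit layout is rearranged to match (PC in the low half), so the value the RSB and fast dispatch key on is simply the raw memory at offsetof_pc. Below is a minimal standalone sketch of the idea; the field types and names are assumptions for illustration, not the real A32JitState:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Assumed layout sketch, not the real A32JitState. What matters is that
    // the four fields below are contiguous, mirroring the static_asserts
    // in the first hunk.
    struct JitStateSketch {
        std::uint32_t Reg[16];   // Reg[15] is the PC
        std::uint8_t cpsr_et;    // E flag as 0x10, T flag as 0x01
        std::uint8_t padding;
        std::uint16_t fpcr_mode;
    };

    std::uint64_t LocationDescriptorOf(const JitStateSketch& s) {
        constexpr std::size_t offsetof_pc =
            offsetof(JitStateSketch, Reg) + 15 * sizeof(std::uint32_t);
        // On a little-endian x64 host this one (possibly unaligned) qword
        // read equals PC | (cpsr_et << 32) | (fpcr_mode << 48).
        std::uint64_t desc;
        std::memcpy(&desc, reinterpret_cast<const char*>(&s) + offsetof_pc,
                    sizeof(desc));
        return desc;
    }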
MerryMage 2019-05-05 23:17:15 +01:00
parent 0de3993373
commit 6e2cd35e4f
5 changed files with 67 additions and 76 deletions


@@ -217,16 +217,14 @@ void A32EmitX64::GenMemoryAccessors() {
 }

 void A32EmitX64::GenTerminalHandlers() {
-    // PC ends up in ebp, location_descriptor ends up in rbx
+    // location_descriptor ends up in rbx
     const auto calculate_location_descriptor = [this] {
         // This calculation has to match up with IREmitter::PushRSB
-        // TODO: Optimization is available here based on known state of fpcr_mode and cpsr_et.
-        code.mov(ecx, MJitStateReg(A32::Reg::PC));
-        code.mov(ebp, ecx);
-        code.shl(rcx, 32);
-        code.mov(ebx, dword[r15 + offsetof(A32JitState, fpcr_mode)]);
-        code.or_(ebx, dword[r15 + offsetof(A32JitState, cpsr_et)]);
-        code.or_(rbx, rcx);
+        constexpr size_t offsetof_pc = offsetof(A32JitState, Reg) + 15 * sizeof(u32);
+        static_assert(offsetof_pc + 4 == offsetof(A32JitState, cpsr_et));
+        static_assert(offsetof_pc + 5 == offsetof(A32JitState, padding));
+        static_assert(offsetof_pc + 6 == offsetof(A32JitState, fpcr_mode));
+        code.mov(rbx, qword[r15 + offsetof_pc]);
     };

     Xbyak::Label fast_dispatch_cache_miss, rsb_cache_miss;
@@ -254,10 +252,11 @@ void A32EmitX64::GenTerminalHandlers() {
     calculate_location_descriptor();
     code.L(rsb_cache_miss);
     code.mov(r12, reinterpret_cast<u64>(fast_dispatch_table.data()));
+    code.mov(rbp, rbx);
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tSSE42)) {
-        code.crc32(ebp, r12d);
+        code.crc32(rbp, r12);
     }
-    code.and_(ebp, fast_dispatch_table_mask);
+    code.and_(rbp, fast_dispatch_table_mask);
     code.lea(rbp, ptr[r12 + rbp]);
     code.cmp(rbx, qword[rbp + offsetof(FastDispatchEntry, location_descriptor)]);
     code.jne(fast_dispatch_cache_miss);
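With the whole descriptor in rbx, the fast-dispatch probe can hash all 64 bits at once: the old crc32(ebp, r12d) mixed only the 32-bit PC, while crc32(rbp, r12) covers PC, cpsr_et and fpcr_mode in a single instruction. A scalar model of the index computation follows; the table size is a placeholder, and treating the table base as the CRC data word is read off the emitted instructions above, not off any documented interface:

    #include <cstddef>
    #include <cstdint>
    #include <nmmintrin.h>  // _mm_crc32_u64, requires SSE4.2

    // Placeholder size; the real fast_dispatch_table size may differ.
    constexpr std::size_t kTableEntries = 1 << 16;

    std::size_t FastDispatchIndex(std::uint64_t descriptor, std::uint64_t table_base) {
        // mov rbp, rbx / crc32 rbp, r12: the descriptor seeds the CRC
        // accumulator and the table base is folded in as the data word.
        const std::uint64_t h = _mm_crc32_u64(descriptor, table_base);
        // and_(rbp, fast_dispatch_table_mask)
        return static_cast<std::size_t>(h) & (kTableEntries - 1);
    }

On hosts without SSE4.2 the emitted code skips the crc32 and masks the descriptor copy directly.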
@@ -349,18 +348,16 @@ void A32EmitX64::EmitA32GetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         const Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
         const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();

-        // Here we observe that cpsr_et and cpsr_ge are right next to each other in memory,
-        // so we load them both at the same time with one 64-bit read. This allows us to
-        // extract all of their bits together at once with one pext.
-        static_assert(offsetof(A32JitState, cpsr_et) + 4 == offsetof(A32JitState, cpsr_ge));
-        code.mov(result.cvt64(), qword[r15 + offsetof(A32JitState, cpsr_et)]);
-        code.mov(tmp.cvt64(), 0x80808080'00000003ull);
-        code.pext(result.cvt64(), result.cvt64(), tmp.cvt64());
-        code.mov(tmp, 0x000f0220);
-        code.pdep(result, result, tmp);
+        code.mov(result, dword[r15 + offsetof(A32JitState, cpsr_ge)]);
+        code.mov(tmp, 0x80808080);
+        code.pext(result, result, tmp);
+        code.shl(result, 16);

         code.mov(tmp, dword[r15 + offsetof(A32JitState, cpsr_q)]);
         code.shl(tmp, 27);
         code.or_(result, tmp);
+        code.movzx(tmp, code.byte[r15 + offsetof(A32JitState, cpsr_et)]);
+        code.shl(tmp, 5);
+        code.or_(result, tmp);
         code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_nzcv)]);
         code.or_(result, dword[r15 + offsetof(A32JitState, cpsr_jaifm)]);
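The rewritten read side reassembles CPSR from the now-separate fields: pext gathers the sign bit of each GE byte (each byte is 0x00 or 0xFF) into bits 3:0, which the shift left by 16 moves to CPSR.GE at bits 19:16, and the et byte's 0x10/0x01 encoding reaches CPSR.E (bit 9) and CPSR.T (bit 5) with a single shift left by 5. A scalar model of the sequence, with the jit-state fields passed as parameters for illustration:

    #include <cstdint>
    #include <immintrin.h>  // _pext_u32, requires BMI2

    std::uint32_t GetCpsrModel(std::uint32_t cpsr_ge,  // each byte 0x00 or 0xFF
                               std::uint32_t cpsr_q,   // 0 or 1
                               std::uint8_t cpsr_et,   // E = 0x10, T = 0x01
                               std::uint32_t cpsr_nzcv,
                               std::uint32_t cpsr_jaifm) {
        // pext with 0x80808080 pulls bit 7 of each GE byte into bits 3:0.
        std::uint32_t result = _pext_u32(cpsr_ge, 0x80808080) << 16;  // GE -> bits 19:16
        result |= cpsr_q << 27;                 // Q -> bit 27
        result |= std::uint32_t(cpsr_et) << 5;  // 0x10 -> bit 9 (E), 0x01 -> bit 5 (T)
        return result | cpsr_nzcv | cpsr_jaifm;
    }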
@@ -382,7 +379,6 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
     if (code.DoesCpuSupport(Xbyak::util::Cpu::tBMI2)) {
         const Xbyak::Reg32 cpsr = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg32 tmp = ctx.reg_alloc.ScratchGpr().cvt32();
-        const Xbyak::Reg32 tmp2 = ctx.reg_alloc.ScratchGpr().cvt32();

         // cpsr_q
         code.bt(cpsr, 27);
@@ -398,18 +394,21 @@ void A32EmitX64::EmitA32SetCpsr(A32EmitContext& ctx, IR::Inst* inst) {
         code.and_(tmp, 0x07F0FDDF);
         code.mov(dword[r15 + offsetof(A32JitState, cpsr_jaifm)], tmp);

-        // cpsr_et and cpsr_ge
-        static_assert(offsetof(A32JitState, cpsr_et) + 4 == offsetof(A32JitState, cpsr_ge));
-        code.mov(tmp, 0x000f0220);
-        code.pext(cpsr, cpsr, tmp);
-        code.mov(tmp.cvt64(), 0x01010101'00000003ull);
-        code.pdep(cpsr.cvt64(), cpsr.cvt64(), tmp.cvt64());
+        // cpsr_et
+        code.mov(tmp, cpsr);
+        code.shr(tmp, 5);
+        code.and_(tmp, 0x11);
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt8());
+
+        // cpsr_ge
+        code.shr(cpsr, 16);
+        code.mov(tmp, 0x01010101);
+        code.pdep(cpsr, cpsr, tmp);
         // We perform SWAR partitioned subtraction here, to negate the GE bytes.
-        code.mov(tmp.cvt64(), 0x80808080'00000003ull);
-        code.mov(tmp2.cvt64(), tmp.cvt64());
-        code.sub(tmp.cvt64(), cpsr.cvt64());
-        code.xor_(tmp.cvt64(), tmp2.cvt64());
-        code.mov(qword[r15 + offsetof(A32JitState, cpsr_et)], tmp.cvt64());
+        code.mov(tmp, 0x80808080);
+        code.sub(tmp, cpsr);
+        code.xor_(tmp, 0x80808080);
+        code.mov(dword[r15 + offsetof(A32JitState, cpsr_ge)], tmp);
     } else {
         ctx.reg_alloc.HostCall(nullptr, args[0]);
         code.mov(code.ABI_PARAM2, code.r15);
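The write side is the inverse mapping: (cpsr >> 5) & 0x11 drops E and T straight into the byte encoding, and for GE, pdep spreads CPSR bits 19:16 into the low bit of each byte, after which the SWAR partitioned subtraction noted in the comment widens each 0x01 byte to 0xFF. Within every byte lane, (0x80 - b) ^ 0x80 is 0xFF for b = 1 and 0x00 for b = 0, and since no lane ever borrows, all four lanes fit in one 32-bit subtract. A scalar model, with reference parameters standing in for the A32JitState fields:

    #include <cstdint>
    #include <immintrin.h>  // _pdep_u32, requires BMI2

    void SetCpsrPartsModel(std::uint32_t cpsr, std::uint8_t& cpsr_et, std::uint32_t& cpsr_ge) {
        // shr tmp, 5; and tmp, 0x11: E (bit 9) -> 0x10, T (bit 5) -> 0x01.
        cpsr_et = static_cast<std::uint8_t>((cpsr >> 5) & 0x11);

        // shr cpsr, 16; pdep with 0x01010101: GE bits 19:16 become the
        // low bit of each byte (0x00 or 0x01 per lane).
        const std::uint32_t ge = _pdep_u32(cpsr >> 16, 0x01010101);

        // SWAR partitioned subtraction: 0x80 - 0x01 = 0x7F, xor 0x80 = 0xFF;
        // 0x80 - 0x00 = 0x80, xor 0x80 = 0x00. No borrow crosses a lane.
        cpsr_ge = (0x80808080u - ge) ^ 0x80808080u;
    }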
@@ -660,11 +659,11 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
         const u32 new_pc = arg.GetImmediateU32();
         const u32 mask = Common::Bit<0>(new_pc) ? 0xFFFFFFFE : 0xFFFFFFFC;
         u32 et = 0;
-        et |= ctx.Location().EFlag() ? 2 : 0;
-        et |= Common::Bit<0>(new_pc) ? 1 : 0;
+        et |= ctx.Location().EFlag() ? 0x10 : 0;
+        et |= Common::Bit<0>(new_pc) ? 0x01 : 0;

         code.mov(MJitStateReg(A32::Reg::PC), new_pc & mask);
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], et);
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], u8(et));
     } else {
         if (ctx.Location().EFlag()) {
             const Xbyak::Reg32 new_pc = ctx.reg_alloc.UseScratchGpr(arg).cvt32();
@@ -673,8 +672,8 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
             code.mov(mask, new_pc);
             code.and_(mask, 1);
-            code.lea(et, ptr[mask.cvt64() + 2]);
-            code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], et);
+            code.lea(et, ptr[mask.cvt64() + 0x10]);
+            code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], et.cvt8());

             code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
             code.and_(new_pc, mask);
             code.mov(MJitStateReg(A32::Reg::PC), new_pc);
@@ -684,7 +683,7 @@ void A32EmitX64::EmitA32BXWritePC(A32EmitContext& ctx, IR::Inst* inst) {
             code.mov(mask, new_pc);
             code.and_(mask, 1);
-            code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], mask);
+            code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], mask.cvt8());

             code.lea(mask, ptr[mask.cvt64() + mask.cvt64() * 1 - 4]); // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC
             code.and_(new_pc, mask);
             code.mov(MJitStateReg(A32::Reg::PC), new_pc);
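Both register paths end with the branchless idiom described by the inline comment: with mask = new_pc & 1, lea computes mask + mask - 4, giving 0xFFFFFFFE (keep halfword alignment) for a Thumb target and 0xFFFFFFFC (keep word alignment) for an ARM target, and the E-flag variant likewise builds the et byte as 0x10 + (new_pc & 1) with one lea. The arithmetic, checked in isolation:

    #include <cstdint>

    // mask = pc & 1 ? 0xFFFFFFFE : 0xFFFFFFFC, computed without a branch.
    constexpr std::uint32_t AlignmentMask(std::uint32_t new_pc) {
        const std::uint32_t t = new_pc & 1;  // and_(mask, 1)
        return t + t - 4;                    // lea(mask, ptr[mask + mask*1 - 4])
    }
    static_assert(AlignmentMask(0x1001) == 0xFFFFFFFE);  // Thumb: bit 0 set
    static_assert(AlignmentMask(0x1000) == 0xFFFFFFFC);  // ARM: bit 0 clear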
@@ -1261,17 +1260,17 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::ReturnToDispatch, IR::LocationDescri
     code.ReturnFromRunCode();
 }

-static u32 CalculateCpsr_et(const IR::LocationDescriptor& arg) {
+static u8 CalculateCpsr_et(const IR::LocationDescriptor& arg) {
     const A32::LocationDescriptor desc{arg};
-    u32 et = 0;
-    et |= desc.EFlag() ? 2 : 0;
-    et |= desc.TFlag() ? 1 : 0;
+    u8 et = 0;
+    et |= desc.EFlag() ? 0x10 : 0;
+    et |= desc.TFlag() ? 0x01 : 0;
     return et;
 }

 void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDescriptor initial_location) {
     if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) {
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
     }

     code.cmp(qword[r15 + offsetof(A32JitState, cycles_remaining)], 0);
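The 0x10/0x01 constants in CalculateCpsr_et mirror the runtime paths above: the et byte is exactly (CPSR >> 5) & 0x11, so comparing the bytes computed for two location descriptors is enough to decide whether a linked block needs to rewrite cpsr_et at all. A quick sanity check of the mapping:

    // E lives at CPSR bit 9 and T at CPSR bit 5; both drop onto the et
    // byte via a right shift of 5.
    static_assert(((1u << 9) >> 5) == 0x10, "E -> 0x10");
    static_assert(((1u << 5) >> 5) == 0x01, "T -> 0x01");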
@@ -1296,7 +1295,7 @@ void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlock terminal, IR::LocationDesc

 void A32EmitX64::EmitTerminalImpl(IR::Term::LinkBlockFast terminal, IR::LocationDescriptor initial_location) {
     if (CalculateCpsr_et(terminal.next) != CalculateCpsr_et(initial_location)) {
-        code.mov(dword[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
+        code.mov(code.byte[r15 + offsetof(A32JitState, cpsr_et)], CalculateCpsr_et(terminal.next));
     }

     patch_information[terminal.next].jmp.emplace_back(code.getCurr());