Compare commits

...

4 Commits

Author SHA1 Message Date
JosJuice
0c89c00d8b Merge pull request #13929 from SuperSamus/dcbz-jit-improvements
Jit: Small `dcbz` improvements
2025-09-30 18:52:09 +02:00
Martino Fontana
667c523755 Jit: Use dcbz() emitter for dcbz_l instead of interpreter fallback
Without cache emulation, these instructions are functionally identical.
In the interpreter, their only difference is related to HID register checks, which the JIT already doesn't do for `dcbz`.
A loop with `dcbz_l` is used in the SDK function `LCEnable`, which is called frequently in some games.
2025-09-02 23:15:28 +02:00
Martino Fontana
d1ba849876 Jit64: dcbz, use VMOVAPS with YMM registers on AVX CPUs
LLVM does this, so let's do it as well.
2025-09-02 23:15:27 +02:00
Martino Fontana
5e69da7eba x64Emitter: Support YMM registers
This is accomplished by adding a 0x100 bit to the register.
Made sure that, on AVX instructions, that bit is truncated.
2025-09-02 22:15:16 +02:00
6 changed files with 47 additions and 14 deletions

View File

@@ -250,7 +250,7 @@ void OpArg::WriteVEX(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp
int X = !(indexReg & 8);
int B = !(offsetOrBaseReg & 8);
int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
u8 vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
// do we need any VEX fields that only appear in the three-byte form?
if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
@@ -343,7 +343,7 @@ void OpArg::WriteRest(XEmitter* emit, int extraBytes, X64Reg _operandReg,
if (SIB)
oreg = 4;
emit->WriteModRM(mod, _operandReg & 7, oreg & 7);
emit->WriteModRM(mod, _operandReg, oreg);
if (SIB)
{
@@ -1844,8 +1844,9 @@ void XEmitter::WriteVEXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, con
{
int mmmmm = GetVEXmmmmm(op);
int pp = GetVEXpp(opPrefix);
// FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
arg.WriteVEX(this, regOp1, regOp2, 0, pp, mmmmm, W);
// Note that mixing an XMM register with a YMM register is invalid, which isn't checked here.
int L = (regOp1 != INVALID_REG && regOp1 & 0x100) || (regOp2 != INVALID_REG && regOp2 & 0x100);
arg.WriteVEX(this, regOp1, regOp2, L, pp, mmmmm, W);
Write8(op & 0xFF);
arg.WriteRest(this, extrabytes, regOp1);
}
@@ -1857,19 +1858,23 @@ void XEmitter::WriteVEXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, co
Write8((u8)regOp3 << 4);
}
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
int W, int extrabytes)
// Raises a panic alert when cpu_info.bAVX is false. The AVX emitter entry points call this
// before writing any instruction bytes, so that emitting AVX code for a CPU that does not
// report AVX support is flagged instead of passing silently.
void CheckAVXSupport()
{
  if (cpu_info.bAVX)
    return;

  PanicAlertFmt("Trying to use AVX on a system that doesn't support it. Bad programmer.");
}
// Emits a VEX-encoded instruction for an AVX operation.
// First runs CheckAVXSupport(), then forwards every argument unchanged to WriteVEXOp(),
// which performs the actual encoding.
void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
int W, int extrabytes)
{
CheckAVXSupport();
WriteVEXOp(opPrefix, op, regOp1, regOp2, arg, W, extrabytes);
}
void XEmitter::WriteAVXOp4(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
X64Reg regOp3, int W)
{
if (!cpu_info.bAVX)
PanicAlertFmt("Trying to use AVX on a system that doesn't support it. Bad programmer.");
CheckAVXSupport();
WriteVEXOp4(opPrefix, op, regOp1, regOp2, arg, regOp3, W);
}
@@ -3029,6 +3034,19 @@ void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);
}
// Emits the store form of VMOVAPS (VEX.0F 29 /r): an aligned store of regOp to arg.
//
// The source register has to land in ModRM.reg, which is the regOp1 slot of WriteAVXOp: that
// is the operand WriteVEXOp hands to WriteRest. This instruction has no second register
// operand, so regOp2 is INVALID_REG, which makes WriteVEX emit the required vvvv = 1111b.
// Passing a YMM register (bit 0x100 set) selects the 256-bit form through VEX.L.
void XEmitter::VMOVAPS(const OpArg& arg, X64Reg regOp)
{
  WriteAVXOp(0x00, 0x29, regOp, X64Reg::INVALID_REG, arg);
}
// Emits VZEROUPPER as a fixed three-byte sequence, after checking for AVX support.
// The instruction takes no operands, so the bytes are written directly instead of going
// through WriteVEXOp.
void XEmitter::VZEROUPPER()
{
CheckAVXSupport();
// Two-byte VEX prefix (0xC5, 0xF8) followed by the opcode byte 0x77.
Write8(0xC5);
Write8(0xF8);
Write8(0x77);
}
void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg)
{
WriteFMA3Op(0x98, regOp1, regOp2, arg);

View File

@@ -876,6 +876,10 @@ public:
void VPOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VPXOR(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VMOVAPS(const OpArg& arg, X64Reg regOp);
void VZEROUPPER();
// FMA3
void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);

View File

@@ -72,7 +72,8 @@ enum X64Reg
XMM14,
XMM15,
YMM0 = 0,
// Use the bit 0x100 to distinguish XMM and YMM registers.
YMM0 = 0x100,
YMM1,
YMM2,
YMM3,

View File

@@ -101,7 +101,7 @@ constexpr std::array<Jit64OpTemplate, 13> s_table4{{
{592, &Jit64::ps_mergeXX}, // ps_merge10
{624, &Jit64::ps_mergeXX}, // ps_merge11
{1014, &Jit64::FallBackToInterpreter}, // dcbz_l
{1014, &Jit64::dcbz}, // dcbz_l
}};
constexpr std::array<Jit64OpTemplate, 17> s_table4_2{{

View File

@@ -8,6 +8,7 @@
#include "Common/Assert.h"
#include "Common/BitSet.h"
#include "Common/CPUDetect.h"
#include "Common/CommonTypes.h"
#include "Common/MsgHandler.h"
#include "Common/x64ABI.h"
@@ -473,9 +474,18 @@ void Jit64::dcbz(UGeckoInstruction inst)
FixupBranch slow = J_CC(CC_Z, Jump::Near);
// Fast path: compute full address, then zero out 32 bytes of memory.
XORPS(XMM0, R(XMM0));
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
if (cpu_info.bAVX)
{
VXORPS(XMM0, XMM0, R(XMM0));
VMOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), YMM0);
VZEROUPPER();
}
else
{
XORPS(XMM0, R(XMM0));
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
}
// Slow path: call the general-case code.
SwitchToFarCode();

View File

@@ -101,7 +101,7 @@ constexpr std::array<JitArm64OpTemplate, 13> s_table4{{
{592, &JitArm64::ps_mergeXX}, // ps_merge10
{624, &JitArm64::ps_mergeXX}, // ps_merge11
{1014, &JitArm64::FallBackToInterpreter}, // dcbz_l
{1014, &JitArm64::dcbz}, // dcbz_l
}};
constexpr std::array<JitArm64OpTemplate, 17> s_table4_2{{