diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp index 21acf1a7b..f109f0453 100644 --- a/src/core/cpu_patches.cpp +++ b/src/core/cpu_patches.cpp @@ -202,21 +202,39 @@ static void RestoreStack(Xbyak::CodeGenerator& c) { /// Switches to the patch stack, saves registers, and restores the original stack. static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { + // Uses a more robust solution for saving registers on MacOS to avoid potential stack corruption + // if games decide to not follow the ABI and use the red zone. +#ifdef __APPLE__ SaveStack(c); +#else + c.lea(rsp, ptr[rsp - 128]); // red zone +#endif for (const auto& reg : regs) { c.push(reg.cvt64()); } +#ifdef __APPLE__ RestoreStack(c); +#else + c.lea(rsp, ptr[rsp + 128]); +#endif } /// Switches to the patch stack, restores registers, and restores the original stack. static void RestoreRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { +#ifdef __APPLE__ SaveStack(c); +#else + c.lea(rsp, ptr[rsp - 128]); // red zone +#endif for (const auto& reg : regs) { c.pop(reg.cvt64()); } +#ifdef __APPLE__ RestoreStack(c); +#else + c.lea(rsp, ptr[rsp + 128]); +#endif } /// Switches to the patch stack and stores all registers. @@ -257,12 +275,11 @@ static void RestoreContext(Xbyak::CodeGenerator& c, const Xbyak::Operand& dst, RestoreStack(c); } -#ifdef __APPLE__ - static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); const auto src2 = ZydisToXbyakOperand(operands[2]); + ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "ANDN overwriting the stack pointer"); // Check if src2 is a memory operand or a register different to dst. // In those cases, we don't need to use a temporary register and are free to modify dst. @@ -301,6 +318,7 @@ static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenera const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); const auto start_len = ZydisToXbyakRegisterOperand(operands[2]); + ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "BEXTR overwriting the stack pointer"); const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast(start_len.getBit())); const auto scratch1 = @@ -338,6 +356,7 @@ static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenera static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); + ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "BLSI overwriting the stack pointer"); const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); @@ -367,6 +386,7 @@ static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); + ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "BLSMSK overwriting the stack pointer"); const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); @@ -395,9 +415,37 @@ static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGener RestoreRegisters(c, {scratch}); } +static void GenerateTZCNT(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { + const auto dst = ZydisToXbyakRegisterOperand(operands[0]); + const auto src = ZydisToXbyakOperand(operands[1]); + ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "TZCNT overwriting the stack pointer"); + + Xbyak::Label src_zero, end; + + c.cmp(*src, 0); + c.je(src_zero); + + // If src is not zero, functions like a BSF, but also clears the CF + c.bsf(dst, *src); + c.clc(); + c.jmp(end); + + c.L(src_zero); + c.mov(dst, operands[0].size); + // Since dst is not zero, also set ZF to zero. Testing dst with itself when we know + // it isn't zero is a good way to do this. + // Use cvt32 to avoid REX/Operand size prefixes. + c.test(dst.cvt32(), dst.cvt32()); + // When source is zero, TZCNT also sets CF. + c.stc(); + + c.L(end); +} + static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); + ASSERT_MSG(dst.getIdx() != rsp.getIdx(), "BLSR overwriting the stack pointer"); const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); @@ -426,6 +474,8 @@ static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerat RestoreRegisters(c, {scratch}); } +#ifdef __APPLE__ + static __attribute__((sysv_abi)) void PerformVCVTPH2PS(float* out, const half_float::half* in, const u32 count) { for (u32 i = 0; i < count; i++) { @@ -616,6 +666,11 @@ static bool FilterNoSSE4a(const ZydisDecodedOperand*) { return !cpu.has(Cpu::tSSE4a); } +static bool FilterNoBMI1(const ZydisDecodedOperand*) { + Cpu cpu; + return !cpu.has(Cpu::tBMI1); +} + static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE; @@ -897,14 +952,16 @@ static const std::unordered_map Patches = { {ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}}, {ZYDIS_MNEMONIC_INSERTQ, {FilterNoSSE4a, GenerateINSERTQ, true}}, + // BMI1 + {ZYDIS_MNEMONIC_ANDN, {FilterNoBMI1, GenerateANDN, true}}, + {ZYDIS_MNEMONIC_BEXTR, {FilterNoBMI1, GenerateBEXTR, true}}, + {ZYDIS_MNEMONIC_BLSI, {FilterNoBMI1, GenerateBLSI, true}}, + {ZYDIS_MNEMONIC_BLSMSK, {FilterNoBMI1, GenerateBLSMSK, true}}, + {ZYDIS_MNEMONIC_BLSR, {FilterNoBMI1, GenerateBLSR, true}}, + {ZYDIS_MNEMONIC_TZCNT, {FilterNoBMI1, GenerateTZCNT, true}}, + #ifdef __APPLE__ // Patches for instruction sets not supported by Rosetta 2. - // BMI1 - {ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}}, - {ZYDIS_MNEMONIC_BEXTR, {FilterRosetta2Only, GenerateBEXTR, true}}, - {ZYDIS_MNEMONIC_BLSI, {FilterRosetta2Only, GenerateBLSI, true}}, - {ZYDIS_MNEMONIC_BLSMSK, {FilterRosetta2Only, GenerateBLSMSK, true}}, - {ZYDIS_MNEMONIC_BLSR, {FilterRosetta2Only, GenerateBLSR, true}}, // F16C {ZYDIS_MNEMONIC_VCVTPH2PS, {FilterRosetta2Only, GenerateVCVTPH2PS, true}}, {ZYDIS_MNEMONIC_VCVTPS2PH, {FilterRosetta2Only, GenerateVCVTPS2PH, true}},