// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include #include #include #include #include "common/assert.h" #include "common/types.h" #include "core/tls.h" #include "cpu_patches.h" #ifdef _WIN32 #include #else #include #ifdef __APPLE__ #include #endif #endif using namespace Xbyak::util; namespace Core { static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) { if (reg >= ZYDIS_REGISTER_EAX && reg <= ZYDIS_REGISTER_R15D) { return Xbyak::Reg32(reg - ZYDIS_REGISTER_EAX + Xbyak::Operand::EAX); } if (reg >= ZYDIS_REGISTER_RAX && reg <= ZYDIS_REGISTER_R15) { return Xbyak::Reg64(reg - ZYDIS_REGISTER_RAX + Xbyak::Operand::RAX); } UNREACHABLE_MSG("Unsupported register: {}", static_cast(reg)); } static Xbyak::Reg ZydisToXbyakRegisterOperand(const ZydisDecodedOperand& operand) { ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_REGISTER, "Expected register operand, got type: {}", static_cast(operand.type)); return ZydisToXbyakRegister(operand.reg.value); } static Xbyak::Address ZydisToXbyakMemoryOperand(const ZydisDecodedOperand& operand) { ASSERT_MSG(operand.type == ZYDIS_OPERAND_TYPE_MEMORY, "Expected memory operand, got type: {}", static_cast(operand.type)); if (operand.mem.base == ZYDIS_REGISTER_RIP) { return ptr[rip + operand.mem.disp.value]; } Xbyak::RegExp expression{}; if (operand.mem.base != ZYDIS_REGISTER_NONE) { expression = expression + ZydisToXbyakRegister(operand.mem.base); } if (operand.mem.index != ZYDIS_REGISTER_NONE) { if (operand.mem.scale != 0) { expression = expression + ZydisToXbyakRegister(operand.mem.index) * operand.mem.scale; } else { expression = expression + ZydisToXbyakRegister(operand.mem.index); } } if (operand.mem.disp.size != 0 && operand.mem.disp.value != 0) { expression = expression + operand.mem.disp.value; } return ptr[expression]; } static std::unique_ptr ZydisToXbyakOperand(const ZydisDecodedOperand& operand) { switch (operand.type) { case ZYDIS_OPERAND_TYPE_REGISTER: { return std::make_unique(ZydisToXbyakRegisterOperand(operand)); } case ZYDIS_OPERAND_TYPE_MEMORY: { return std::make_unique(ZydisToXbyakMemoryOperand(operand)); } default: UNREACHABLE_MSG("Unsupported operand type: {}", static_cast(operand.type)); } } static bool OperandUsesRegister(const Xbyak::Operand* operand, int index) { if (operand->isREG()) { return operand->getIdx() == index; } if (operand->isMEM()) { const Xbyak::RegExp& reg_exp = operand->getAddress().getRegExp(); return reg_exp.getBase().getIdx() == index || reg_exp.getIndex().getIdx() == index; } UNREACHABLE_MSG("Unsupported operand kind: {}", static_cast(operand->getKind())); } static bool IsRegisterAllocated( const std::initializer_list& allocated_registers, const int index) { return std::ranges::find_if(allocated_registers.begin(), allocated_registers.end(), [index](const Xbyak::Operand* operand) { return OperandUsesRegister(operand, index); }) != allocated_registers.end(); } static Xbyak::Reg AllocateScratchRegister( const std::initializer_list allocated_registers, const u32 bits) { for (int index = Xbyak::Operand::R8; index <= Xbyak::Operand::R15; index++) { if (!IsRegisterAllocated(allocated_registers, index)) { return Xbyak::Reg32e(index, static_cast(bits)); } } UNREACHABLE_MSG("Out of scratch registers!"); } #ifdef __APPLE__ static constexpr u32 MaxSavedRegisters = 3; static pthread_key_t register_save_slots[MaxSavedRegisters]; static std::once_flag register_save_init_flag; static_assert(sizeof(void*) == sizeof(u64), "Cannot fit a register inside a thread local storage slot."); static void InitializeRegisterSaveSlots() { for (u32 i = 0; i < MaxSavedRegisters; i++) { ASSERT_MSG(pthread_key_create(®ister_save_slots[i], nullptr) == 0, "Unable to allocate thread-local register save slot {}", i); } } static void SaveRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to save {} registers.", regs.size()); std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); u32 index = 0; for (const auto& reg : regs) { const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); c.putSeg(gs); c.mov(qword[offset], reg.cvt64()); } } static void RestoreRegisters(Xbyak::CodeGenerator& c, const std::initializer_list regs) { ASSERT_MSG(regs.size() <= MaxSavedRegisters, "Not enough space to restore {} registers.", regs.size()); std::call_once(register_save_init_flag, &InitializeRegisterSaveSlots); u32 index = 0; for (const auto& reg : regs) { const auto offset = reinterpret_cast(register_save_slots[index++] * sizeof(void*)); c.putSeg(gs); c.mov(reg.cvt64(), qword[offset]); } } static void GenerateANDN(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src1 = ZydisToXbyakRegisterOperand(operands[1]); const auto src2 = ZydisToXbyakOperand(operands[2]); const auto scratch = AllocateScratchRegister({&dst, &src1, src2.get()}, dst.getBit()); SaveRegisters(c, {scratch}); c.mov(scratch, src1); c.not_(scratch); c.and_(scratch, *src2); c.mov(dst, scratch); RestoreRegisters(c, {scratch}); } static void GenerateBEXTR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); const auto start_len = ZydisToXbyakRegisterOperand(operands[2]); const Xbyak::Reg32e shift(Xbyak::Operand::RCX, static_cast(start_len.getBit())); const auto scratch1 = AllocateScratchRegister({&dst, src.get(), &start_len, &shift}, dst.getBit()); const auto scratch2 = AllocateScratchRegister({&dst, src.get(), &start_len, &shift, &scratch1}, dst.getBit()); if (dst.getIdx() == shift.getIdx()) { SaveRegisters(c, {scratch1, scratch2}); } else { SaveRegisters(c, {scratch1, scratch2, shift}); } c.mov(scratch1, *src); if (shift.getIdx() != start_len.getIdx()) { c.mov(shift, start_len); } c.shr(scratch1, shift.cvt8()); c.shr(shift, 8); c.mov(scratch2, 1); c.shl(scratch2, shift.cvt8()); c.dec(scratch2); c.mov(dst, scratch1); c.and_(dst, scratch2); if (dst.getIdx() == shift.getIdx()) { RestoreRegisters(c, {scratch1, scratch2}); } else { RestoreRegisters(c, {scratch1, scratch2, shift}); } } static void GenerateBLSI(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); SaveRegisters(c, {scratch}); c.mov(scratch, *src); c.neg(scratch); c.and_(scratch, *src); c.mov(dst, scratch); RestoreRegisters(c, {scratch}); } static void GenerateBLSMSK(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); SaveRegisters(c, {scratch}); c.mov(scratch, *src); c.dec(scratch); c.xor_(scratch, *src); c.mov(dst, scratch); RestoreRegisters(c, {scratch}); } static void GenerateBLSR(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto src = ZydisToXbyakOperand(operands[1]); const auto scratch = AllocateScratchRegister({&dst, src.get()}, dst.getBit()); SaveRegisters(c, {scratch}); c.mov(scratch, *src); c.dec(scratch); c.and_(scratch, *src); c.mov(dst, scratch); RestoreRegisters(c, {scratch}); } bool FilterRosetta2Only(const ZydisDecodedOperand*) { int ret = 0; size_t size = sizeof(ret); if (sysctlbyname("sysctl.proc_translated", &ret, &size, NULL, 0) != 0) { return false; } return ret; } #endif // __APPLE__ static bool FilterTcbAccess(const ZydisDecodedOperand* operands) { const auto& dst_op = operands[0]; const auto& src_op = operands[1]; // Patch only 'mov (64-bit register), fs:[0]' return src_op.type == ZYDIS_OPERAND_TYPE_MEMORY && src_op.mem.segment == ZYDIS_REGISTER_FS && src_op.mem.base == ZYDIS_REGISTER_NONE && src_op.mem.index == ZYDIS_REGISTER_NONE && src_op.mem.disp.value == 0 && dst_op.reg.value >= ZYDIS_REGISTER_RAX && dst_op.reg.value <= ZYDIS_REGISTER_R15; } static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) { const auto dst = ZydisToXbyakRegisterOperand(operands[0]); const auto slot = GetTcbKey(); #if defined(_WIN32) // The following logic is based on the wine implementation of TlsGetValue // https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719 static constexpr u32 TlsSlotsOffset = 0x1480; static constexpr u32 TlsExpansionSlotsOffset = 0x1780; static constexpr u32 TlsMinimumAvailable = 64; const u32 teb_offset = slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset; const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable; // Load the pointer to the table of TLS slots. c.putSeg(gs); c.mov(dst, ptr[reinterpret_cast(teb_offset)]); // Load the pointer to our buffer. c.mov(dst, qword[dst + tls_index * sizeof(LPVOID)]); #elif defined(__APPLE__) // The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by // pthread_getspecific https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96 c.putSeg(gs); c.mov(dst, qword[reinterpret_cast(slot * sizeof(void*))]); #else const auto src = ZydisToXbyakMemoryOperand(operands[1]); // Replace fs read with gs read. c.putSeg(gs); c.mov(dst, src); #endif } using PatchFilter = bool (*)(const ZydisDecodedOperand*); using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&); struct PatchInfo { /// Filter for more granular patch conditions past just the instruction mnemonic. PatchFilter filter; /// Generator for the patch/trampoline. InstructionGenerator generator; /// Whether to use a trampoline for this patch. bool trampoline; }; static const std::unordered_map Patches = { #if defined(_WIN32) || defined(__APPLE__) // Windows and Apple need a trampoline. {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}}, #else {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}}, #endif #ifdef __APPLE__ // BMI1 instructions that are not supported by Rosetta 2 on Apple Silicon. {ZYDIS_MNEMONIC_ANDN, {FilterRosetta2Only, GenerateANDN, true}}, {ZYDIS_MNEMONIC_BEXTR, {FilterRosetta2Only, GenerateBEXTR, true}}, {ZYDIS_MNEMONIC_BLSI, {FilterRosetta2Only, GenerateBLSI, true}}, {ZYDIS_MNEMONIC_BLSMSK, {FilterRosetta2Only, GenerateBLSMSK, true}}, {ZYDIS_MNEMONIC_BLSR, {FilterRosetta2Only, GenerateBLSR, true}}, #endif }; void PatchInstructions(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) { if (Patches.empty()) { // Nothing to patch on this platform. return; } ZydisDecoder instr_decoder; ZydisDecodedInstruction instruction; ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; ZydisDecoderInit(&instr_decoder, ZYDIS_MACHINE_MODE_LONG_64, ZYDIS_STACK_WIDTH_64); u8* code = reinterpret_cast(segment_addr); u8* end = code + segment_size; while (code < end) { ZyanStatus status = ZydisDecoderDecodeFull(&instr_decoder, code, end - code, &instruction, operands); if (!ZYAN_SUCCESS(status)) { code++; continue; } if (Patches.contains(instruction.mnemonic)) { auto patch_info = Patches.at(instruction.mnemonic); if (patch_info.filter(operands)) { auto patch_gen = Xbyak::CodeGenerator(instruction.length, code); if (patch_info.trampoline) { const auto trampoline_ptr = c.getCurr(); patch_info.generator(operands, c); // Return to the following instruction at the end of the trampoline. c.jmp(code + instruction.length); // Replace instruction with near jump to the trampoline. patch_gen.jmp(trampoline_ptr, Xbyak::CodeGenerator::LabelType::T_NEAR); } else { patch_info.generator(operands, patch_gen); } const auto patch_size = patch_gen.getCurr() - code; if (patch_size > 0) { ASSERT_MSG(instruction.length >= patch_size, "Instruction {} with length {} is too short to replace at: {}", ZydisMnemonicGetString(instruction.mnemonic), instruction.length, fmt::ptr(code)); // Fill remaining space with nops. patch_gen.nop(instruction.length - patch_size); LOG_DEBUG(Core, "Patched instruction '{}' at: {}", ZydisMnemonicGetString(instruction.mnemonic), fmt::ptr(code)); } } } code += instruction.length; } } } // namespace Core