core: Rewrite thread local storage implementation (#118)

This commit is contained in:
TheTurtle 2024-05-01 13:38:41 +03:00 committed by GitHub
parent b94efcba5a
commit 1b9bf924ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 176 additions and 189 deletions

View file

@ -3,6 +3,7 @@
#include <Zydis/Zydis.h>
#include <common/assert.h>
#include <xbyak/xbyak.h>
#include "common/config.h"
#include "common/logging/log.h"
#include "common/path_util.h"
@ -94,12 +95,17 @@ void Linker::LoadModuleToMemory(Module* m) {
0x1000; // align base size to 0x1000 block size (TODO is that the default
// block size or it can be changed?
m->base_virtual_addr = VirtualMemory::memory_alloc(LoadAddress, m->aligned_base_size,
VirtualMemory::MemoryMode::ExecuteReadWrite);
static constexpr u64 TrampolineSize = 8_MB;
m->base_virtual_addr =
VirtualMemory::memory_alloc(LoadAddress, m->aligned_base_size + TrampolineSize,
VirtualMemory::MemoryMode::ExecuteReadWrite);
LoadAddress += CODE_BASE_INCR * (1 + m->aligned_base_size / CODE_BASE_INCR);
LOG_INFO(Core_Linker, "====Load Module to Memory ========");
void* trampoline_addr = reinterpret_cast<void*>(m->base_virtual_addr + m->aligned_base_size);
Xbyak::CodeGenerator c(TrampolineSize, trampoline_addr);
LOG_INFO(Core_Linker, "======== Load Module to Memory ========");
LOG_INFO(Core_Linker, "base_virtual_addr ......: {:#018x}", m->base_virtual_addr);
LOG_INFO(Core_Linker, "base_size ..............: {:#018x}", base_size);
LOG_INFO(Core_Linker, "aligned_base_size ......: {:#018x}", m->aligned_base_size);
@ -123,7 +129,7 @@ void Linker::LoadModuleToMemory(Module* m) {
m->elf.LoadSegment(segment_addr, elf_pheader[i].p_offset, segment_file_size);
if (elf_pheader[i].p_flags & PF_EXEC) {
PatchTLS(segment_addr, segment_file_size);
PatchTLS(segment_addr, segment_file_size, c);
}
} else {
LOG_ERROR(Core_Linker, "p_memsz==0 in type {}",
@ -153,8 +159,8 @@ void Linker::LoadModuleToMemory(Module* m) {
case PT_TLS:
m->tls.image_virtual_addr = elf_pheader[i].p_vaddr + m->base_virtual_addr;
m->tls.image_size = GetAlignedSize(elf_pheader[i]);
LOG_INFO(Core_Linker, "tls virtual address ={:#x}", m->tls.image_virtual_addr);
LOG_INFO(Core_Linker, "tls image size ={}", m->tls.image_size);
LOG_INFO(Core_Linker, "TLS virtual address = {:#x}", m->tls.image_virtual_addr);
LOG_INFO(Core_Linker, "TLS image size = {}", m->tls.image_size);
break;
case PT_SCE_PROCPARAM:
m->proc_param_virtual_addr = elf_pheader[i].p_vaddr + m->base_virtual_addr;
@ -662,7 +668,7 @@ static void RunMainEntry(u64 addr, EntryParams* params, exit_func_t exit_func) {
// there's no coming back
:
: "r"(addr), "r"(params), "r"(exit_func)
: "rax", "rsi", "rdi", "rsp");
: "rax", "rsi", "rdi");
}
void Linker::Execute() {
@ -681,9 +687,13 @@ void Linker::Execute() {
p.argv[0] = "eboot.bin"; // hmm should be ok?
for (auto& m : m_modules) {
if (!m->elf.IsSharedLib()) {
RunMainEntry(m->elf.GetElfEntry() + m->base_virtual_addr, &p, ProgramExitFunc);
if (m->elf.IsSharedLib()) {
continue;
}
if (m->tls.image_virtual_addr != 0) {
SetTLSStorage(m->tls.image_virtual_addr);
}
RunMainEntry(m->elf.GetElfEntry() + m->base_virtual_addr, &p, ProgramExitFunc);
}
}

View file

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <xbyak/xbyak.h>
#include "common/assert.h"
#include "common/types.h"
#include "core/tls.h"
@ -11,25 +12,18 @@
namespace Core {
thread_local u8 TLS[1024];
struct TLSPattern {
uint8_t pattern[5];
uint8_t pattern_size;
uint8_t imm_size;
uint8_t target_reg;
u8 pattern[5];
u8 pattern_size;
u8 imm_size;
u8 target_reg;
};
constexpr static TLSPattern TlsPatterns[] = {
{{0x64, 0x48, 0xA1},
3,
8,
0}, // 64 48 A1 | 00 00 00 00 00 00 00 00 # mov rax, qword ptr fs:[64b imm]
{{0x64, 0x48, 0x8B, 0x4, 0x25},
5,
4,
0}, // 64 48 8B 04 25 | 00 00 00 00 # mov rax,qword ptr fs:[0]
// 64 48 A1 | 00 00 00 00 00 00 00 00 # mov rax, qword ptr fs:[64b imm]
{{0x64, 0x48, 0xA1}, 3, 8, 0},
// 64 48 8B 04 25 | 00 00 00 00 # mov rax,qword ptr fs:[0]
{{0x64, 0x48, 0x8B, 0x4, 0x25}, 5, 4, 0}, // rax
{{0x64, 0x48, 0x8B, 0xC, 0x25}, 5, 4, 1}, // rcx
{{0x64, 0x48, 0x8B, 0x14, 0x25}, 5, 4, 2}, // rdx
{{0x64, 0x48, 0x8B, 0x1C, 0x25}, 5, 4, 3}, // rbx
@ -47,103 +41,28 @@ constexpr static TLSPattern TlsPatterns[] = {
{{0x64, 0x4C, 0x8B, 0x3C, 0x25}, 5, 4, 15}, // r15
};
uintptr_t GetGuestTls(s64 tls_offset) {
if (tls_offset == 0) {
return reinterpret_cast<uintptr_t>(TLS);
}
UNREACHABLE_MSG("Unimplemented offset info tls");
#ifdef _WIN32
static DWORD slot = 0;
void SetTLSStorage(u64 image_address) {
// Guest apps will use both positive and negative offsets to the TLS pointer.
// User data at probably in negative offsets, while pthread data at positive offset.
const BOOL result = TlsSetValue(slot, reinterpret_cast<LPVOID>(image_address));
ASSERT(result != 0);
}
#ifdef _WIN64
static LONG WINAPI ExceptionHandler(PEXCEPTION_POINTERS pExp) noexcept {
auto orig_rip = pExp->ContextRecord->Rip;
while (*(u8*)pExp->ContextRecord->Rip == 0x66) {
pExp->ContextRecord->Rip++;
}
void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) {
using namespace Xbyak::util;
if (*(u8*)pExp->ContextRecord->Rip == 0xcd) {
int reg = *(u8*)(pExp->ContextRecord->Rip + 1) - 0x80;
int sizes = *(u8*)(pExp->ContextRecord->Rip + 2);
int pattern_size = sizes & 0xF;
int imm_size = sizes >> 4;
int64_t tls_offset;
if (imm_size == 4) {
tls_offset = *(s32*)(pExp->ContextRecord->Rip + pattern_size);
} else {
tls_offset = *(s64*)(pExp->ContextRecord->Rip + pattern_size);
}
(&pExp->ContextRecord->Rax)[reg] = GetGuestTls(tls_offset); /* GetGuestTls */
pExp->ContextRecord->Rip += pattern_size + imm_size;
return EXCEPTION_CONTINUE_EXECUTION;
}
pExp->ContextRecord->Rip = orig_rip;
const u32 ec = pExp->ExceptionRecord->ExceptionCode;
switch (ec) {
case EXCEPTION_ACCESS_VIOLATION: {
LOG_CRITICAL(Core, "Exception EXCEPTION_ACCESS_VIOLATION ({:#x})", ec);
const auto info = pExp->ExceptionRecord->ExceptionInformation;
switch (info[0]) {
case 0:
LOG_CRITICAL(Core, "Read violation at address {:#x}", info[1]);
break;
case 1:
LOG_CRITICAL(Core, "Write violation at address {:#x}", info[1]);
break;
case 8:
LOG_CRITICAL(Core, "DEP violation at address {:#x}", info[1]);
break;
default:
break;
}
break;
}
case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
LOG_CRITICAL(Core, "Exception EXCEPTION_ARRAY_BOUNDS_EXCEEDED ({:#x})", ec);
break;
case EXCEPTION_DATATYPE_MISALIGNMENT:
LOG_CRITICAL(Core, "Exception EXCEPTION_DATATYPE_MISALIGNMENT ({:#x})", ec);
break;
case EXCEPTION_FLT_DIVIDE_BY_ZERO:
LOG_CRITICAL(Core, "Exception EXCEPTION_FLT_DIVIDE_BY_ZERO ({:#x})", ec);
break;
case EXCEPTION_ILLEGAL_INSTRUCTION:
LOG_CRITICAL(Core, "Exception EXCEPTION_ILLEGAL_INSTRUCTION ({:#x})", ec);
break;
case EXCEPTION_IN_PAGE_ERROR:
LOG_CRITICAL(Core, "Exception EXCEPTION_IN_PAGE_ERROR ({:#x})", ec);
break;
case EXCEPTION_INT_DIVIDE_BY_ZERO:
LOG_CRITICAL(Core, "Exception EXCEPTION_INT_DIVIDE_BY_ZERO ({:#x})", ec);
break;
case EXCEPTION_PRIV_INSTRUCTION:
LOG_CRITICAL(Core, "Exception EXCEPTION_PRIV_INSTRUCTION ({:#x})", ec);
break;
case EXCEPTION_STACK_OVERFLOW:
LOG_CRITICAL(Core, "Exception EXCEPTION_STACK_OVERFLOW ({:#x})", ec);
break;
default:
return EXCEPTION_CONTINUE_SEARCH;
}
return EXCEPTION_CONTINUE_SEARCH;
}
#endif
void InstallTlsHandler() {
#ifdef _WIN64
if (!AddVectoredExceptionHandler(0, ExceptionHandler)) {
LOG_CRITICAL(Core, "Failed to register an exception handler");
}
#endif
}
void PatchTLS(u64 segment_addr, u64 segment_size) {
u8* code = reinterpret_cast<u8*>(segment_addr);
auto remaining_size = segment_size;
// Sometimes loads from the FS segment are prefixed with useless operand size prefix bytes like:
// |66 66 66| 64 48 8b 04 25 00 # mov rax, qword ptr fs:[0x0]
// These are probably ignored by the processor but when patching the instruction to a jump
// they cause issues. So look for them and patch them to nop to avoid problems.
static constexpr std::array<u8, 3> BadPrefix = {0x66, 0x66, 0x66};
while (remaining_size) {
for (const auto& tls_pattern : TlsPatterns) {
const auto total_size = tls_pattern.pattern_size + tls_pattern.imm_size;
@ -153,18 +72,55 @@ void PatchTLS(u64 segment_addr, u64 segment_size) {
if (std::memcmp(code, tls_pattern.pattern, tls_pattern.pattern_size) != 0) {
continue;
}
u64 offset = 0;
if (tls_pattern.imm_size == 4) {
std::memcpy(&offset, code + tls_pattern.pattern_size, sizeof(u32));
LOG_INFO(Core_Linker, "PATTERN32 FOUND at {}, reg: {} offset: {:#x}",
fmt::ptr(code), tls_pattern.target_reg,
*(u32*)(code + tls_pattern.pattern_size));
fmt::ptr(code), tls_pattern.target_reg, offset);
} else {
std::memcpy(&offset, code + tls_pattern.pattern_size, sizeof(u64));
LOG_INFO(Core_Linker, "PATTERN64 FOUND at {}, reg: {} offset: {:#x}",
fmt::ptr(code), tls_pattern.target_reg,
*(u32*)(code + tls_pattern.pattern_size));
fmt::ptr(code), tls_pattern.target_reg, offset);
}
code[0] = 0xcd;
code[1] = 0x80 + tls_pattern.target_reg;
code[2] = tls_pattern.pattern_size | (tls_pattern.imm_size << 4);
ASSERT(offset == 0);
// Allocate slot in the process if not done already.
if (slot == 0) {
slot = TlsAlloc();
}
// Replace bogus instruction prefix with nops if it exists.
if (std::memcmp(code - BadPrefix.size(), BadPrefix.data(), sizeof(BadPrefix)) == 0) {
auto patch = Xbyak::CodeGenerator(BadPrefix.size(), code - BadPrefix.size());
patch.nop(BadPrefix.size());
}
// Replace mov instruction with near jump to the trampoline.
static constexpr u32 NearJmpSize = 5;
auto patch = Xbyak::CodeGenerator(total_size, code);
patch.jmp(c.getCurr(), Xbyak::CodeGenerator::LabelType::T_NEAR);
patch.nop(total_size - NearJmpSize);
// Write the trampoline.
// The following logic is based on the wine implementation of TlsGetValue
// https://github.com/wine-mirror/wine/blob/a27b9551/dlls/kernelbase/thread.c#L719
static constexpr u32 TlsSlotsOffset = 0x1480;
static constexpr u32 TlsExpansionSlotsOffset = 0x1780;
static constexpr u32 TlsMinimumAvailable = 64;
const u32 teb_offset =
slot < TlsMinimumAvailable ? TlsSlotsOffset : TlsExpansionSlotsOffset;
const u32 tls_index = slot < TlsMinimumAvailable ? slot : slot - TlsMinimumAvailable;
const auto target_reg = Xbyak::Reg64(tls_pattern.target_reg);
c.mov(target_reg, teb_offset);
c.putSeg(gs);
c.mov(target_reg, ptr[target_reg]); // Load the pointer to the table of tls slots.
c.mov(
target_reg,
qword[target_reg + tls_index * sizeof(LPVOID)]); // Load the pointer to our buffer.
c.jmp(code + total_size); // Return to the instruction right after the mov.
// Move ahead in module.
code += total_size - 1;
remaining_size -= total_size - 1;
break;
@ -174,4 +130,16 @@ void PatchTLS(u64 segment_addr, u64 segment_size) {
}
}
#else
void SetTLSStorage(u64 image_address) {
UNREACHABLE_MSG("Thread local storage is unimplemented on posix platforms!");
}
void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c) {
UNREACHABLE_MSG("Thread local storage is unimplemented on posix platforms!");
}
#endif
} // namespace Core

View file

@ -5,12 +5,16 @@
#include "common/types.h"
namespace Xbyak {
class CodeGenerator;
}
namespace Core {
/// Installs a host exception handler to handle guest TLS access.
void InstallTlsHandler();
/// Sets the data pointer that contains the TLS image.
void SetTLSStorage(u64 image_address);
/// Patches any instructions that access TLS to trigger the exception handler.
void PatchTLS(u64 segment_addr, u64 segment_size);
/// Patches any instructions that access guest TLS to use provided storage.
void PatchTLS(u64 segment_addr, u64 segment_size, Xbyak::CodeGenerator& c);
} // namespace Core