diff --git a/.gitignore b/.gitignore
index 13749d1..014e033 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,8 +6,8 @@
 *.elf
 *.z64
 
-# Output C files
-test/funcs
+# Local working data
+tests
 
 # Linux build output
 build/
@@ -42,12 +42,6 @@ bld/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 
-# Libraries (binaries that aren't in the repo)
-test/Lib
-
-# RT64 (since it's not public yet)
-test/RT64
-
 # Runtime files
 imgui.ini
 rt64.log
diff --git a/.gitmodules b/.gitmodules
index 2d7b930..1369f13 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "lib/tomlplusplus"]
 	path = lib/tomlplusplus
 	url = https://github.com/marzer/tomlplusplus
+[submodule "lib/sljit"]
+	path = lib/sljit
+	url = https://github.com/zherczeg/sljit
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2733666..7fc7581 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,3 +164,32 @@ target_sources(OfflineModRecomp PRIVATE
 )
 
 target_link_libraries(OfflineModRecomp fmt rabbitizer tomlplusplus::tomlplusplus N64Recomp)
+
+# Live recompiler
+project(LiveRecomp)
+add_library(LiveRecomp)
+
+target_sources(LiveRecomp PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/LiveRecomp/live_generator.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src/sljitLir.c
+)
+
+target_include_directories(LiveRecomp PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src
+)
+
+target_link_libraries(LiveRecomp N64Recomp)
+
+# Live recompiler test
+project(LiveRecompTest)
+add_executable(LiveRecompTest)
+
+target_sources(LiveRecompTest PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/LiveRecomp/live_recompiler_test.cpp
+)
+
+target_include_directories(LiveRecompTest PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src
+)
+
+target_link_libraries(LiveRecompTest LiveRecomp)
diff --git a/LiveRecomp/live_generator.cpp b/LiveRecomp/live_generator.cpp
new file mode 100644
index 0000000..48c5dc6
--- /dev/null
+++ b/LiveRecomp/live_generator.cpp
@@ -0,0 +1,1865 @@
+#include <cassert>
+#include <cstring>
+#include <cmath>
+#include <vector>
+
+#include "fmt/format.h"
+#include "fmt/ostream.h"
+
+#include "recompiler/live_recompiler.h"
+#include "recomp.h"
+
+#include "sljitLir.h"
+
+static_assert(sizeof(void*) >= sizeof(sljit_uw), "`void*` must be able to hold a `sljit_uw` value for rewritable jumps!");
+
+constexpr uint64_t rdram_offset = 0xFFFFFFFF80000000ULL;
+
+void N64Recomp::live_recompiler_init() {
+    RabbitizerConfig_Cfg.pseudos.pseudoMove = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBeqz = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBnez = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoNot = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBal = false;
+}
+
+namespace Registers {
+    constexpr int rdram = SLJIT_S0; // stores (rdram - rdram_offset)
+    constexpr int ctx = SLJIT_S1; // stores ctx
+    constexpr int c1cs = SLJIT_S2; // stores the cop1 condition signal
+    constexpr int hi = SLJIT_S3; // stores the hi register
+    constexpr int lo = SLJIT_S4; // stores the lo register
+    constexpr int arithmetic_temp1 = SLJIT_R0;
+    constexpr int arithmetic_temp2 = SLJIT_R1;
+    constexpr int arithmetic_temp3 = SLJIT_R2;
+    constexpr int arithmetic_temp4 = SLJIT_R3;
+}
+
+struct InnerCall {
+    size_t target_func_index;
+    sljit_jump* jump;
+};
+
+struct ReferenceSymbolCall {
+    N64Recomp::SymbolReference reference;
+    sljit_jump* jump;
+};
+
+struct SwitchErrorJump {
+    uint32_t instr_vram;
+    uint32_t jtbl_vram;
+    sljit_jump* jump;
+};
+
+struct N64Recomp::LiveGeneratorContext {
+    std::string function_name;
+    std::unordered_map<std::string, sljit_label*> labels;
+    std::unordered_map<std::string, std::vector<sljit_jump*>> pending_jumps;
+    std::vector<sljit_label*> func_labels;
+    std::vector<InnerCall> inner_calls;
+    std::vector<std::vector<std::string>> switch_jump_labels;
+    // See LiveGeneratorOutput::jump_tables for info. Contains sljit labels so they can be linked after recompilation.
+    std::vector<std::pair<std::vector<sljit_label*>, std::unique_ptr<void*[]>>> unlinked_jump_tables;
+    // Jump tables for the current function being recompiled.
+    std::vector<std::unique_ptr<void*[]>> pending_jump_tables;
+    // See LiveGeneratorOutput::reference_symbol_jumps for info.
+    std::vector<std::pair<ReferenceJumpDetails, sljit_jump*>> reference_symbol_jumps;
+    // See LiveGeneratorOutput::import_jumps_by_index for info.
+    std::unordered_multimap<size_t, sljit_jump*> import_jumps_by_index;
+    std::vector<SwitchErrorJump> switch_error_jumps;
+    sljit_jump* cur_branch_jump;
+};
+
+N64Recomp::LiveGenerator::LiveGenerator(size_t num_funcs, const LiveGeneratorInputs& inputs) : inputs(inputs) {
+    compiler = sljit_create_compiler(nullptr);
+    context = std::make_unique<LiveGeneratorContext>();
+    context->func_labels.resize(num_funcs);
+    errored = false;
+}
+
+N64Recomp::LiveGenerator::~LiveGenerator() {
+    if (compiler != nullptr) {
+        sljit_free_compiler(compiler);
+        compiler = nullptr;
+    }
+}
+
+N64Recomp::LiveGeneratorOutput N64Recomp::LiveGenerator::finish() {
+    LiveGeneratorOutput ret{};
+    if (errored) {
+        ret.good = false;
+        return ret;
+    }
+
+    ret.good = true;
+
+    // Populate all the pending inner function calls.
+    for (const InnerCall& call : context->inner_calls) {
+        sljit_label* target_func_label = context->func_labels[call.target_func_index];
+
+        // Generation isn't valid if the target function wasn't recompiled.
+        if (target_func_label == nullptr) {
+            return { };
+        }
+
+        sljit_set_label(call.jump, target_func_label);
+    }
+
+    // Generate the switch error jump targets and assign the jump labels.
+    if (!context->switch_error_jumps.empty()) {
+        // Allocate the function name and place it in the literals.
+        char* func_name = new char[context->function_name.size() + 1];
+        memcpy(func_name, context->function_name.c_str(), context->function_name.size());
+        func_name[context->function_name.size()] = '\x00';
+        ret.string_literals.emplace_back(func_name);
+
+        std::vector<sljit_jump*> switch_error_return_jumps{};
+        switch_error_return_jumps.resize(context->switch_error_jumps.size());
+
+        // Generate and assign the labels for the switch error jumps.
+        for (size_t i = 0; i < context->switch_error_jumps.size(); i++) {
+            const auto& cur_error_jump = context->switch_error_jumps[i];
+
+            // Generate a label and assign it to the jump.
+            sljit_set_label(cur_error_jump.jump, sljit_emit_label(compiler));
+
+            // Load the arguments (function name, vram, jump table address)
+            sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sljit_sw(func_name));
+            sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, sljit_sw(cur_error_jump.instr_vram));
+            sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, sljit_sw(cur_error_jump.jtbl_vram));
+
+            // Call switch_error.
+            sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, 32, 32), SLJIT_IMM, sljit_sw(inputs.switch_error));
+
+            // Jump to the return statement.
+            switch_error_return_jumps[i] = sljit_emit_jump(compiler, SLJIT_JUMP);
+        }
+
+        // Generate the return statement.
+        sljit_label* return_label = sljit_emit_label(compiler);
+        sljit_emit_return_void(compiler);
+
+        // Assign the label for all the return jumps.
+        for (sljit_jump* cur_jump : switch_error_return_jumps) {
+            sljit_set_label(cur_jump, return_label);
+        }
+    }
+    context->switch_error_jumps.clear();
+
+    // Generate the code.
+    ret.code = sljit_generate_code(compiler, 0, NULL);
+    ret.code_size = sljit_get_generated_code_size(compiler);
+    ret.functions.resize(context->func_labels.size());
+
+    // Get the function addresses.
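+    // (Label addresses only become valid once sljit_generate_code has run, which is why they're read here instead of at emit time.)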
+    for (size_t func_index = 0; func_index < ret.functions.size(); func_index++) {
+        sljit_label* func_label = context->func_labels[func_index];
+
+        // If the function wasn't recompiled, don't populate its address.
+        if (func_label != nullptr) {
+            ret.functions[func_index] = reinterpret_cast<recomp_func_t*>(sljit_get_label_addr(func_label));
+        }
+    }
+    context->func_labels.clear();
+
+    // Get the reference symbol jump instruction addresses.
+    ret.reference_symbol_jumps.resize(context->reference_symbol_jumps.size());
+    for (size_t jump_index = 0; jump_index < context->reference_symbol_jumps.size(); jump_index++) {
+        ReferenceJumpDetails& details = context->reference_symbol_jumps[jump_index].first;
+        sljit_jump* jump = context->reference_symbol_jumps[jump_index].second;
+
+        ret.reference_symbol_jumps[jump_index].first = details;
+        ret.reference_symbol_jumps[jump_index].second = reinterpret_cast<void*>(jump->addr);
+    }
+    context->reference_symbol_jumps.clear();
+
+    // Get the import jump instruction addresses.
+    ret.import_jumps_by_index.reserve(context->import_jumps_by_index.size());
+    for (auto& [jump_index, jump] : context->import_jumps_by_index) {
+        ret.import_jumps_by_index.emplace(jump_index, reinterpret_cast<void*>(jump->addr));
+    }
+    context->import_jumps_by_index.clear();
+
+    // Populate label addresses for the jump tables and place them in the output.
+    for (auto& [labels, jump_table] : context->unlinked_jump_tables) {
+        for (size_t entry_index = 0; entry_index < labels.size(); entry_index++) {
+            sljit_label* cur_label = labels[entry_index];
+            jump_table[entry_index] = reinterpret_cast<void*>(sljit_get_label_addr(cur_label));
+        }
+        ret.jump_tables.emplace_back(std::move(jump_table));
+    }
+    context->unlinked_jump_tables.clear();
+
+    ret.executable_offset = sljit_get_executable_offset(compiler);
+
+    sljit_free_compiler(compiler);
+    compiler = nullptr;
+    errored = false;
+
+    return ret;
+}
+
+N64Recomp::LiveGeneratorOutput::~LiveGeneratorOutput() {
+    if (code != nullptr) {
+        sljit_free_code(code, nullptr);
+        code = nullptr;
+    }
+}
+
+size_t N64Recomp::LiveGeneratorOutput::num_reference_symbol_jumps() const {
+    return reference_symbol_jumps.size();
+}
+
+void N64Recomp::LiveGeneratorOutput::set_reference_symbol_jump(size_t jump_index, recomp_func_t* func) {
+    const auto& jump_entry = reference_symbol_jumps[jump_index];
+    sljit_set_jump_addr(reinterpret_cast<sljit_uw>(jump_entry.second), reinterpret_cast<sljit_uw>(func), executable_offset);
+}
+
+N64Recomp::ReferenceJumpDetails N64Recomp::LiveGeneratorOutput::get_reference_symbol_jump_details(size_t jump_index) {
+    return reference_symbol_jumps[jump_index].first;
+}
+
+void N64Recomp::LiveGeneratorOutput::populate_import_symbol_jumps(size_t import_index, recomp_func_t* func) {
+    auto find_range = import_jumps_by_index.equal_range(import_index);
+    for (auto it = find_range.first; it != find_range.second; ++it) {
+        sljit_set_jump_addr(reinterpret_cast<sljit_uw>(it->second), reinterpret_cast<sljit_uw>(func), executable_offset);
+    }
+}
+
+constexpr int get_gpr_context_offset(int gpr_index) {
+    return offsetof(recomp_context, r0) + sizeof(recomp_context::r0) * gpr_index;
+}
+
+constexpr int get_fpr_single_context_offset(int fpr_index) {
+    return offsetof(recomp_context, f0.fl) + sizeof(recomp_context::f0) * fpr_index;
+}
+
+constexpr int get_fpr_double_context_offset(int fpr_index) {
+    return offsetof(recomp_context, f0.d) + sizeof(recomp_context::f0) * fpr_index;
+}
+
+constexpr int get_fpr_u32l_context_offset(int fpr_index) {
+    if (fpr_index & 1) {
+        // TODO implement odd floats.
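+        // (When FR is 0, the odd single-precision FPRs alias the upper halves of the even doubles, which is why the
+        // commented-out reference below points at a separate f_odd array rather than a normal context slot.)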
+        assert(false);
+        return -1;
+        // return fmt::format("ctx->f_odd[({} - 1) * 2]", fpr_index);
+    }
+    else {
+        return offsetof(recomp_context, f0.u32l) + sizeof(recomp_context::f0) * fpr_index;
+    }
+}
+
+constexpr int get_fpr_u64_context_offset(int fpr_index) {
+    return offsetof(recomp_context, f0.u64) + sizeof(recomp_context::f0) * fpr_index;
+}
+
+void get_gpr_values(int gpr, sljit_sw& out, sljit_sw& outw) {
+    if (gpr == 0) {
+        out = SLJIT_IMM;
+        outw = 0;
+    }
+    else {
+        out = SLJIT_MEM1(Registers::ctx);
+        outw = get_gpr_context_offset(gpr);
+    }
+}
+
+bool get_operand_values(N64Recomp::Operand operand, const N64Recomp::InstructionContext& context, sljit_sw& out, sljit_sw& outw) {
+    using namespace N64Recomp;
+
+    switch (operand) {
+        case Operand::Rd:
+            get_gpr_values(context.rd, out, outw);
+            break;
+        case Operand::Rs:
+            get_gpr_values(context.rs, out, outw);
+            break;
+        case Operand::Rt:
+            get_gpr_values(context.rt, out, outw);
+            break;
+        case Operand::Fd:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_single_context_offset(context.fd);
+            break;
+        case Operand::Fs:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_single_context_offset(context.fs);
+            break;
+        case Operand::Ft:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_single_context_offset(context.ft);
+            break;
+        case Operand::FdDouble:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_double_context_offset(context.fd);
+            break;
+        case Operand::FsDouble:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_double_context_offset(context.fs);
+            break;
+        case Operand::FtDouble:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_double_context_offset(context.ft);
+            break;
+        case Operand::FdU32L:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u32l_context_offset(context.fd);
+            break;
+        case Operand::FsU32L:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u32l_context_offset(context.fs);
+            break;
+        case Operand::FtU32L:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u32l_context_offset(context.ft);
+            break;
+        case Operand::FdU32H:
+            assert(false);
+            return false;
+        case Operand::FsU32H:
+            assert(false);
+            return false;
+        case Operand::FtU32H:
+            assert(false);
+            return false;
+        case Operand::FdU64:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u64_context_offset(context.fd);
+            break;
+        case Operand::FsU64:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u64_context_offset(context.fs);
+            break;
+        case Operand::FtU64:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u64_context_offset(context.ft);
+            break;
+        case Operand::ImmU16:
+            out = SLJIT_IMM;
+            outw = (sljit_sw)(uint16_t)context.imm16;
+            break;
+        case Operand::ImmS16:
+            out = SLJIT_IMM;
+            outw = (sljit_sw)(int16_t)context.imm16;
+            break;
+        case Operand::Sa:
+            out = SLJIT_IMM;
+            outw = context.sa;
+            break;
+        case Operand::Sa32:
+            out = SLJIT_IMM;
+            outw = context.sa + 32;
+            break;
+        case Operand::Cop1cs:
+            out = Registers::c1cs;
+            outw = 0;
+            break;
+        case Operand::Hi:
+            out = Registers::hi;
+            outw = 0;
+            break;
+        case Operand::Lo:
+            out = Registers::lo;
+            outw = 0;
+            break;
+        case Operand::Zero:
+            out = SLJIT_IMM;
+            outw = 0;
+            break;
+    }
+    return true;
+}
+
+bool outputs_to_zero(N64Recomp::Operand output, const N64Recomp::InstructionContext& ctx) {
+    if (output == N64Recomp::Operand::Rd && ctx.rd == 0) {
+        return true;
+    }
+    if (output == N64Recomp::Operand::Rt && ctx.rt == 0) {
+        return true;
+    }
+    if (output == N64Recomp::Operand::Rs && ctx.rs == 0) {
+        return true;
+    }
+    return false;
+}
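+// Note: get_gpr_values turns reads of $zero into an SLJIT_IMM 0 and outputs_to_zero lets callers skip writes to it,
+// matching the hardwired zero register on MIPS.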
+void N64Recomp::LiveGenerator::process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const {
+    // Skip instructions that output to $zero
+    if (outputs_to_zero(op.output, ctx)) {
+        return;
+    }
+
+    sljit_sw dst;
+    sljit_sw dstw;
+    sljit_sw src1;
+    sljit_sw src1w;
+    sljit_sw src2;
+    sljit_sw src2w;
+    bool output_good = get_operand_values(op.output, ctx, dst, dstw);
+    bool input0_good = get_operand_values(op.operands.operands[0], ctx, src1, src1w);
+    bool input1_good = get_operand_values(op.operands.operands[1], ctx, src2, src2w);
+
+    if (!output_good || !input0_good || !input1_good) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // If a relocation is present, perform the relocation and change src2/src2w to use the relocated value.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE) {
+        // Only allow LO16 relocations.
+        if (ctx.reloc_type != RelocType::R_MIPS_LO16) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Only allow relocations on immediates.
+        if (src2 != SLJIT_IMM) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Only allow relocations on loads and adds.
+        switch (op.type) {
+            case BinaryOpType::LD:
+            case BinaryOpType::LW:
+            case BinaryOpType::LWU:
+            case BinaryOpType::LH:
+            case BinaryOpType::LHU:
+            case BinaryOpType::LB:
+            case BinaryOpType::LBU:
+            case BinaryOpType::LDL:
+            case BinaryOpType::LDR:
+            case BinaryOpType::LWL:
+            case BinaryOpType::LWR:
+            case BinaryOpType::Add64:
+            case BinaryOpType::Add32:
+                break;
+            default:
+                // Relocations aren't allowed on this instruction.
+                assert(false);
+                errored = true;
+                return;
+        }
+        // Load the relocated address into temp1.
+        load_relocated_address(ctx, Registers::arithmetic_temp1);
+        // Extract the LO16 value from the full address (sign extended lower 16 bits).
+        sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        // Replace the immediate input (src2) with the LO16 value.
+        src2 = Registers::arithmetic_temp1;
+        src2w = 0;
+    }
+
+    // TODO validate that the unary ops are valid for the current binary op.
+    if (op.operands.operand_operations[0] != UnaryOpType::None &&
+        op.operands.operand_operations[0] != UnaryOpType::ToU64 &&
+        op.operands.operand_operations[0] != UnaryOpType::ToS64 &&
+        op.operands.operand_operations[0] != UnaryOpType::ToU32)
+    {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    if (op.operands.operand_operations[1] != UnaryOpType::None &&
+        op.operands.operand_operations[1] != UnaryOpType::ToU64 &&
+        op.operands.operand_operations[1] != UnaryOpType::ToS64 &&
+        op.operands.operand_operations[1] != UnaryOpType::Mask5 && // Only for 32-bit shifts
+        op.operands.operand_operations[1] != UnaryOpType::Mask6) // Only for 64-bit shifts
+    {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    bool cmp_unsigned = op.operands.operand_operations[0] != UnaryOpType::ToS64;
+
+    auto sign_extend_and_store = [dst, dstw, this]() {
+        // Sign extend the result.
+        sljit_emit_op1(this->compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        // Store the result back into the context.
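+        // (32-bit results are kept sign-extended in the 64-bit context slots, matching MIPS's 64-bit GPR behavior.)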
+        sljit_emit_op1(this->compiler, SLJIT_MOV_P, dst, dstw, Registers::arithmetic_temp1, 0);
+    };
+
+    auto do_op32 = [src1, src1w, src2, src2w, this, &sign_extend_and_store](sljit_s32 op) {
+        sljit_emit_op2(this->compiler, op, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+        sign_extend_and_store();
+    };
+
+    auto do_op64 = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op) {
+        sljit_emit_op2(this->compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    };
+
+    auto do_float_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op) {
+        sljit_emit_fop2(this->compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    };
+
+    auto do_load_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op, int address_xor) {
+        // TODO 0 immediate optimization.
+
+        // Add the base and immediate into the arithmetic temp.
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+
+        if (address_xor != 0) {
+            // xor the address with the specified amount
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, address_xor);
+        }
+
+        // Load the value at rdram + address into the arithmetic temp with the given operation to allow for sign-extension or zero-extension.
+        sljit_emit_op1(compiler, op, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0);
+
+        // Move the arithmetic temp into the destination.
+        sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, Registers::arithmetic_temp1, 0);
+    };
+
+    auto do_compare_op = [cmp_unsigned, dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op_unsigned, sljit_s32 op_signed) {
+        // Pick the operation based on the signedness of the comparison.
+        sljit_s32 op = cmp_unsigned ? op_unsigned : op_signed;
+
+        // Pick the flags to set based on the operation.
+        sljit_s32 flags;
+        if (op <= SLJIT_NOT_ZERO) {
+            flags = SLJIT_SET_Z;
+        }
+        else {
+            flags = SLJIT_SET(op);
+        }
+
+        // Perform a subtraction with the determined flag.
+        sljit_emit_op2u(compiler, SLJIT_SUB | flags, src1, src1w, src2, src2w);
+
+        // Move the operation's flag into the destination.
+        sljit_emit_op_flags(compiler, SLJIT_MOV, dst, dstw, op);
+    };
+
+    auto do_float_compare_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 flag_op, sljit_s32 set_op, bool double_precision) {
+        // Pick the comparison operation based on the precision.
+        sljit_s32 compare_op = set_op | (double_precision ? SLJIT_CMP_F64 : SLJIT_CMP_F32);
+
+        // Perform the comparison with the determined operation.
+        // Float comparisons use fop1 and put the left hand side in dst.
+        sljit_emit_fop1(compiler, compare_op, src1, src1w, src2, src2w);
+
+        // Move the operation's flag into the destination.
+        sljit_emit_op_flags(compiler, SLJIT_MOV, dst, dstw, flag_op);
+    };
+
+    auto do_unaligned_load_op = [dst, dstw, src1, src1w, src2, src2w, this](bool left, bool doubleword) {
+        // TODO 0 immediate optimization.
+
+        // Determine the shift direction to use for calculating the mask and shifting the loaded value.
+        sljit_sw shift_op = left ? SLJIT_SHL : SLJIT_LSHR;
+        // Determine the operation's word size.
+        sljit_sw word_size = doubleword ? 8 : 4;
+
+        // Add the base and immediate into the temp1.
+        // addr = base + offset
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+
+        // Mask the address with the alignment mask to get the misalignment and put it in temp2.
+        // misalignment = addr & (word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, word_size - 1);
+
+        // Mask the address with ~alignment_mask to get the aligned address and put it in temp1.
+        // addr = addr & ~(word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, ~(word_size - 1));
+
+        // Load the word at rdram + aligned address into temp1 with sign-extension.
+        // loaded_value = *addr
+        if (doubleword) {
+            // Rotate the loaded doubleword by 32 bits to swap the two words into the right order.
+            sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32);
+        }
+        else {
+            // Use MOV_S32 to sign-extend the loaded word.
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0);
+        }
+
+        // Invert the misalignment if this is a right load.
+        if (!left) {
+            // misalignment = word_size - 1 - misalignment
+            sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp2, 0, SLJIT_IMM, word_size - 1, Registers::arithmetic_temp2, 0);
+        }
+
+        // Calculate the misalignment shift and put it into temp2.
+        // misalignment_shift = misalignment * 8
+        sljit_emit_op2(compiler, SLJIT_SHL, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, 3);
+
+        // Calculate the misalignment mask and put it into temp3. Use a 32-bit shift if this is a 32-bit operation.
+        // misalignment_mask = word(-1) SHIFT misalignment_shift
+        sljit_emit_op2(compiler, doubleword ? shift_op : (shift_op | SLJIT_32),
+            Registers::arithmetic_temp3, 0,
+            SLJIT_IMM, doubleword ? uint64_t(-1) : uint32_t(-1),
+            Registers::arithmetic_temp2, 0);
+
+        if (!doubleword) {
+            // Sign extend the misalignment mask.
+            // misalignment_mask = (uint64_t)(int32_t)misalignment_mask
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0);
+        }
+
+        // Shift the loaded value by the misalignment shift and put it into temp1.
+        // loaded_value SHIFT misalignment_shift
+        sljit_emit_op2(compiler, shift_op, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0);
+
+        if (left && !doubleword) {
+            // Sign extend the loaded value.
+            // loaded_value = (uint64_t)(int32_t)loaded_value
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        }
+
+        // Mask the shifted loaded value by the misalignment mask.
+        // loaded_value &= misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp3, 0);
+
+        // Invert the misalignment mask and store it into temp3.
+        // misalignment_mask = ~misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, SLJIT_IMM, sljit_sw(-1));
+
+        // Mask the initial value (stored in the destination) with the misalignment mask and place it into temp3.
+        // masked_value = initial_value & misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp3, 0, dst, dstw, Registers::arithmetic_temp3, 0);
+
+        // Combine the masked initial value with the shifted loaded value and store it in the destination.
+        // out = masked_value | loaded_value
+        sljit_emit_op2(compiler, SLJIT_OR, dst, dstw, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp1, 0);
+    };
+
+    switch (op.type) {
+        // Addition/subtraction
+        case BinaryOpType::Add32:
+            do_op32(SLJIT_ADD32);
+            break;
+        case BinaryOpType::Sub32:
+            do_op32(SLJIT_SUB32);
+            break;
+        case BinaryOpType::Add64:
+            do_op64(SLJIT_ADD);
+            break;
+        case BinaryOpType::Sub64:
+            do_op64(SLJIT_SUB);
+            break;
+
+        // Float arithmetic
+        case BinaryOpType::AddFloat:
+            do_float_op(SLJIT_ADD_F32);
+            break;
+        case BinaryOpType::AddDouble:
+            do_float_op(SLJIT_ADD_F64);
+            break;
+        case BinaryOpType::SubFloat:
+            do_float_op(SLJIT_SUB_F32);
+            break;
+        case BinaryOpType::SubDouble:
+            do_float_op(SLJIT_SUB_F64);
+            break;
+        case BinaryOpType::MulFloat:
+            do_float_op(SLJIT_MUL_F32);
+            break;
+        case BinaryOpType::MulDouble:
+            do_float_op(SLJIT_MUL_F64);
+            break;
+        case BinaryOpType::DivFloat:
+            do_float_op(SLJIT_DIV_F32);
+            break;
+        case BinaryOpType::DivDouble:
+            do_float_op(SLJIT_DIV_F64);
+            break;
+
+        // Bitwise
+        case BinaryOpType::And64:
+            do_op64(SLJIT_AND);
+            break;
+        case BinaryOpType::Or64:
+            do_op64(SLJIT_OR);
+            break;
+        case BinaryOpType::Nor64:
+            // Bitwise or the two registers and move the result into the temp, then invert the result and move it into the destination.
+            sljit_emit_op2(this->compiler, SLJIT_OR, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+            sljit_emit_op2(this->compiler, SLJIT_XOR, dst, dstw, Registers::arithmetic_temp1, 0, SLJIT_IMM, sljit_sw(-1));
+            break;
+        case BinaryOpType::Xor64:
+            do_op64(SLJIT_XOR);
+            break;
+        case BinaryOpType::Sll32:
+            // TODO only mask if the second input's op is Mask5.
+            do_op32(SLJIT_MSHL32);
+            break;
+        case BinaryOpType::Sll64:
+            // TODO only mask if the second input's op is Mask6.
+            do_op64(SLJIT_MSHL);
+            break;
+        case BinaryOpType::Srl32:
+            // TODO only mask if the second input's op is Mask5.
+            do_op32(SLJIT_MLSHR32);
+            break;
+        case BinaryOpType::Srl64:
+            // TODO only mask if the second input's op is Mask6.
+            do_op64(SLJIT_MLSHR);
+            break;
+        case BinaryOpType::Sra32:
+            // Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half.
+            // This means we have to use a 64-bit shift and manually mask the input before shifting.
+            // TODO only mask if the second input's op is Mask5.
+            sljit_emit_op2(this->compiler, SLJIT_AND32, Registers::arithmetic_temp1, 0, src2, src2w, SLJIT_IMM, 0b11111);
+            sljit_emit_op2(this->compiler, SLJIT_MASHR, Registers::arithmetic_temp1, 0, src1, src1w, Registers::arithmetic_temp1, 0);
+            sign_extend_and_store();
+            break;
+        case BinaryOpType::Sra64:
+            // TODO only mask if the second input's op is Mask6.
+            do_op64(SLJIT_MASHR);
+            break;
+
+        // Comparisons
+        case BinaryOpType::Equal:
+            do_compare_op(SLJIT_EQUAL, SLJIT_EQUAL);
+            break;
+        case BinaryOpType::NotEqual:
+            do_compare_op(SLJIT_NOT_EQUAL, SLJIT_NOT_EQUAL);
+            break;
+        case BinaryOpType::Less:
+            do_compare_op(SLJIT_LESS, SLJIT_SIG_LESS);
+            break;
+        case BinaryOpType::LessEq:
+            do_compare_op(SLJIT_LESS_EQUAL, SLJIT_SIG_LESS_EQUAL);
+            break;
+        case BinaryOpType::Greater:
+            do_compare_op(SLJIT_GREATER, SLJIT_SIG_GREATER);
+            break;
+        case BinaryOpType::GreaterEq:
+            do_compare_op(SLJIT_GREATER_EQUAL, SLJIT_SIG_GREATER_EQUAL);
+            break;
+        case BinaryOpType::EqualFloat:
+            do_float_compare_op(SLJIT_F_EQUAL, SLJIT_SET_F_EQUAL, false);
+            break;
+        case BinaryOpType::LessFloat:
+            do_float_compare_op(SLJIT_F_LESS, SLJIT_SET_F_LESS, false);
+            break;
+        case BinaryOpType::LessEqFloat:
+            do_float_compare_op(SLJIT_F_LESS_EQUAL, SLJIT_SET_F_LESS_EQUAL, false);
+            break;
+        case BinaryOpType::EqualDouble:
+            do_float_compare_op(SLJIT_F_EQUAL, SLJIT_SET_F_EQUAL, true);
+            break;
+        case BinaryOpType::LessDouble:
+            do_float_compare_op(SLJIT_F_LESS, SLJIT_SET_F_LESS, true);
+            break;
+        case BinaryOpType::LessEqDouble:
+            do_float_compare_op(SLJIT_F_LESS_EQUAL, SLJIT_SET_F_LESS_EQUAL, true);
+            break;
+
+        // Loads
+        case BinaryOpType::LD:
+            // Add the base and immediate into the arithmetic temp.
+            sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+
+            // Load the value at rdram + address into the arithmetic temp and rotate it by 32 bits to swap the two words into the right order.
+            sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32);
+
+            // Move the arithmetic temp into the destination.
+            sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, Registers::arithmetic_temp1, 0);
+            break;
+        case BinaryOpType::LW:
+            do_load_op(SLJIT_MOV_S32, 0);
+            break;
+        case BinaryOpType::LWU:
+            do_load_op(SLJIT_MOV_U32, 0);
+            break;
+        case BinaryOpType::LH:
+            do_load_op(SLJIT_MOV_S16, 2);
+            break;
+        case BinaryOpType::LHU:
+            do_load_op(SLJIT_MOV_U16, 2);
+            break;
+        case BinaryOpType::LB:
+            do_load_op(SLJIT_MOV_S8, 3);
+            break;
+        case BinaryOpType::LBU:
+            do_load_op(SLJIT_MOV_U8, 3);
+            break;
+        case BinaryOpType::LDL:
+            do_unaligned_load_op(true, true);
+            break;
+        case BinaryOpType::LDR:
+            do_unaligned_load_op(false, true);
+            break;
+        case BinaryOpType::LWL:
+            do_unaligned_load_op(true, false);
+            break;
+        case BinaryOpType::LWR:
+            do_unaligned_load_op(false, false);
+            break;
+        default:
+            assert(false);
+            errored = true;
+            return;
+    }
+}
+
+int32_t do_round_w_s(float num) {
+    return lroundf(num);
+}
+
+int32_t do_round_w_d(double num) {
+    return lround(num);
+}
+
+int64_t do_round_l_s(float num) {
+    return llroundf(num);
+}
+
+int64_t do_round_l_d(double num) {
+    return llround(num);
+}
+
+int32_t do_ceil_w_s(float num) {
+    return (int32_t)ceilf(num);
+}
+
+int32_t do_ceil_w_d(double num) {
+    return (int32_t)ceil(num);
+}
+
+int64_t do_ceil_l_s(float num) {
+    return (int64_t)ceilf(num);
+}
+
+int64_t do_ceil_l_d(double num) {
+    return (int64_t)ceil(num);
+}
+
+int32_t do_floor_w_s(float num) {
+    return (int32_t)floorf(num);
+}
+
+int32_t do_floor_w_d(double num) {
+    return (int32_t)floor(num);
+}
+
+int64_t do_floor_l_s(float num) {
+    return (int64_t)floorf(num);
+}
+
+int64_t do_floor_l_d(double num) {
+    return (int64_t)floor(num);
+}
+
+void N64Recomp::LiveGenerator::load_relocated_address(const InstructionContext& ctx, int reg) const {
+    // Get the pointer to the section address.
+    int32_t* section_addr_ptr = (ctx.reloc_tag_as_reference ? inputs.reference_section_addresses : inputs.local_section_addresses) + ctx.reloc_section_index;
+
+    // Load the section's address into the target register.
+    sljit_emit_op1(compiler, SLJIT_MOV_S32, reg, 0, SLJIT_MEM0(), sljit_sw(section_addr_ptr));
+
+    // Don't emit the add if the offset is zero (small optimization).
+    if (ctx.reloc_target_section_offset != 0) {
+        // Add the reloc section offset to the section's address and put the result in the target register.
+        sljit_emit_op2(compiler, SLJIT_ADD, reg, 0, reg, 0, SLJIT_IMM, ctx.reloc_target_section_offset);
+    }
+}
+
+void N64Recomp::LiveGenerator::process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const {
+    // Skip instructions that output to $zero
+    if (outputs_to_zero(op.output, ctx)) {
+        return;
+    }
+
+    sljit_sw dst;
+    sljit_sw dstw;
+    sljit_sw src;
+    sljit_sw srcw;
+    bool output_good = get_operand_values(op.output, ctx, dst, dstw);
+    bool input_good = get_operand_values(op.input, ctx, src, srcw);
+
+    if (!output_good || !input_good) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // If a relocation is needed for the input operand, perform the relocation and store the result directly.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE) {
+        // Only allow relocation of lui with an immediate.
+        if (op.operation != UnaryOpType::Lui || op.input != Operand::ImmU16) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Only allow HI16 relocs.
+        if (ctx.reloc_type != RelocType::R_MIPS_HI16) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Load the relocated address into temp1.
+        load_relocated_address(ctx, Registers::arithmetic_temp1);
+
+        // HI16 reloc on a lui
+        // The 32-bit address (a) is equal to section address + section offset
+        // The 16-bit immediate is equal to (a - (int16_t)a) >> 16
+        // Therefore, the register should be set to (int32_t)(a - (int16_t)a) as the shifts cancel out and the lower 16 bits are zero.
+
+        // Extract a sign extended 16-bit value from the lower half of the relocated address and put it in temp2.
+        sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0);
+
+        // Subtract the sign extended 16-bit value from the full address to get the HI16 value and place it in the destination.
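+        // Worked example: for a = 0x80123A24, (int16_t)a is 0x3A24, so the register gets 0x80120000 (imm 0x8012);
+        // for a = 0x8012CA24, (int16_t)a is -0x35DC, so the register gets 0x80130000, compensating for the lo16 sign extension.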
+        sljit_emit_op2(compiler, SLJIT_SUB, dst, dstw, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0);
+        return;
+    }
+
+    sljit_s32 jit_op = SLJIT_BREAKPOINT;
+
+    bool float_op = false;
+    bool func_float_op = false;
+
+    auto emit_s_func = [this, src, srcw, dst, dstw, &func_float_op](float (*func)(float)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(F32, F32), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, dst, dstw, SLJIT_RETURN_FREG, 0);
+    };
+
+    auto emit_d_func = [this, src, srcw, dst, dstw, &func_float_op](double (*func)(double)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(F64, F64), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, dst, dstw, SLJIT_RETURN_FREG, 0);
+    };
+
+    auto emit_l_from_s_func = [this, src, srcw, dst, dstw, &func_float_op](int64_t (*func)(float)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, F32), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    auto emit_w_from_s_func = [this, src, srcw, dst, dstw, &func_float_op](int32_t (*func)(float)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(32, F32), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    auto emit_l_from_d_func = [this, src, srcw, dst, dstw, &func_float_op](int64_t (*func)(double)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, F64), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    auto emit_w_from_d_func = [this, src, srcw, dst, dstw, &func_float_op](int32_t (*func)(double)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(32, F64), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    switch (op.operation) {
+        case UnaryOpType::Lui:
+            if (src != SLJIT_IMM) {
+                assert(false);
+                errored = true;
+                break;
+            }
+            src = SLJIT_IMM;
+            srcw = (sljit_sw)(int32_t)(srcw << 16);
+            jit_op = SLJIT_MOV;
+            break;
+        case UnaryOpType::NegateFloat:
+            jit_op = SLJIT_NEG_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::NegateDouble:
+            jit_op = SLJIT_NEG_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::AbsFloat:
+            jit_op = SLJIT_ABS_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::AbsDouble:
+            jit_op = SLJIT_ABS_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::SqrtFloat:
+            emit_s_func(sqrtf);
+            break;
+        case UnaryOpType::SqrtDouble:
+            emit_d_func(sqrt);
+            break;
+        case UnaryOpType::ConvertSFromW:
+            jit_op = SLJIT_CONV_F32_FROM_S32;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertWFromS:
+            emit_w_from_s_func(do_cvt_w_s);
+            break;
+        case UnaryOpType::ConvertDFromW:
+            jit_op = SLJIT_CONV_F64_FROM_S32;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertWFromD:
+            emit_w_from_d_func(do_cvt_w_d);
+            break;
+        case UnaryOpType::ConvertDFromS:
+            jit_op = SLJIT_CONV_F64_FROM_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertSFromD:
+            // SLJIT_CONV_F32_FROM_F64 uses the current rounding mode, just as CVT_S_D does.
+            jit_op = SLJIT_CONV_F32_FROM_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertDFromL:
+            jit_op = SLJIT_CONV_F64_FROM_SW;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertLFromD:
+            emit_l_from_d_func(do_cvt_l_d);
+            break;
+        case UnaryOpType::ConvertSFromL:
+            jit_op = SLJIT_CONV_F32_FROM_SW;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertLFromS:
+            emit_l_from_s_func(do_cvt_l_s);
+            break;
+        case UnaryOpType::TruncateWFromS:
+            // SLJIT_CONV_S32_FROM_F32 rounds towards zero, just as TRUNC_W_S does.
+            jit_op = SLJIT_CONV_S32_FROM_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::TruncateWFromD:
+            // SLJIT_CONV_S32_FROM_F64 rounds towards zero, just as TRUNC_W_D does.
+            jit_op = SLJIT_CONV_S32_FROM_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::TruncateLFromS:
+            // SLJIT_CONV_SW_FROM_F32 rounds towards zero, just as TRUNC_L_S does.
+            jit_op = SLJIT_CONV_SW_FROM_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::TruncateLFromD:
+            // SLJIT_CONV_SW_FROM_F64 rounds towards zero, just as TRUNC_L_D does.
+            jit_op = SLJIT_CONV_SW_FROM_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::RoundWFromS:
+            emit_w_from_s_func(do_round_w_s);
+            break;
+        case UnaryOpType::RoundWFromD:
+            emit_w_from_d_func(do_round_w_d);
+            break;
+        case UnaryOpType::RoundLFromS:
+            emit_l_from_s_func(do_round_l_s);
+            break;
+        case UnaryOpType::RoundLFromD:
+            emit_l_from_d_func(do_round_l_d);
+            break;
+        case UnaryOpType::CeilWFromS:
+            emit_w_from_s_func(do_ceil_w_s);
+            break;
+        case UnaryOpType::CeilWFromD:
+            emit_w_from_d_func(do_ceil_w_d);
+            break;
+        case UnaryOpType::CeilLFromS:
+            emit_l_from_s_func(do_ceil_l_s);
+            break;
+        case UnaryOpType::CeilLFromD:
+            emit_l_from_d_func(do_ceil_l_d);
+            break;
+        case UnaryOpType::FloorWFromS:
+            emit_w_from_s_func(do_floor_w_s);
+            break;
+        case UnaryOpType::FloorWFromD:
+            emit_w_from_d_func(do_floor_w_d);
+            break;
+        case UnaryOpType::FloorLFromS:
+            emit_l_from_s_func(do_floor_l_s);
+            break;
+        case UnaryOpType::FloorLFromD:
+            emit_l_from_d_func(do_floor_l_d);
+            break;
+        case UnaryOpType::None:
+            jit_op = SLJIT_MOV;
+            break;
+        case UnaryOpType::ToS32:
+        case UnaryOpType::ToInt32:
+            jit_op = SLJIT_MOV_S32;
+            break;
+        // Unary ops that can't be used as a standalone operation
+        case UnaryOpType::ToU32:
+        case UnaryOpType::ToS64:
+        case UnaryOpType::ToU64:
+        case UnaryOpType::Mask5:
+        case UnaryOpType::Mask6:
+            assert(false && "Unsupported unary op");
+            errored = true;
+            return;
+    }
+
+    if (func_float_op) {
+        // Already handled by the lambda.
+    }
+    else if (float_op) {
+        sljit_emit_fop1(compiler, jit_op, dst, dstw, src, srcw);
+    }
+    else {
+        sljit_emit_op1(compiler, jit_op, dst, dstw, src, srcw);
+    }
+}
+
+void N64Recomp::LiveGenerator::process_store_op(const StoreOp& op, const InstructionContext& ctx) const {
+    sljit_sw src;
+    sljit_sw srcw;
+    sljit_sw imm = (sljit_sw)(int16_t)ctx.imm16;
+
+    get_operand_values(op.value_input, ctx, src, srcw);
+
+    // Only LO16 relocs are valid on stores.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE && ctx.reloc_type != RelocType::R_MIPS_LO16) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    if (ctx.reloc_type == RelocType::R_MIPS_LO16) {
+        // Load the relocated address into temp1.
+        load_relocated_address(ctx, Registers::arithmetic_temp1);
+        // Extract the LO16 value from the full address (sign extended lower 16 bits).
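+        // (Together with the HI16 half emitted by the paired lui: (hi16 << 16) + (int16_t)lo16 equals the full relocated address.)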
+        sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        // Add the base register (rs) to the LO16 immediate.
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(ctx.rs));
+    }
+    else {
+        // TODO 0 immediate optimization.
+
+        // Add the base register (rs) and the immediate to get the address and store it in the arithmetic temp.
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(ctx.rs), SLJIT_IMM, imm);
+    }
+
+    auto do_unaligned_store_op = [src, srcw, this](bool left, bool doubleword) {
+        // Determine the shift direction to use for calculating the mask and shifting the input value.
+        sljit_sw shift_op = left ? SLJIT_LSHR : SLJIT_SHL;
+        // Determine the operation's word size.
+        sljit_sw word_size = doubleword ? 8 : 4;
+
+        // Mask the address with the alignment mask to get the misalignment and put it in temp2.
+        // misalignment = addr & (word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, word_size - 1);
+
+        // Mask the address with ~alignment_mask to get the aligned address and put it in temp1.
+        // addr = addr & ~(word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, ~(word_size - 1));
+
+        // Load the word at rdram + aligned address into temp3 with sign-extension.
+        // loaded_value = *addr
+        if (doubleword) {
+            // Rotate the loaded doubleword by 32 bits to swap the two words into the right order.
+            sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp3, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32);
+        }
+        else {
+            // Use MOV_S32 to sign-extend the loaded word.
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp3, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0);
+        }
+
+        // Invert the misalignment if this is a right store.
+        if (!left) {
+            // misalignment = word_size - 1 - misalignment
+            sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp2, 0, SLJIT_IMM, word_size - 1, Registers::arithmetic_temp2, 0);
+        }
+
+        // Calculate the misalignment shift and put it into temp2.
+        // misalignment_shift = misalignment * 8
+        sljit_emit_op2(compiler, SLJIT_SHL, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, 3);
+
+        // Shift the input value by the misalignment shift and put it into temp4.
+        // input_value SHIFT= misalignment_shift
+        sljit_emit_op2(compiler, shift_op, Registers::arithmetic_temp4, 0, src, srcw, Registers::arithmetic_temp2, 0);
+
+        // Calculate the misalignment mask and put it into temp2. Use a 32-bit shift if this is a 32-bit operation.
+        // misalignment_mask = word(-1) SHIFT misalignment_shift
+        sljit_emit_op2(compiler, doubleword ? shift_op : (shift_op | SLJIT_32),
+            Registers::arithmetic_temp2, 0,
+            SLJIT_IMM, doubleword ? uint64_t(-1) : uint32_t(-1),
+            Registers::arithmetic_temp2, 0);
+
+        // Mask the input value with the misalignment mask and place it into temp4.
+        // masked_value = shifted_value & misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp2, 0);
+
+        // Invert the misalignment mask and store it into temp2.
+        // misalignment_mask = ~misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, sljit_sw(-1));
+
+        // Mask the loaded value by the misalignment mask.
+        // loaded_value &= misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp2, 0);
+
+        // Combine the masked loaded value with the shifted input value and store it at the target address.
+        // out = masked_value | loaded_value
+        if (doubleword) {
+            // Combine the values into a temp so that it can be rotated to the correct word order.
+            sljit_emit_op2(compiler, SLJIT_OR, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp3, 0);
+            sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, Registers::arithmetic_temp4, 0, SLJIT_IMM, 32);
+        }
+        else {
+            sljit_emit_op2(compiler, SLJIT_OR32, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp3, 0);
+        }
+    };
+
+    switch (op.type) {
+        case StoreOpType::SD:
+        case StoreOpType::SDC1:
+            // Rotate the input value by 32 bits to swap the words and store it at address + rdram.
+            sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw, SLJIT_IMM, 32);
+            break;
+        case StoreOpType::SDL:
+            do_unaligned_store_op(true, true);
+            break;
+        case StoreOpType::SDR:
+            do_unaligned_store_op(false, true);
+            break;
+        case StoreOpType::SW:
+        case StoreOpType::SWC1:
+            // store the 32-bit value at address + rdram
+            sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw);
+            break;
+        case StoreOpType::SWL:
+            do_unaligned_store_op(true, false);
+            break;
+        case StoreOpType::SWR:
+            do_unaligned_store_op(false, false);
+            break;
+        case StoreOpType::SH:
+            // xor the address with 2
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, 2);
+            // store the 16-bit value at address + rdram
+            sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw);
+            break;
+        case StoreOpType::SB:
+            // xor the address with 3
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, 3);
+            // store the 8-bit value at address + rdram
+            sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw);
+            break;
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_function_start(const std::string& function_name, size_t func_index) const {
+    context->function_name = function_name;
+    context->func_labels[func_index] = sljit_emit_label(compiler);
+    // sljit_emit_op0(compiler, SLJIT_BREAKPOINT);
+    sljit_emit_enter(compiler, 0, SLJIT_ARGS2V(P, P), 4 | SLJIT_ENTER_FLOAT(1), 5 | SLJIT_ENTER_FLOAT(0), 0);
+    sljit_emit_op2(compiler, SLJIT_SUB, Registers::rdram, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+}
+
+void N64Recomp::LiveGenerator::emit_function_end() const {
+    // Check that all jumps have been paired to a label.
+    if (!context->pending_jumps.empty()) {
+        assert(false);
+        errored = true;
+    }
+
+    // Populate the labels for pending switches and move them into the unlinked jump tables.
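+    // (The entry addresses themselves can't be known yet; finish() fills the tables in with sljit_get_label_addr after code generation.)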
+    bool invalid_switch = false;
+    for (size_t switch_index = 0; switch_index < context->switch_jump_labels.size(); switch_index++) {
+        const std::vector<std::string>& cur_labels = context->switch_jump_labels[switch_index];
+        std::vector<sljit_label*> cur_label_addrs{};
+        cur_label_addrs.resize(cur_labels.size());
+        for (size_t case_index = 0; case_index < cur_labels.size(); case_index++) {
+            // Find the label.
+            auto find_it = context->labels.find(cur_labels[case_index]);
+            if (find_it == context->labels.end()) {
+                // Label not found, invalid switch.
+                // Track this in a variable instead of returning immediately so that the pending labels are still cleared.
+                invalid_switch = true;
+                break;
+            }
+            cur_label_addrs[case_index] = find_it->second;
+        }
+        context->unlinked_jump_tables.emplace_back(
+            std::make_pair<std::vector<sljit_label*>, std::unique_ptr<void*[]>>(
+                std::move(cur_label_addrs),
+                std::move(context->pending_jump_tables[switch_index])
+            )
+        );
+    }
+    context->switch_jump_labels.clear();
+    context->pending_jump_tables.clear();
+
+    // Clear the labels to prevent labels from one function being jumped to by another.
+    context->labels.clear();
+
+    if (invalid_switch) {
+        assert(false);
+        errored = true;
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_function_call_lookup(uint32_t addr) const {
+    // Load the address immediate into the first argument.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, int32_t(addr));
+
+    // Call get_function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, 32), SLJIT_IMM, sljit_sw(inputs.get_function));
+
+    // Copy the return value into R2 so that it can be used for icall
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_R0, 0);
+
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+
+    // Call the function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P), SLJIT_R2, 0);
+}
+
+void N64Recomp::LiveGenerator::emit_function_call_by_register(int reg) const {
+    // Load the register's value into the first argument.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg));
+
+    // Call get_function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, 32), SLJIT_IMM, sljit_sw(inputs.get_function));
+
+    // Copy the return value into R2 so that it can be used for icall
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_R0, 0);
+
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+
+    // Call the function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P), SLJIT_R2, 0);
+}
+
+void N64Recomp::LiveGenerator::emit_function_call_reference_symbol(const Context&, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const {
+    (void)symbol_index;
+
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // sljit_emit_op0(compiler, SLJIT_BREAKPOINT);
+    // Call the function and save the jump to set its label later on.
+    sljit_jump* call_jump = sljit_emit_call(compiler, SLJIT_CALL | SLJIT_REWRITABLE_JUMP, SLJIT_ARGS2V(P, P));
+    // Set a dummy jump value, this will get replaced during reference/import symbol jump population.
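+    // (SLJIT_REWRITABLE_JUMP reserves a patchable target for this call, which is what allows sljit_set_jump_addr to redirect it later.)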
+    if (section_index == N64Recomp::SectionImport) {
+        sljit_set_target(call_jump, sljit_uw(-1));
+        context->import_jumps_by_index.emplace(symbol_index, call_jump);
+    }
+    else {
+        sljit_set_target(call_jump, sljit_uw(-2));
+        context->reference_symbol_jumps.emplace_back(std::make_pair(
+            ReferenceJumpDetails{
+                .section = section_index,
+                .section_offset = target_section_offset
+            },
+            call_jump
+        ));
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_function_call(const Context&, size_t function_index) const {
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // Call the function and save the jump to set its label later on.
+    sljit_jump* call_jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P));
+    context->inner_calls.emplace_back(InnerCall{ .target_func_index = function_index, .jump = call_jump });
+}
+
+void N64Recomp::LiveGenerator::emit_named_function_call(const std::string& function_name) const {
+    // The live recompiler can't call functions by name. This is only used for statics, so it's not an issue.
+    assert(false);
+    errored = true;
+}
+
+void N64Recomp::LiveGenerator::emit_goto(const std::string& target) const {
+    sljit_jump* jump = sljit_emit_jump(compiler, SLJIT_JUMP);
+    // Check if the label already exists.
+    auto find_it = context->labels.find(target);
+    if (find_it != context->labels.end()) {
+        sljit_set_label(jump, find_it->second);
+    }
+    // It doesn't, so queue this as a pending jump to be resolved later.
+    else {
+        context->pending_jumps[target].push_back(jump);
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_label(const std::string& label_name) const {
+    sljit_label* label = sljit_emit_label(compiler);
+
+    // Check if there are any pending jumps for this label and assign them if so.
+    auto find_it = context->pending_jumps.find(label_name);
+    if (find_it != context->pending_jumps.end()) {
+        for (sljit_jump* jump : find_it->second) {
+            sljit_set_label(jump, label);
+        }
+
+        // Remove the pending jumps for this label.
+        context->pending_jumps.erase(find_it);
+    }
+
+    context->labels.emplace(label_name, label);
+}
+
+void N64Recomp::LiveGenerator::emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const {
+    (void)jtbl;
+    (void)reg;
+    // Nothing to do here, the live recompiler performs a subtraction to get the switch's case.
+}
+
+void N64Recomp::LiveGenerator::emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const {
+    // Make sure there's no pending jump.
+    if (context->cur_branch_jump != nullptr) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // Branch conditions do not allow unary ops, except for ToS64 on the first operand to indicate the branch comparison is signed.
+    if (op.operands.operand_operations[0] != UnaryOpType::None && op.operands.operand_operations[0] != UnaryOpType::ToS64) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    if (op.operands.operand_operations[1] != UnaryOpType::None) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    sljit_s32 condition_type;
+    bool cmp_signed = op.operands.operand_operations[0] == UnaryOpType::ToS64;
+    // Comparisons need to be inverted to account for the fact that the generator is expected to generate a code block that only runs if
+    // the condition is met, meaning the branch should be taken if the condition isn't met.
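+    // For example, a MIPS beq becomes an SLJIT_NOT_EQUAL compare-jump that skips over the recompiled branch body.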
+    switch (op.comparison) {
+        case BinaryOpType::Equal:
+            condition_type = SLJIT_NOT_EQUAL;
+            break;
+        case BinaryOpType::NotEqual:
+            condition_type = SLJIT_EQUAL;
+            break;
+        case BinaryOpType::GreaterEq:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_LESS;
+            }
+            else {
+                condition_type = SLJIT_LESS;
+            }
+            break;
+        case BinaryOpType::Greater:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_LESS_EQUAL;
+            }
+            else {
+                condition_type = SLJIT_LESS_EQUAL;
+            }
+            break;
+        case BinaryOpType::LessEq:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_GREATER;
+            }
+            else {
+                condition_type = SLJIT_GREATER;
+            }
+            break;
+        case BinaryOpType::Less:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_GREATER_EQUAL;
+            }
+            else {
+                condition_type = SLJIT_GREATER_EQUAL;
+            }
+            break;
+        default:
+            assert(false && "Invalid branch condition comparison operation!");
+            errored = true;
+            return;
+    }
+    sljit_sw src1;
+    sljit_sw src1w;
+    sljit_sw src2;
+    sljit_sw src2w;
+
+    get_operand_values(op.operands.operands[0], ctx, src1, src1w);
+    get_operand_values(op.operands.operands[1], ctx, src2, src2w);
+
+    // Relocations aren't valid on conditional branches.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // Create a compare jump and track it as the pending branch jump.
+    context->cur_branch_jump = sljit_emit_cmp(compiler, condition_type, src1, src1w, src2, src2w);
+}
+
+void N64Recomp::LiveGenerator::emit_branch_close() const {
+    // Make sure there's a pending branch jump.
+    if (context->cur_branch_jump == nullptr) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // Assign a label at this point to the pending branch jump and clear it.
+    sljit_set_label(context->cur_branch_jump, sljit_emit_label(compiler));
+    context->cur_branch_jump = nullptr;
+}
+
+void N64Recomp::LiveGenerator::emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const {
+    // Populate the switch's labels.
+    std::vector<std::string> cur_labels{};
+    cur_labels.resize(jtbl.entries.size());
+    for (size_t i = 0; i < cur_labels.size(); i++) {
+        cur_labels[i] = fmt::format("L_{:08X}", jtbl.entries[i]);
+    }
+    context->switch_jump_labels.emplace_back(std::move(cur_labels));
+
+    // Allocate the jump table.
+    std::unique_ptr<void*[]> cur_jump_table = std::make_unique<void*[]>(jtbl.entries.size());
+
+    /// Codegen
+
+    // Load the jump target register. The lw instruction was patched into an addiu, so this holds
+    // the address of the jump table entry instead of the actual jump target.
+    sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg));
+    // Subtract the jump table's address from the jump target to get the jump table addend.
+    // Sign extend the jump table address to 64 bits so that the entire register's contents are used instead of just the lower 32 bits.
+    const auto& jtbl_section = recompiler_context.sections[jtbl.section_index];
+    if (jtbl_section.relocatable) {
+        // Make a dummy instruction context to pass to `load_relocated_address`.
+        InstructionContext dummy_context{};
+
+        // Get the relocated address of the jump table.
+        uint32_t section_offset = jtbl.vram - jtbl_section.ram_addr;
+
+        // Populate the necessary fields of the dummy context and load the relocated address into temp2.
+        dummy_context.reloc_section_index = jtbl.section_index;
+        dummy_context.reloc_target_section_offset = section_offset;
+        load_relocated_address(dummy_context, Registers::arithmetic_temp2);
+
+        // Subtract the relocated jump table start address from the loaded address.
+        sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0);
+    }
+    else {
+        sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, (sljit_sw)((int32_t)jtbl.vram));
+    }
+
+    // Bounds check the addend. If it's greater than or equal to the jump table size (entries * sizeof(u32)) then jump to the switch error.
+    sljit_jump* switch_error_jump = sljit_emit_cmp(compiler, SLJIT_GREATER_EQUAL, Registers::arithmetic_temp1, 0, SLJIT_IMM, jtbl.entries.size() * sizeof(uint32_t));
+    context->switch_error_jumps.emplace_back(SwitchErrorJump{.instr_vram = jtbl.jr_vram, .jtbl_vram = jtbl.vram, .jump = switch_error_jump});
+
+    // Multiply the jump table addend by 2 to get the addend for the real jump table. (4 bytes per entry to 8 bytes per entry).
+    sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+    // Load the real jump table address.
+    sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp2, 0, SLJIT_IMM, (sljit_sw)cur_jump_table.get());
+    // Load the real jump entry.
+    sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::arithmetic_temp1, Registers::arithmetic_temp2), 0);
+    // Jump to the loaded entry.
+    sljit_emit_ijump(compiler, SLJIT_JUMP, Registers::arithmetic_temp1, 0);
+
+    // Move the jump table into the pending jump tables.
+    context->pending_jump_tables.emplace_back(std::move(cur_jump_table));
+}
+
+void N64Recomp::LiveGenerator::emit_case(int case_index, const std::string& target_label) const {
+    (void)case_index;
+    (void)target_label;
+    // Nothing to do here, the jump table is built in emit_switch.
+}
+
+void N64Recomp::LiveGenerator::emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const {
+    (void)instr_vram;
+    (void)jtbl_vram;
+    // Nothing to do here, the switch error jump was already emitted in emit_switch.
+}
+
+void N64Recomp::LiveGenerator::emit_switch_close() const {
+    // Nothing to do here, the jump table is built in emit_switch.
+}
+
+void N64Recomp::LiveGenerator::emit_return() const {
+    sljit_emit_return_void(compiler);
+}
+
+void N64Recomp::LiveGenerator::emit_check_fr(int fpr) const {
+    (void)fpr;
+    // Nothing to do here.
+}
+
+void N64Recomp::LiveGenerator::emit_check_nan(int fpr, bool is_double) const {
+    (void)fpr;
+    (void)is_double;
+    // Nothing to do here.
+}
+
+void N64Recomp::LiveGenerator::emit_cop0_status_read(int reg) const {
+    // Skip the read if the target is the zero register.
+    if (reg != 0) {
+        // Load ctx into R0.
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, Registers::ctx, 0);
+
+        // Call cop0_status_read, which takes ctx and returns the status value.
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(W, P), SLJIT_IMM, sljit_sw(inputs.cop0_status_read));
+
+        // Store the result in the output register.
+ sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg), SLJIT_R0, 0); + } +} + +void N64Recomp::LiveGenerator::emit_cop0_status_write(int reg) const { + sljit_sw src; + sljit_sw srcw; + get_gpr_values(reg, src, srcw); + + // Load ctx and the input register value into R0 and R1 + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, Registers::ctx, 0); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, src, srcw); + + // Call cop0_status_write. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P,32), SLJIT_IMM, sljit_sw(inputs.cop0_status_write)); +} + +void N64Recomp::LiveGenerator::emit_cop1_cs_read(int reg) const { + // Skip the read if the target is the zero register. + if (reg != 0) { + sljit_sw dst; + sljit_sw dstw; + get_gpr_values(reg, dst, dstw); + + // Call get_cop1_cs. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS0(32), SLJIT_IMM, sljit_sw(get_cop1_cs)); + + // Store the result in the output register. + sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0); + } +} + +void N64Recomp::LiveGenerator::emit_cop1_cs_write(int reg) const { + sljit_sw src; + sljit_sw srcw; + get_gpr_values(reg, src, srcw); + + // Load the input register value into R0. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw); + + // Call set_cop1_cs. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(32), SLJIT_IMM, sljit_sw(set_cop1_cs)); +} + +void N64Recomp::LiveGenerator::emit_muldiv(InstrId instr_id, int reg1, int reg2) const { + sljit_sw src1; + sljit_sw src1w; + sljit_sw src2; + sljit_sw src2w; + get_gpr_values(reg1, src1, src1w); + get_gpr_values(reg2, src2, src2w); + + auto do_mul32_op = [src1, src1w, src2, src2w, this](bool is_signed) { + // Load the two inputs into the multiplication input registers (R0/R1). + if (is_signed) { + // 32-bit signed multiplication is really 64 bits * 35 bits, so load accordingly. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src1, src1w); + + // Sign extend to 35 bits by shifting left by 64 - 35 and then shifting right by the same amount. + sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, src2, src2w, SLJIT_IMM, 64 - 35); + sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 64 - 35); + } + else { + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R0, 0, src1, src1w); + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R1, 0, src2, src2w); + } + + // Perform the multiplication. + sljit_emit_op0(compiler, is_signed ? SLJIT_LMUL_SW : SLJIT_LMUL_UW); + + // Move the results into hi and lo with sign extension. + sljit_emit_op2(compiler, SLJIT_ASHR, Registers::hi, 0, SLJIT_R0, 0, SLJIT_IMM, 32); + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::lo, 0, SLJIT_R0, 0); + }; + + auto do_mul64_op = [src1, src1w, src2, src2w, this](bool is_signed) { + // Load the two inputs into the multiplication input registers (R0/R1). + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src1, src1w); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, src2, src2w); + + // Perform the multiplication. + sljit_emit_op0(compiler, is_signed ? SLJIT_LMUL_SW : SLJIT_LMUL_UW); + + // Move the results into hi and lo. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::hi, 0, SLJIT_R1, 0); + sljit_emit_op1(compiler, SLJIT_MOV, Registers::lo, 0, SLJIT_R0, 0); + }; + + auto do_div_op = [src1, src1w, src2, src2w, this](bool doubleword, bool is_signed) { + // Pick the division opcode based on the bit width and signedness. 
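+        // (Added note: sljit's DIVMOD opcodes divide R0 by R1 in place, leaving the quotient
+        // in R0 and the remainder in R1; that is why the operands are loaded into R0/R1 here
+        // and the results are saved from those registers afterwards.)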
+        // Note that the 64-bit division opcode is used for 32-bit signed division to match hardware behavior and prevent overflow.
+        sljit_sw div_opcode = doubleword ?
+            (is_signed ? SLJIT_DIVMOD_SW : SLJIT_DIVMOD_UW) :
+            (is_signed ? SLJIT_DIVMOD_SW : SLJIT_DIVMOD_U32);
+
+        // Pick the move opcode to use for loading the operands.
+        sljit_sw load_opcode = doubleword ? SLJIT_MOV :
+            (is_signed ? SLJIT_MOV_S32 : SLJIT_MOV_U32);
+
+        // Pick the move opcode to use for saving the results.
+        sljit_sw save_opcode = doubleword ? SLJIT_MOV : SLJIT_MOV_S32;
+
+        // Load the two inputs into R0 and R1 (the numerator and denominator).
+        sljit_emit_op1(compiler, load_opcode, SLJIT_R0, 0, src1, src1w);
+
+        // TODO figure out 32-bit signed division behavior when inputs aren't properly sign extended.
+        // if (!doubleword && is_signed) {
+        //     // Sign extend to 35 bits by shifting left by 64 - 35 and then shifting right by the same amount.
+        //     sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, src2, src2w, SLJIT_IMM, 64 - 35);
+        //     sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 64 - 35);
+        // }
+        // else {
+            sljit_emit_op1(compiler, load_opcode, SLJIT_R1, 0, src2, src2w);
+        // }
+
+        // Prevent overflow on 64-bit signed division.
+        if (doubleword && is_signed) {
+            // If the numerator is INT64_MIN and the denominator is -1, an overflow will occur. To prevent an exception and
+            // behave as the original hardware would, check if either of those conditions is false.
+            // If neither condition is false (i.e. both are true), set the denominator to 1.
+
+            // Xor the numerator with INT64_MIN. This will be zero if they're equal.
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, sljit_sw(INT64_MIN));
+
+            // Invert the denominator. This will be zero if it's -1.
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, sljit_sw(-1));
+
+            // Or the results of the previous two calculations and set the zero flag. This will be zero if both conditions were met.
+            sljit_emit_op2(compiler, SLJIT_OR | SLJIT_SET_Z, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp4, 0);
+
+            // If the OR's result was zero (zero flag set), meaning both conditions were true, replace the denominator with 1.
+            // i.e. conditionally move an immediate of 1 into the denominator register (R1) when the zero flag is set.
+            sljit_emit_select(compiler, SLJIT_ZERO, SLJIT_R1, SLJIT_IMM, 1, SLJIT_R1);
+        }
+
+        // If the denominator is 0, skip the division and jump to the special handling for that case.
+        sljit_jump* jump_skip_division = sljit_emit_cmp(compiler, SLJIT_EQUAL, SLJIT_R1, 0, SLJIT_IMM, 0);
+
+        // Perform the division.
+        sljit_emit_op0(compiler, div_opcode);
+
+        // Extract the remainder and quotient into the high and low registers respectively.
+        sljit_emit_op1(compiler, save_opcode, Registers::hi, 0, SLJIT_R1, 0);
+        sljit_emit_op1(compiler, save_opcode, Registers::lo, 0, SLJIT_R0, 0);
+
+        // Jump to the end of this routine.
+        sljit_jump* jump_to_end = sljit_emit_jump(compiler, SLJIT_JUMP);
+
+        // Emit a label and set it as the target of the jump if the denominator was zero.
+        sljit_label* after_division = sljit_emit_label(compiler);
+        sljit_set_label(jump_skip_division, after_division);
+
+        // Move the numerator into hi.
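+        // (Added background: this mirrors the VR4300's observed divide-by-zero behavior.
+        // HI receives the numerator, and LO receives -1 for an unsigned or non-negative
+        // signed numerator, or 1 for a negative one: the negative signum computed below.)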
+        sljit_emit_op1(compiler, save_opcode, Registers::hi, 0, SLJIT_R0, 0);
+
+        if (is_signed) {
+            // Calculate the negative signum of the numerator and place it in lo.
+            // neg_signum = ((int64_t)(~x) >> (bit width - 1)) | 1
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::lo, 0, SLJIT_R0, 0, SLJIT_IMM, sljit_sw(-1));
+            sljit_emit_op2(compiler, SLJIT_ASHR, Registers::lo, 0, Registers::lo, 0, SLJIT_IMM, 64 - 1);
+            sljit_emit_op2(compiler, SLJIT_OR, Registers::lo, 0, Registers::lo, 0, SLJIT_IMM, 1);
+        }
+        else {
+            // Move -1 into lo.
+            sljit_emit_op1(compiler, SLJIT_MOV, Registers::lo, 0, SLJIT_IMM, sljit_sw(-1));
+        }
+
+        // Emit a label and set it as the target of the jump after the division.
+        sljit_label* end_label = sljit_emit_label(compiler);
+        sljit_set_label(jump_to_end, end_label);
+    };
+
+
+    switch (instr_id) {
+        case InstrId::cpu_mult:
+            do_mul32_op(true);
+            break;
+        case InstrId::cpu_multu:
+            do_mul32_op(false);
+            break;
+        case InstrId::cpu_dmult:
+            do_mul64_op(true);
+            break;
+        case InstrId::cpu_dmultu:
+            do_mul64_op(false);
+            break;
+        case InstrId::cpu_div:
+            do_div_op(false, true);
+            break;
+        case InstrId::cpu_divu:
+            do_div_op(false, false);
+            break;
+        case InstrId::cpu_ddiv:
+            do_div_op(true, true);
+            break;
+        case InstrId::cpu_ddivu:
+            do_div_op(true, false);
+            break;
+        default:
+            assert(false && "Invalid mul/div instruction id!");
+            break;
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_syscall(uint32_t instr_vram) const {
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // Load the vram into R2.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, instr_vram);
+    // Call syscall_handler.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, P, 32), SLJIT_IMM, sljit_sw(inputs.syscall_handler));
+}
+
+void N64Recomp::LiveGenerator::emit_do_break(uint32_t instr_vram) const {
+    // Load the vram into R0.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, instr_vram);
+    // Call do_break.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(32), SLJIT_IMM, sljit_sw(inputs.do_break));
+}
+
+void N64Recomp::LiveGenerator::emit_pause_self() const {
+    // Load rdram into R0.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    // Call pause_self.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(P), SLJIT_IMM, sljit_sw(inputs.pause_self));
+}
+
+void N64Recomp::LiveGenerator::emit_trigger_event(uint32_t event_index) const {
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // Load the global event index into R2.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, event_index + inputs.base_event_index);
+    // Call trigger_event, which takes rdram, ctx, and the event index.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, P, 32), SLJIT_IMM, sljit_sw(inputs.trigger_event));
+}
+
+void N64Recomp::LiveGenerator::emit_comment(const std::string& comment) const {
+    (void)comment;
+    // Nothing to do here.
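+    // (Added note: comments are only meaningful for the C generator, which writes them into
+    // its text output; there is nowhere to attach them in directly generated machine code.)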
+}
+
+bool N64Recomp::recompile_function_live(LiveGenerator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    return recompile_function_custom(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs);
+}
+
diff --git a/LiveRecomp/live_recompiler_test.cpp b/LiveRecomp/live_recompiler_test.cpp
new file mode 100644
index 0000000..c5673eb
--- /dev/null
+++ b/LiveRecomp/live_recompiler_test.cpp
@@ -0,0 +1,364 @@
+#include <chrono>
+#include <cinttypes>
+#include <filesystem>
+#include <fstream>
+
+#include "sljitLir.h"
+#include "recompiler/live_recompiler.h"
+#include "recomp.h"
+
+static std::vector<uint8_t> read_file(const std::filesystem::path& path, bool& found) {
+    std::vector<uint8_t> ret;
+    found = false;
+
+    std::ifstream file{ path, std::ios::binary};
+
+    if (file.good()) {
+        file.seekg(0, std::ios::end);
+        ret.resize(file.tellg());
+        file.seekg(0, std::ios::beg);
+
+        file.read(reinterpret_cast<char*>(ret.data()), ret.size());
+        found = true;
+    }
+
+    return ret;
+}
+
+
+uint32_t read_u32_swap(const std::vector<uint8_t>& vec, size_t offset) {
+    return byteswap(*reinterpret_cast<const uint32_t*>(&vec[offset]));
+}
+
+uint32_t read_u32(const std::vector<uint8_t>& vec, size_t offset) {
+    return *reinterpret_cast<const uint32_t*>(&vec[offset]);
+}
+
+std::vector<uint8_t> rdram;
+
+void byteswap_copy(uint8_t* dst, uint8_t* src, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        dst[i ^ 3] = src[i];
+    }
+}
+
+bool byteswap_compare(uint8_t* a, uint8_t* b, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        if (a[i ^ 3] != b[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+enum class TestError {
+    Success,
+    FailedToOpenInput,
+    FailedToRecompile,
+    UnknownStructType,
+    DataDifference
+};
+
+struct TestStats {
+    TestError error;
+    uint64_t codegen_microseconds;
+    uint64_t execution_microseconds;
+    uint64_t code_size;
+};
+
+void write1(uint8_t* rdram, recomp_context* ctx) {
+    MEM_B(0, ctx->r4) = 1;
+}
+
+recomp_func_t* test_get_function(int32_t vram) {
+    if (vram == 0x80100000) {
+        return write1;
+    }
+    assert(false);
+    return nullptr;
+}
+
+void test_switch_error(const char* func, uint32_t vram, uint32_t jtbl) {
+    printf(" Switch-case out of bounds in %s at 0x%08X for jump table at 0x%08X\n", func, vram, jtbl);
+}
+
+TestStats run_test(const std::filesystem::path& tests_dir, const std::string& test_name) {
+    std::filesystem::path input_path = tests_dir / (test_name + "_data.bin");
+    std::filesystem::path data_dump_path = tests_dir / (test_name + "_data_out.bin");
+
+    bool found;
+    std::vector<uint8_t> file_data = read_file(input_path, found);
+
+    if (!found) {
+        printf("Failed to open file: %s\n", input_path.string().c_str());
+        return { TestError::FailedToOpenInput };
+    }
+
+    // Parse the test file.
+    uint32_t text_offset = read_u32_swap(file_data, 0x00);
+    uint32_t text_length = read_u32_swap(file_data, 0x04);
+    uint32_t init_data_offset = read_u32_swap(file_data, 0x08);
+    uint32_t good_data_offset = read_u32_swap(file_data, 0x0C);
+    uint32_t data_length = read_u32_swap(file_data, 0x10);
+    uint32_t text_address = read_u32_swap(file_data, 0x14);
+    uint32_t data_address = read_u32_swap(file_data, 0x18);
+    uint32_t next_struct_address = read_u32_swap(file_data, 0x1C);
+
+    recomp_context ctx{};
+
+    byteswap_copy(&rdram[text_address - 0x80000000], &file_data[text_offset], text_length);
+    byteswap_copy(&rdram[data_address - 0x80000000], &file_data[init_data_offset], data_length);
+
+    // Build recompiler context.
+    N64Recomp::Context context{};
+
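+    // (Added recap of the header parsed above: eight big-endian words at the start of the
+    // test file: 0x00 text_offset, 0x04 text_length, 0x08 init_data_offset,
+    // 0x0C good_data_offset, 0x10 data_length, 0x14 text_address, 0x18 data_address,
+    // 0x1C next_struct_address.)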
+    // Move the file data into the context.
+    context.rom = std::move(file_data);
+
+    context.sections.resize(2);
+    // Create a section for the function to exist in.
+    context.sections[0].ram_addr = text_address;
+    context.sections[0].rom_addr = text_offset;
+    context.sections[0].size = text_length;
+    context.sections[0].name = ".text";
+    context.sections[0].executable = true;
+    context.sections[0].relocatable = true;
+    context.section_functions.resize(context.sections.size());
+    // Create a section for .data (used for relocations)
+    context.sections[1].ram_addr = data_address;
+    context.sections[1].rom_addr = init_data_offset;
+    context.sections[1].size = data_length;
+    context.sections[1].name = ".data";
+    context.sections[1].executable = false;
+    context.sections[1].relocatable = true;
+
+    size_t start_func_index;
+    uint32_t function_desc_address = 0;
+    uint32_t reloc_desc_address = 0;
+
+    // Read any extra structs.
+    while (next_struct_address != 0) {
+        uint32_t cur_struct_address = next_struct_address;
+        uint32_t struct_type = read_u32_swap(context.rom, next_struct_address + 0x00);
+        next_struct_address = read_u32_swap(context.rom, next_struct_address + 0x04);
+
+        switch (struct_type) {
+            case 1: // Function desc
+                function_desc_address = cur_struct_address;
+                break;
+            case 2: // Relocation
+                reloc_desc_address = cur_struct_address;
+                break;
+            default:
+                printf("Unknown struct type %u\n", struct_type);
+                return { TestError::UnknownStructType };
+        }
+    }
+
+    // Check if a function description exists.
+    if (function_desc_address == 0) {
+        // No function description, so treat the whole thing as one function.
+
+        // Get the function's instruction words.
+        std::vector<uint32_t> text_words{};
+        text_words.resize(text_length / sizeof(uint32_t));
+        for (size_t i = 0; i < text_words.size(); i++) {
+            text_words[i] = read_u32(context.rom, text_offset + i * sizeof(uint32_t));
+        }
+
+        // Add the function to the context.
+        context.functions_by_vram[text_address].emplace_back(context.functions.size());
+        context.section_functions.emplace_back(context.functions.size());
+        context.sections[0].function_addrs.emplace_back(text_address);
+        context.functions.emplace_back(
+            text_address,
+            text_offset,
+            text_words,
+            "test_func",
+            0
+        );
+        start_func_index = 0;
+    }
+    else {
+        // Use the function description.
+        uint32_t num_funcs = read_u32_swap(context.rom, function_desc_address + 0x08);
+        start_func_index = read_u32_swap(context.rom, function_desc_address + 0x0C);
+
+        for (size_t func_index = 0; func_index < num_funcs; func_index++) {
+            uint32_t cur_func_address = read_u32_swap(context.rom, function_desc_address + 0x10 + 0x00 + 0x08 * func_index);
+            uint32_t cur_func_length = read_u32_swap(context.rom, function_desc_address + 0x10 + 0x04 + 0x08 * func_index);
+            uint32_t cur_func_offset = cur_func_address - text_address + text_offset;
+
+            // Get the function's instruction words.
+            std::vector<uint32_t> text_words{};
+            text_words.resize(cur_func_length / sizeof(uint32_t));
+            for (size_t i = 0; i < text_words.size(); i++) {
+                text_words[i] = read_u32(context.rom, cur_func_offset + i * sizeof(uint32_t));
+            }
+
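+            // (Added note: each entry parsed above is a pair of big-endian words starting at
+            // offset 0x10 of the function-description struct: the function's vram address
+            // followed by its length in bytes.)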
+            // Add the function to the context.
+            context.functions_by_vram[cur_func_address].emplace_back(context.functions.size());
+            context.section_functions.emplace_back(context.functions.size());
+            context.sections[0].function_addrs.emplace_back(cur_func_address);
+            context.functions.emplace_back(
+                cur_func_address,
+                cur_func_offset,
+                std::move(text_words),
+                "test_func_" + std::to_string(func_index),
+                0
+            );
+        }
+    }
+
+    // Check if a relocation description exists.
+    if (reloc_desc_address != 0) {
+        uint32_t num_relocs = read_u32_swap(context.rom, reloc_desc_address + 0x08);
+        for (uint32_t reloc_index = 0; reloc_index < num_relocs; reloc_index++) {
+            uint32_t cur_desc_address = reloc_desc_address + 0x0C + reloc_index * 4 * sizeof(uint32_t);
+            uint32_t reloc_type = read_u32_swap(context.rom, cur_desc_address + 0x00);
+            uint32_t reloc_section = read_u32_swap(context.rom, cur_desc_address + 0x04);
+            uint32_t reloc_address = read_u32_swap(context.rom, cur_desc_address + 0x08);
+            uint32_t reloc_target_offset = read_u32_swap(context.rom, cur_desc_address + 0x0C);
+
+            context.sections[0].relocs.emplace_back(N64Recomp::Reloc{
+                .address = reloc_address,
+                .target_section_offset = reloc_target_offset,
+                .symbol_index = 0,
+                .target_section = static_cast<uint16_t>(reloc_section),
+                .type = static_cast<N64Recomp::RelocType>(reloc_type),
+                .reference_symbol = false
+            });
+        }
+    }
+
+    std::vector<std::vector<uint32_t>> dummy_static_funcs{};
+    std::vector<int32_t> section_addresses{};
+    section_addresses.emplace_back(text_address);
+    section_addresses.emplace_back(data_address);
+
+    auto before_codegen = std::chrono::system_clock::now();
+
+    N64Recomp::LiveGeneratorInputs generator_inputs {
+        .switch_error = test_switch_error,
+        .get_function = test_get_function,
+        .reference_section_addresses = nullptr,
+        .local_section_addresses = section_addresses.data()
+    };
+
+    // Create the sljit compiler and the generator.
+    N64Recomp::LiveGenerator generator{ context.functions.size(), generator_inputs };
+
+    for (size_t func_index = 0; func_index < context.functions.size(); func_index++) {
+        std::ostringstream dummy_ostream{};
+
+        //sljit_emit_op0(compiler, SLJIT_BREAKPOINT);
+
+        if (!N64Recomp::recompile_function_live(generator, context, func_index, dummy_ostream, dummy_static_funcs, true)) {
+            return { TestError::FailedToRecompile };
+        }
+    }
+
+    // Generate the code.
+    N64Recomp::LiveGeneratorOutput output = generator.finish();
+
+    auto after_codegen = std::chrono::system_clock::now();
+
+    auto before_execution = std::chrono::system_clock::now();
+
+    int old_rounding = fegetround();
+
+    // Run the generated code.
+    ctx.r29 = 0xFFFFFFFF80000000 + rdram.size() - 0x10; // Set the stack pointer.
+    output.functions[start_func_index](rdram.data(), &ctx);
+
+    fesetround(old_rounding);
+
+    auto after_execution = std::chrono::system_clock::now();
+
+    // Check the result of running the code.
+    bool good = byteswap_compare(&rdram[data_address - 0x80000000], &context.rom[good_data_offset], data_length);
+
+    // Dump the data if the results don't match.
+    if (!good) {
+        std::ofstream data_dump_file{ data_dump_path, std::ios::binary };
+        std::vector<uint8_t> data_swapped;
+        data_swapped.resize(data_length);
+        byteswap_copy(data_swapped.data(), &rdram[data_address - 0x80000000], data_length);
+        data_dump_file.write(reinterpret_cast<const char*>(data_swapped.data()), data_length);
+        return { TestError::DataDifference };
+    }
+
+    // Return the test's stats.
+    TestStats ret{};
+    ret.error = TestError::Success;
+    ret.codegen_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(after_codegen - before_codegen).count();
+    ret.execution_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(after_execution - before_execution).count();
+    ret.code_size = output.code_size;
+
+    return ret;
+}
+
+int main(int argc, const char** argv) {
+    if (argc < 3) {
+        printf("Usage: %s [test directory] [test 1] ...\n", argv[0]);
+        return EXIT_SUCCESS;
+    }
+
+    N64Recomp::live_recompiler_init();
+
+    rdram.resize(0x8000000);
+
+    // Skip the first argument (program name) and second argument (test directory).
+    int count = argc - 1 - 1;
+    int passed_count = 0;
+
+    std::vector<size_t> failed_tests{};
+
+    for (size_t test_index = 0; test_index < count; test_index++) {
+        const char* cur_test_name = argv[2 + test_index];
+        printf("Running test: %s\n", cur_test_name);
+        TestStats stats = run_test(argv[1], cur_test_name);
+
+        switch (stats.error) {
+            case TestError::Success:
+                printf(" Success\n");
+                printf(" Generated %" PRIu64 " bytes in %" PRIu64 " microseconds and ran in %" PRIu64 " microseconds\n",
+                    stats.code_size, stats.codegen_microseconds, stats.execution_microseconds);
+                passed_count++;
+                break;
+            case TestError::FailedToOpenInput:
+                printf(" Failed to open input data file\n");
+                break;
+            case TestError::FailedToRecompile:
+                printf(" Failed to recompile\n");
+                break;
+            case TestError::UnknownStructType:
+                printf(" Unknown additional data struct type in test data\n");
+                break;
+            case TestError::DataDifference:
+                printf(" Output data did not match, dumped to file\n");
+                break;
+        }
+
+        if (stats.error != TestError::Success) {
+            failed_tests.emplace_back(test_index);
+        }
+
+        printf("\n");
+    }
+
+    printf("Passed %d/%d tests\n", passed_count, count);
+    if (!failed_tests.empty()) {
+        printf(" Failed: ");
+        for (size_t i = 0; i < failed_tests.size(); i++) {
+            size_t test_index = failed_tests[i];
+
+            printf("%s", argv[2 + test_index]);
+            if (i != failed_tests.size() - 1) {
+                printf(", ");
+            }
+        }
+        printf("\n");
+    }
+    return 0;
+}
diff --git a/OfflineModRecomp/main.cpp b/OfflineModRecomp/main.cpp
index aa25dc8..29e5232 100644
--- a/OfflineModRecomp/main.cpp
+++ b/OfflineModRecomp/main.cpp
@@ -3,7 +3,7 @@
 #include <filesystem>
 #include <fstream>
 
-#include "n64recomp.h"
+#include "recompiler/context.h"
 #include "rabbitizer.hpp"
 
 static std::vector<uint8_t> read_file(const std::filesystem::path& path, bool& found) {
@@ -221,8 +221,7 @@ int main(int argc, const char** argv) {
 
     // Perform a second pass for recompiling all the functions.
     for (size_t func_index = 0; func_index < mod_context.functions.size(); func_index++) {
-        auto& func = mod_context.functions[func_index];
-        if (!N64Recomp::recompile_function(mod_context, func, output_file, static_funcs_by_section, true)) {
+        if (!N64Recomp::recompile_function(mod_context, func_index, output_file, static_funcs_by_section, true)) {
             output_file.close();
             std::error_code ec;
             std::filesystem::remove(output_file_path, ec);
diff --git a/RecompModTool/main.cpp b/RecompModTool/main.cpp
index 78649ef..9fbb7d1 100644
--- a/RecompModTool/main.cpp
+++ b/RecompModTool/main.cpp
@@ -7,7 +7,7 @@
 #include
 #include "fmt/format.h"
 #include "fmt/ostream.h"
-#include "n64recomp.h"
+#include "recompiler/context.h"
 #include
 
 #ifdef _WIN32
diff --git a/include/generator.h b/include/generator.h
deleted file mode 100644
index 5afcc57..0000000
--- a/include/generator.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef __GENERATOR_H__
-#define __GENERATOR_H__
-
-#include "n64recomp.h"
-#include "operations.h"
-
-namespace N64Recomp {
-    struct InstructionContext {
-        int rd;
-        int rs;
-        int rt;
-        int sa;
-
-        int fd;
-        int fs;
-        int ft;
-
-        int cop1_cs;
-
-        uint16_t imm16;
-
-        bool reloc_tag_as_reference;
-        RelocType reloc_type;
-        uint32_t reloc_section_index;
-        uint32_t reloc_target_section_offset;
-    };
-
-    class Generator {
-    public:
-        virtual void process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const = 0;
-        virtual void process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const = 0;
-        virtual void process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const = 0;
-        virtual void emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const = 0;
-        virtual void emit_branch_close(std::ostream& output_file) const = 0;
-        virtual void emit_check_fr(std::ostream& output_file, int fpr) const = 0;
-        virtual void emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const = 0;
-    };
-
-    class CGenerator final : Generator {
-    public:
-        CGenerator() = default;
-        void process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const final;
-        void process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const final;
-        void process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const final;
-        void emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const final;
-        void emit_branch_close(std::ostream& output_file) const final;
-        void emit_check_fr(std::ostream& output_file, int fpr) const final;
-        void emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const final;
-    private:
-        void get_operand_string(Operand operand, UnaryOpType operation, const InstructionContext& context, std::string& operand_string) const;
-        void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const;
-        void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const;
-    };
-}
-
-#endif
diff --git a/include/recomp.h b/include/recomp.h
new file mode 100644
index 0000000..d291eec
--- /dev/null
+++ b/include/recomp.h
@@ -0,0 +1,397 @@
+#ifndef __RECOMP_H__
+#define __RECOMP_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <assert.h>
+#include <fenv.h>
+
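+// (Illustrative sketch, not part of this header: the recompiler emits one C function per MIPS
+// function using the recomp_func_t signature declared later in this file, marked with the
+// RECOMP_FUNC macro defined below. Names and body are hypothetical:
+//
+//     RECOMP_FUNC void func_80001234(uint8_t* rdram, recomp_context* ctx) {
+//         SET_FENV_ACCESS()
+//         gpr a0 = ctx->r4;                // read an argument register
+//         MEM_W(0, a0) = (int32_t)ctx->r5; // store a word into emulated RDRAM
+//         ctx->r2 = ADD32(a0, 4);          // write the return-value register
+//     }
+//
+// The macros referenced here are all defined below.)
+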
+// Compiler definition to disable inter-procedural optimization, allowing multiple functions to be in a single file without breaking interposition.
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+    // MSVC's __declspec(noinline) seems to disable inter-procedural optimization entirely, so it's all that's needed.
+    #define RECOMP_FUNC __declspec(noinline)
+
+    // Use MSVC's fenv_access pragma.
+    #define SET_FENV_ACCESS() _Pragma("fenv_access(on)")
+#elif defined(__clang__)
+    // Clang has no dedicated IPO attribute, so we use a combination of other attributes to give the desired behavior.
+    // The inline keyword allows multiple definitions during linking, and extern forces clang to emit an externally visible definition.
+    // Weak forces Clang to not perform any IPO as the symbol can be interposed, which prevents actual inlining due to the inline keyword.
+    // Add noinline for good measure, which doesn't conflict with the inline keyword as they have different meanings.
+    #define RECOMP_FUNC extern inline __attribute__((weak,noinline))
+
+    // Use the standard STDC FENV_ACCESS pragma.
+    #define SET_FENV_ACCESS() _Pragma("STDC FENV_ACCESS ON")
+#elif defined(__GNUC__) && !defined(__INTEL_COMPILER)
+    // Use GCC's attribute for disabling inter-procedural optimizations. Also enable the rounding-math compiler flag to disable
+    // constant folding so that arithmetic respects the floating point environment. This is needed because gcc doesn't implement
+    // any FENV_ACCESS pragma.
+    #define RECOMP_FUNC __attribute__((noipa, optimize("rounding-math")))
+
+    // There's no FENV_ACCESS pragma in gcc, so this can be empty.
+    #define SET_FENV_ACCESS()
+#else
+    #error "No RECOMP_FUNC definition for this compiler"
+#endif
+
+// Implementation of 64-bit multiply and divide instructions
+#if defined(__SIZEOF_INT128__)
+
+static inline void DMULT(int64_t a, int64_t b, int64_t* lo64, int64_t* hi64) {
+    __int128 full128 = ((__int128)a) * ((__int128)b);
+
+    *hi64 = (int64_t)(full128 >> 64);
+    *lo64 = (int64_t)(full128 >> 0);
+}
+
+static inline void DMULTU(uint64_t a, uint64_t b, uint64_t* lo64, uint64_t* hi64) {
+    unsigned __int128 full128 = ((unsigned __int128)a) * ((unsigned __int128)b);
+
+    *hi64 = (uint64_t)(full128 >> 64);
+    *lo64 = (uint64_t)(full128 >> 0);
+}
+
+#elif defined(_MSC_VER)
+
+#include <intrin.h>
+#pragma intrinsic(_mul128)
+#pragma intrinsic(_umul128)
+
+static inline void DMULT(int64_t a, int64_t b, int64_t* lo64, int64_t* hi64) {
+    *lo64 = _mul128(a, b, hi64);
+}
+
+static inline void DMULTU(uint64_t a, uint64_t b, uint64_t* lo64, uint64_t* hi64) {
+    *lo64 = _umul128(a, b, hi64);
+}
+
+#else
+#error "128-bit integer type not found"
+#endif
+
+static inline void DDIV(int64_t a, int64_t b, int64_t* quot, int64_t* rem) {
+    int overflow = ((uint64_t)a == 0x8000000000000000ull) && (b == -1ll);
+    *quot = overflow ? a : (a / b);
+    *rem = overflow ?
0 : (a % b); +} + +static inline void DDIVU(uint64_t a, uint64_t b, uint64_t* quot, uint64_t* rem) { + *quot = a / b; + *rem = a % b; +} + +typedef uint64_t gpr; + +#define SIGNED(val) \ + ((int64_t)(val)) + +#define ADD32(a, b) \ + ((gpr)(int32_t)((a) + (b))) + +#define SUB32(a, b) \ + ((gpr)(int32_t)((a) - (b))) + +#define MEM_W(offset, reg) \ + (*(int32_t*)(rdram + ((((reg) + (offset))) - 0xFFFFFFFF80000000))) + +#define MEM_H(offset, reg) \ + (*(int16_t*)(rdram + ((((reg) + (offset)) ^ 2) - 0xFFFFFFFF80000000))) + +#define MEM_B(offset, reg) \ + (*(int8_t*)(rdram + ((((reg) + (offset)) ^ 3) - 0xFFFFFFFF80000000))) + +#define MEM_HU(offset, reg) \ + (*(uint16_t*)(rdram + ((((reg) + (offset)) ^ 2) - 0xFFFFFFFF80000000))) + +#define MEM_BU(offset, reg) \ + (*(uint8_t*)(rdram + ((((reg) + (offset)) ^ 3) - 0xFFFFFFFF80000000))) + +#define SD(val, offset, reg) { \ + *(uint32_t*)(rdram + ((((reg) + (offset) + 4)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 0); \ + *(uint32_t*)(rdram + ((((reg) + (offset) + 0)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 32); \ +} + +static inline uint64_t load_doubleword(uint8_t* rdram, gpr reg, gpr offset) { + uint64_t ret = 0; + uint64_t lo = (uint64_t)(uint32_t)MEM_W(reg, offset + 4); + uint64_t hi = (uint64_t)(uint32_t)MEM_W(reg, offset + 0); + ret = (lo << 0) | (hi << 32); + return ret; +} + +#define LD(offset, reg) \ + load_doubleword(rdram, offset, reg) + +static inline gpr do_lwl(uint8_t* rdram, gpr initial_value, gpr offset, gpr reg) { + // Calculate the overall address + gpr address = (offset + reg); + + // Load the aligned word + gpr word_address = address & ~0x3; + uint32_t loaded_value = MEM_W(0, word_address); + + // Mask the existing value and shift the loaded value appropriately + gpr misalignment = address & 0x3; + gpr masked_value = initial_value & (gpr)(uint32_t)~(0xFFFFFFFFu << (misalignment * 8)); + loaded_value <<= (misalignment * 8); + + // Cast to int32_t to sign extend first + return (gpr)(int32_t)(masked_value | loaded_value); +} + +static inline gpr do_lwr(uint8_t* rdram, gpr initial_value, gpr offset, gpr reg) { + // Calculate the overall address + gpr address = (offset + reg); + + // Load the aligned word + gpr word_address = address & ~0x3; + uint32_t loaded_value = MEM_W(0, word_address); + + // Mask the existing value and shift the loaded value appropriately + gpr misalignment = address & 0x3; + gpr masked_value = initial_value & (gpr)(uint32_t)~(0xFFFFFFFFu >> (24 - misalignment * 8)); + loaded_value >>= (24 - misalignment * 8); + + // Cast to int32_t to sign extend first + return (gpr)(int32_t)(masked_value | loaded_value); +} + +static inline void do_swl(uint8_t* rdram, gpr offset, gpr reg, gpr val) { + // Calculate the overall address + gpr address = (offset + reg); + + // Get the initial value of the aligned word + gpr word_address = address & ~0x3; + uint32_t initial_value = MEM_W(0, word_address); + + // Mask the initial value and shift the input value appropriately + gpr misalignment = address & 0x3; + uint32_t masked_initial_value = initial_value & ~(0xFFFFFFFFu >> (misalignment * 8)); + uint32_t shifted_input_value = ((uint32_t)val) >> (misalignment * 8); + MEM_W(0, word_address) = masked_initial_value | shifted_input_value; +} + +static inline void do_swr(uint8_t* rdram, gpr offset, gpr reg, gpr val) { + // Calculate the overall address + gpr address = (offset + reg); + + // Get the initial value of the aligned word + gpr word_address = address & ~0x3; + uint32_t initial_value = MEM_W(0, word_address); + 
+    // Mask the initial value and shift the input value appropriately
+    gpr misalignment = address & 0x3;
+    uint32_t masked_initial_value = initial_value & ~(0xFFFFFFFFu << (24 - misalignment * 8));
+    uint32_t shifted_input_value = ((uint32_t)val) << (24 - misalignment * 8);
+    MEM_W(0, word_address) = masked_initial_value | shifted_input_value;
+}
+
+static inline uint32_t get_cop1_cs() {
+    uint32_t rounding_mode = 0;
+    switch (fegetround()) {
+        // round to nearest value
+        case FE_TONEAREST:
+        default:
+            rounding_mode = 0;
+            break;
+        // round to zero (truncate)
+        case FE_TOWARDZERO:
+            rounding_mode = 1;
+            break;
+        // round to positive infinity (ceil)
+        case FE_UPWARD:
+            rounding_mode = 2;
+            break;
+        // round to negative infinity (floor)
+        case FE_DOWNWARD:
+            rounding_mode = 3;
+            break;
+    }
+    return rounding_mode;
+}
+
+static inline void set_cop1_cs(uint32_t val) {
+    uint32_t rounding_mode = val & 0x3;
+    int round = FE_TONEAREST;
+    switch (rounding_mode) {
+        case 0: // round to nearest value
+            round = FE_TONEAREST;
+            break;
+        case 1: // round to zero (truncate)
+            round = FE_TOWARDZERO;
+            break;
+        case 2: // round to positive infinity (ceil)
+            round = FE_UPWARD;
+            break;
+        case 3: // round to negative infinity (floor)
+            round = FE_DOWNWARD;
+            break;
+    }
+    fesetround(round);
+}
+
+#define S32(val) \
+    ((int32_t)(val))
+
+#define U32(val) \
+    ((uint32_t)(val))
+
+#define S64(val) \
+    ((int64_t)(val))
+
+#define U64(val) \
+    ((uint64_t)(val))
+
+#define MUL_S(val1, val2) \
+    ((val1) * (val2))
+
+#define MUL_D(val1, val2) \
+    ((val1) * (val2))
+
+#define DIV_S(val1, val2) \
+    ((val1) / (val2))
+
+#define DIV_D(val1, val2) \
+    ((val1) / (val2))
+
+#define CVT_S_W(val) \
+    ((float)((int32_t)(val)))
+
+#define CVT_D_W(val) \
+    ((double)((int32_t)(val)))
+
+#define CVT_D_L(val) \
+    ((double)((int64_t)(val)))
+
+#define CVT_S_L(val) \
+    ((float)((int64_t)(val)))
+
+#define CVT_D_S(val) \
+    ((double)(val))
+
+#define CVT_S_D(val) \
+    ((float)(val))
+
+#define TRUNC_W_S(val) \
+    ((int32_t)(val))
+
+#define TRUNC_W_D(val) \
+    ((int32_t)(val))
+
+#define TRUNC_L_S(val) \
+    ((int64_t)(val))
+
+#define TRUNC_L_D(val) \
+    ((int64_t)(val))
+
+#define DEFAULT_ROUNDING_MODE 0
+
+static inline int32_t do_cvt_w_s(float val) {
+    // Rounding mode aware float to 32-bit int conversion.
+    return (int32_t)lrintf(val);
+}
+
+#define CVT_W_S(val) \
+    do_cvt_w_s(val)
+
+static inline int64_t do_cvt_l_s(float val) {
+    // Rounding mode aware float to 64-bit int conversion.
+    return (int64_t)llrintf(val);
+}
+
+#define CVT_L_S(val) \
+    do_cvt_l_s(val)
+
+static inline int32_t do_cvt_w_d(double val) {
+    // Rounding mode aware double to 32-bit int conversion.
+    return (int32_t)lrint(val);
+}
+
+#define CVT_W_D(val) \
+    do_cvt_w_d(val)
+
+static inline int64_t do_cvt_l_d(double val) {
+    // Rounding mode aware double to 64-bit int conversion.
+    return (int64_t)llrint(val);
+}
+
+#define CVT_L_D(val) \
+    do_cvt_l_d(val)
+
+#define NAN_CHECK(val) \
+    assert(val == val)
+
+//#define NAN_CHECK(val)
+
+typedef union {
+    double d;
+    struct {
+        float fl;
+        float fh;
+    };
+    struct {
+        uint32_t u32l;
+        uint32_t u32h;
+    };
+    uint64_t u64;
+} fpr;
+
+typedef struct {
+    gpr r0, r1, r2, r3, r4, r5, r6, r7,
+        r8, r9, r10, r11, r12, r13, r14, r15,
+        r16, r17, r18, r19, r20, r21, r22, r23,
+        r24, r25, r26, r27, r28, r29, r30, r31;
+    fpr f0, f1, f2, f3, f4, f5, f6, f7,
+        f8, f9, f10, f11, f12, f13, f14, f15,
+        f16, f17, f18, f19, f20, f21, f22, f23,
+        f24, f25, f26, f27, f28, f29, f30, f31;
+    uint64_t hi, lo;
+    uint32_t* f_odd;
+    uint32_t status_reg;
+    uint8_t mips3_float_mode;
+} recomp_context;
+
+// Checks if the target is an even float register or that mips3 float mode is enabled
+#define CHECK_FR(ctx, idx) \
+    assert(((idx) & 1) == 0 || (ctx)->mips3_float_mode)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void cop0_status_write(recomp_context* ctx, gpr value);
+gpr cop0_status_read(recomp_context* ctx);
+void switch_error(const char* func, uint32_t vram, uint32_t jtbl);
+void do_break(uint32_t vram);
+
+typedef void (recomp_func_t)(uint8_t* rdram, recomp_context* ctx);
+
+recomp_func_t* get_function(int32_t vram);
+
+#define LOOKUP_FUNC(val) \
+    get_function((int32_t)(val))
+
+extern int32_t* section_addresses;
+
+#define LO16(x) \
+    ((x) & 0xFFFF)
+
+#define HI16(x) \
+    (((x) >> 16) + (((x) >> 15) & 1))
+
+#define RELOC_HI16(section_index, offset) \
+    HI16(section_addresses[section_index] + (offset))
+
+#define RELOC_LO16(section_index, offset) \
+    LO16(section_addresses[section_index] + (offset))
+
+void recomp_syscall_handler(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram);
+
+void pause_self(uint8_t *rdram);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/n64recomp.h b/include/recompiler/context.h
similarity index 93%
rename from include/n64recomp.h
rename to include/recompiler/context.h
index c214ac7..df5224d 100644
--- a/include/n64recomp.h
+++ b/include/recompiler/context.h
@@ -36,6 +36,20 @@ namespace N64Recomp {
             : vram(vram), rom(rom), words(std::move(words)), name(std::move(name)), section_index(section_index), ignored(ignored), reimplemented(reimplemented), stubbed(stubbed) {}
         Function() = default;
     };
+
+    struct JumpTable {
+        uint32_t vram;
+        uint32_t addend_reg;
+        uint32_t rom;
+        uint32_t lw_vram;
+        uint32_t addu_vram;
+        uint32_t jr_vram;
+        uint16_t section_index;
+        std::vector<uint32_t> entries;
+
+        JumpTable(uint32_t vram, uint32_t addend_reg, uint32_t rom, uint32_t lw_vram, uint32_t addu_vram, uint32_t jr_vram, uint16_t section_index, std::vector<uint32_t>&& entries)
+            : vram(vram), addend_reg(addend_reg), rom(rom), lw_vram(lw_vram), addu_vram(addu_vram), jr_vram(jr_vram), section_index(section_index), entries(std::move(entries)) {}
+    };
 
     enum class RelocType : uint8_t {
         R_MIPS_NONE = 0,
@@ -175,6 +189,8 @@
         std::vector<ReferenceSymbol> reference_symbols;
         // Mapping of symbol name to reference symbol index.
         std::unordered_map<std::string, SymbolReference> reference_symbols_by_name;
+        // Whether all reference sections should be treated as relocatable (used in live recompilation).
+        bool all_reference_sections_relocatable = false;
     public:
         std::vector<Section> sections;
         std::vector<Function> functions;
@@ -187,6 +203,8 @@ namespace N64Recomp {
         // The target ROM being recompiled, TODO move this outside of the context to avoid making a copy for mod contexts.
         // Used for reading relocations and for the output binary feature.
         std::vector<uint8_t> rom;
+        // Whether reference symbols should be validated when emitting function calls during recompilation.
+        bool skip_validating_reference_symbols = true;
 
         //// Only used by the CLI, TODO move this to a struct in the internal headers.
         // A mapping of function name to index in the functions vector
@@ -359,6 +377,9 @@
         }
 
         bool is_reference_section_relocatable(uint16_t section_index) const {
+            if (all_reference_sections_relocatable) {
+                return true;
+            }
             if (section_index == SectionAbsolute) {
                 return false;
             }
@@ -518,9 +539,15 @@
         void copy_reference_sections_from(const Context& rhs) {
             reference_sections = rhs.reference_sections;
         }
+
+        void set_all_reference_sections_relocatable() {
+            all_reference_sections_relocatable = true;
+        }
     };
 
-    bool recompile_function(const Context& context, const Function& func, std::ofstream& output_file, std::span<std::vector<uint32_t>> static_funcs, bool tag_reference_relocs);
+    class Generator;
+    bool recompile_function(const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs, bool tag_reference_relocs);
+    bool recompile_function_custom(Generator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs);
 
     enum class ModSymbolsError {
         Good,
diff --git a/include/recompiler/generator.h b/include/recompiler/generator.h
new file mode 100644
index 0000000..0ffde0b
--- /dev/null
+++ b/include/recompiler/generator.h
@@ -0,0 +1,109 @@
+#ifndef __GENERATOR_H__
+#define __GENERATOR_H__
+
+#include "recompiler/context.h"
+#include "operations.h"
+
+namespace N64Recomp {
+    struct InstructionContext {
+        int rd;
+        int rs;
+        int rt;
+        int sa;
+
+        int fd;
+        int fs;
+        int ft;
+
+        int cop1_cs;
+
+        uint16_t imm16;
+
+        bool reloc_tag_as_reference;
+        RelocType reloc_type;
+        uint32_t reloc_section_index;
+        uint32_t reloc_target_section_offset;
+    };
+
+    class Generator {
+    public:
+        virtual void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const = 0;
+        virtual void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const = 0;
+        virtual void process_store_op(const StoreOp& op, const InstructionContext& ctx) const = 0;
+        virtual void emit_function_start(const std::string& function_name, size_t func_index) const = 0;
+        virtual void emit_function_end() const = 0;
+        virtual void emit_function_call_lookup(uint32_t addr) const = 0;
+        virtual void emit_function_call_by_register(int reg) const = 0;
+        // section_index and target_section_offset can each be deduced from symbol_index if the full context is available,
+        // but for live recompilation the reference symbol list is unavailable so it's still provided.
+ virtual void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const = 0; + virtual void emit_function_call(const Context& context, size_t function_index) const = 0; + virtual void emit_named_function_call(const std::string& function_name) const = 0; + virtual void emit_goto(const std::string& target) const = 0; + virtual void emit_label(const std::string& label_name) const = 0; + virtual void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const = 0; + virtual void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const = 0; + virtual void emit_branch_close() const = 0; + virtual void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const = 0; + virtual void emit_case(int case_index, const std::string& target_label) const = 0; + virtual void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const = 0; + virtual void emit_switch_close() const = 0; + virtual void emit_return() const = 0; + virtual void emit_check_fr(int fpr) const = 0; + virtual void emit_check_nan(int fpr, bool is_double) const = 0; + virtual void emit_cop0_status_read(int reg) const = 0; + virtual void emit_cop0_status_write(int reg) const = 0; + virtual void emit_cop1_cs_read(int reg) const = 0; + virtual void emit_cop1_cs_write(int reg) const = 0; + virtual void emit_muldiv(InstrId instr_id, int reg1, int reg2) const = 0; + virtual void emit_syscall(uint32_t instr_vram) const = 0; + virtual void emit_do_break(uint32_t instr_vram) const = 0; + virtual void emit_pause_self() const = 0; + virtual void emit_trigger_event(uint32_t event_index) const = 0; + virtual void emit_comment(const std::string& comment) const = 0; + }; + + class CGenerator final : Generator { + public: + CGenerator(std::ostream& output_file) : output_file(output_file) {}; + void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const final; + void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const final; + void process_store_op(const StoreOp& op, const InstructionContext& ctx) const final; + void emit_function_start(const std::string& function_name, size_t func_index) const final; + void emit_function_end() const final; + void emit_function_call_lookup(uint32_t addr) const final; + void emit_function_call_by_register(int reg) const final; + void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const final; + void emit_function_call(const Context& context, size_t function_index) const final; + void emit_named_function_call(const std::string& function_name) const final; + void emit_goto(const std::string& target) const final; + void emit_label(const std::string& label_name) const final; + void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const final; + void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const final; + void emit_branch_close() const final; + void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const final; + void emit_case(int case_index, const std::string& target_label) const final; + void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const final; + void emit_switch_close() const final; + void emit_return() const final; + void emit_check_fr(int fpr) const final; + void emit_check_nan(int fpr, bool is_double) const final; + void 
emit_cop0_status_read(int reg) const final;
+        void emit_cop0_status_write(int reg) const final;
+        void emit_cop1_cs_read(int reg) const final;
+        void emit_cop1_cs_write(int reg) const final;
+        void emit_muldiv(InstrId instr_id, int reg1, int reg2) const final;
+        void emit_syscall(uint32_t instr_vram) const final;
+        void emit_do_break(uint32_t instr_vram) const final;
+        void emit_pause_self() const final;
+        void emit_trigger_event(uint32_t event_index) const final;
+        void emit_comment(const std::string& comment) const final;
+    private:
+        void get_operand_string(Operand operand, UnaryOpType operation, const InstructionContext& context, std::string& operand_string) const;
+        void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const;
+        void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const;
+        std::ostream& output_file;
+    };
+}
+
+#endif
diff --git a/include/recompiler/live_recompiler.h b/include/recompiler/live_recompiler.h
new file mode 100644
index 0000000..1b92d95
--- /dev/null
+++ b/include/recompiler/live_recompiler.h
@@ -0,0 +1,141 @@
+#ifndef __LIVE_RECOMPILER_H__
+#define __LIVE_RECOMPILER_H__
+
+#include <unordered_map>
+#include "recompiler/generator.h"
+#include "recomp.h"
+
+struct sljit_compiler;
+
+namespace N64Recomp {
+    struct LiveGeneratorContext;
+    struct ReferenceJumpDetails {
+        uint16_t section;
+        uint32_t section_offset;
+    };
+    struct LiveGeneratorOutput {
+        LiveGeneratorOutput() = default;
+        LiveGeneratorOutput(const LiveGeneratorOutput& rhs) = delete;
+        LiveGeneratorOutput(LiveGeneratorOutput&& rhs) { *this = std::move(rhs); }
+        LiveGeneratorOutput& operator=(const LiveGeneratorOutput& rhs) = delete;
+        LiveGeneratorOutput& operator=(LiveGeneratorOutput&& rhs) {
+            good = rhs.good;
+            string_literals = std::move(rhs.string_literals);
+            jump_tables = std::move(rhs.jump_tables);
+            code = rhs.code;
+            code_size = rhs.code_size;
+            functions = std::move(rhs.functions);
+            reference_symbol_jumps = std::move(rhs.reference_symbol_jumps);
+            import_jumps_by_index = std::move(rhs.import_jumps_by_index);
+            executable_offset = rhs.executable_offset;
+
+            rhs.good = false;
+            rhs.code = nullptr;
+            rhs.code_size = 0;
+            rhs.reference_symbol_jumps.clear();
+            rhs.executable_offset = 0;
+
+            return *this;
+        }
+        ~LiveGeneratorOutput();
+        size_t num_reference_symbol_jumps() const;
+        void set_reference_symbol_jump(size_t jump_index, recomp_func_t* func);
+        ReferenceJumpDetails get_reference_symbol_jump_details(size_t jump_index);
+        void populate_import_symbol_jumps(size_t import_index, recomp_func_t* func);
+        bool good = false;
+        // Storage for string literals referenced by recompiled code. These are allocated as unique_ptr arrays
+        // to prevent them from moving, as the referenced address is baked into the recompiled code.
+        std::vector<std::unique_ptr<char[]>> string_literals;
+        // Storage for jump tables referenced by recompiled code (vector of arrays of pointers). These are also
+        // allocated as unique_ptr arrays for the same reason as strings.
+        std::vector<std::unique_ptr<void*[]>> jump_tables;
+        // Recompiled code.
+        void* code;
+        // Size of the recompiled code.
+        size_t code_size;
+        // Pointers to each individual function within the recompiled code.
+        std::vector<recomp_func_t*> functions;
+    private:
+        // List of jump details and the corresponding jump instruction address. These jumps get populated after recompilation is complete
+        // during dependency resolution.
+        std::vector<std::pair<ReferenceJumpDetails, void*>> reference_symbol_jumps;
+        // Mapping of import symbol index to any jumps to that import symbol.
+        std::unordered_multimap<size_t, void*> import_jumps_by_index;
+        // sljit executable offset.
+        int64_t executable_offset;
+
+        friend class LiveGenerator;
+    };
+    struct LiveGeneratorInputs {
+        uint32_t base_event_index;
+        void (*cop0_status_write)(recomp_context* ctx, gpr value);
+        gpr (*cop0_status_read)(recomp_context* ctx);
+        void (*switch_error)(const char* func, uint32_t vram, uint32_t jtbl);
+        void (*do_break)(uint32_t vram);
+        recomp_func_t* (*get_function)(int32_t vram);
+        void (*syscall_handler)(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram);
+        void (*pause_self)(uint8_t* rdram);
+        void (*trigger_event)(uint8_t* rdram, recomp_context* ctx, uint32_t event_index);
+        int32_t *reference_section_addresses;
+        int32_t *local_section_addresses;
+    };
+    class LiveGenerator final : public Generator {
+    public:
+        LiveGenerator(size_t num_funcs, const LiveGeneratorInputs& inputs);
+        ~LiveGenerator();
+        // Prevent moving or copying.
+        LiveGenerator(const LiveGenerator& rhs) = delete;
+        LiveGenerator(LiveGenerator&& rhs) = delete;
+        LiveGenerator& operator=(const LiveGenerator& rhs) = delete;
+        LiveGenerator& operator=(LiveGenerator&& rhs) = delete;
+
+        LiveGeneratorOutput finish();
+        void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const final;
+        void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const final;
+        void process_store_op(const StoreOp& op, const InstructionContext& ctx) const final;
+        void emit_function_start(const std::string& function_name, size_t func_index) const final;
+        void emit_function_end() const final;
+        void emit_function_call_lookup(uint32_t addr) const final;
+        void emit_function_call_by_register(int reg) const final;
+        void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const final;
+        void emit_function_call(const Context& context, size_t function_index) const final;
+        void emit_named_function_call(const std::string& function_name) const final;
+        void emit_goto(const std::string& target) const final;
+        void emit_label(const std::string& label_name) const final;
+        void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const final;
+        void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const final;
+        void emit_branch_close() const final;
+        void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const final;
+        void emit_case(int case_index, const std::string& target_label) const final;
+        void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const final;
+        void emit_switch_close() const final;
+        void emit_return() const final;
+        void emit_check_fr(int fpr) const final;
+        void emit_check_nan(int fpr, bool is_double) const final;
+        void emit_cop0_status_read(int reg) const final;
+        void emit_cop0_status_write(int reg) const final;
+        void emit_cop1_cs_read(int reg) const final;
+        void emit_cop1_cs_write(int reg) const final;
+        void emit_muldiv(InstrId instr_id, int reg1, int reg2) const final;
+        void emit_syscall(uint32_t instr_vram) const final;
+        void emit_do_break(uint32_t instr_vram) const final;
+        void emit_pause_self() const final;
+        void emit_trigger_event(uint32_t event_index) const final;
+        void emit_comment(const std::string& comment) const final;
+    private:
+        void get_operand_string(Operand operand, UnaryOpType operation, const
InstructionContext& context, std::string& operand_string) const;
+        void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const;
+        void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const;
+        // Loads the relocated address specified by the instruction context into the target register.
+        void load_relocated_address(const InstructionContext& ctx, int reg) const;
+        sljit_compiler* compiler;
+        LiveGeneratorInputs inputs;
+        mutable std::unique_ptr<LiveGeneratorContext> context;
+        mutable bool errored;
+    };
+
+    void live_recompiler_init();
+    bool recompile_function_live(LiveGenerator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs);
+}
+
+#endif
\ No newline at end of file
diff --git a/include/operations.h b/include/recompiler/operations.h
similarity index 92%
rename from include/operations.h
rename to include/recompiler/operations.h
index 5cb407e..65f2ed7 100644
--- a/include/operations.h
+++ b/include/recompiler/operations.h
@@ -28,13 +28,12 @@ namespace N64Recomp {
         ToU32,
         ToS64,
         ToU64,
-        NegateS32,
-        NegateS64,
         Lui,
         Mask5, // Mask to 5 bits
         Mask6, // Mask to 6 bits
         ToInt32, // Functionally equivalent to ToS32, only exists for parity with old codegen
-        Negate,
+        NegateFloat,
+        NegateDouble,
         AbsFloat,
         AbsDouble,
         SqrtFloat,
@@ -51,12 +50,20 @@
         ConvertLFromS,
         TruncateWFromS,
         TruncateWFromD,
+        TruncateLFromS,
+        TruncateLFromD,
         RoundWFromS,
         RoundWFromD,
+        RoundLFromS,
+        RoundLFromD,
         CeilWFromS,
         CeilWFromD,
+        CeilLFromS,
+        CeilLFromD,
         FloorWFromS,
-        FloorWFromD
+        FloorWFromD,
+        FloorLFromS,
+        FloorLFromD
     };
 
     enum class BinaryOpType {
@@ -92,6 +99,12 @@
         LessEq,
         Greater,
         GreaterEq,
+        EqualFloat,
+        LessFloat,
+        LessEqFloat,
+        EqualDouble,
+        LessDouble,
+        LessEqDouble,
         // Loads
         LD,
         LW,
diff --git a/lib/sljit b/lib/sljit
new file mode 160000
index 0000000..f632608
--- /dev/null
+++ b/lib/sljit
@@ -0,0 +1 @@
+Subproject commit f6326087b3404efb07c6d3deed97b3c3b8098c0c
diff --git a/src/analysis.cpp b/src/analysis.cpp
index 5dfd955..92a421e 100644
--- a/src/analysis.cpp
+++ b/src/analysis.cpp
@@ -4,7 +4,7 @@
 #include "rabbitizer.hpp"
 #include "fmt/format.h"
 
-#include "n64recomp.h"
+#include "recompiler/context.h"
 #include "analysis.h"
 
 extern "C" const char* RabbitizerRegister_getNameGpr(uint8_t regValue);
@@ -194,21 +194,11 @@ bool analyze_instruction(const rabbitizer::InstructionCpu& instr, const N64Recom
                     reg_states[rs].loaded_lw_vram,
                     reg_states[rs].loaded_addu_vram,
                     instr.getVram(),
+                    0, // section index gets filled in later
                     std::vector<uint32_t>{}
                 );
-            } else if (reg_states[rs].valid_lui && reg_states[rs].valid_addiu && !reg_states[rs].valid_addend && !reg_states[rs].valid_loaded) {
-                uint32_t address = reg_states[rs].prev_addiu_vram + reg_states[rs].prev_lui;
-                stats.absolute_jumps.emplace_back(
-                    address,
-                    instr.getVram()
-                );
-            }
-            // Allow tail calls (TODO account for trailing nops due to bad function splits)
-            else if (instr.getVram() != func.vram + (func.words.size() - 2) * sizeof(func.words[0])) {
-                // Inconclusive analysis
-                fmt::print(stderr, "Failed to to find jump table for `jr {}` at 0x{:08X} in {}\n", RabbitizerRegister_getNameGpr(rs), instr.getVram(), func.name);
-                return false;
             }
+            // TODO stricter validation on tail calls, since not all indirect jumps can be treated as one.
break; default: if (instr.modifiesRd()) { @@ -256,6 +246,7 @@ bool N64Recomp::analyze_function(const N64Recomp::Context& context, const N64Rec // TODO this assumes that the jump table is in the same section as the function itself cur_jtbl.rom = cur_jtbl.vram + func.rom - func.vram; + cur_jtbl.section_index = func.section_index; while (vram < end_address) { // Retrieve the current entry of the jump table diff --git a/src/analysis.h b/src/analysis.h index eafd1e7..9e0562e 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -4,22 +4,9 @@ #include #include -#include "n64recomp.h" +#include "recompiler/context.h" namespace N64Recomp { - struct JumpTable { - uint32_t vram; - uint32_t addend_reg; - uint32_t rom; - uint32_t lw_vram; - uint32_t addu_vram; - uint32_t jr_vram; - std::vector entries; - - JumpTable(uint32_t vram, uint32_t addend_reg, uint32_t rom, uint32_t lw_vram, uint32_t addu_vram, uint32_t jr_vram, std::vector&& entries) - : vram(vram), addend_reg(addend_reg), rom(rom), lw_vram(lw_vram), addu_vram(addu_vram), jr_vram(jr_vram), entries(std::move(entries)) {} - }; - struct AbsoluteJump { uint32_t jump_target; uint32_t instruction_vram; @@ -29,7 +16,6 @@ namespace N64Recomp { struct FunctionStats { std::vector jump_tables; - std::vector absolute_jumps; }; bool analyze_function(const Context& context, const Function& function, const std::vector& instructions, FunctionStats& stats); diff --git a/src/cgenerator.cpp b/src/cgenerator.cpp index 7751568..596ad60 100644 --- a/src/cgenerator.cpp +++ b/src/cgenerator.cpp @@ -4,11 +4,11 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "generator.h" +#include "recompiler/generator.h" struct BinaryOpFields { std::string func_string; std::string infix_string; }; -std::vector c_op_fields = []() { +static std::vector c_op_fields = []() { std::vector ret{}; ret.resize(static_cast(N64Recomp::BinaryOpType::COUNT)); std::vector ops_setup{}; @@ -45,9 +45,15 @@ std::vector c_op_fields = []() { setup_op(N64Recomp::BinaryOpType::Sra32, "S32", ">>"); // Arithmetic aspect will be taken care of by unary op for first operand. setup_op(N64Recomp::BinaryOpType::Sra64, "", ">>"); // Arithmetic aspect will be taken care of by unary op for first operand. 
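// Worked example of how this table is consumed (operand values hypothetical): for the
// float comparisons added just below, e.g. LessFloat with infix_string "<", a
// Cop1cs-destined compare of $f12 and $f14 goes through the "{} {} {} ? 1 : 0" pattern
// in get_binary_expr_string, producing approximately:
c1cs = ctx->f12.fl < ctx->f14.fl ? 1 : 0;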
setup_op(N64Recomp::BinaryOpType::Equal, "", "=="); + setup_op(N64Recomp::BinaryOpType::EqualFloat,"", "=="); + setup_op(N64Recomp::BinaryOpType::EqualDouble,"", "=="); setup_op(N64Recomp::BinaryOpType::NotEqual, "", "!="); setup_op(N64Recomp::BinaryOpType::Less, "", "<"); + setup_op(N64Recomp::BinaryOpType::LessFloat, "", "<"); + setup_op(N64Recomp::BinaryOpType::LessDouble,"", "<"); setup_op(N64Recomp::BinaryOpType::LessEq, "", "<="); + setup_op(N64Recomp::BinaryOpType::LessEqFloat,"", "<="); + setup_op(N64Recomp::BinaryOpType::LessEqDouble,"", "<="); setup_op(N64Recomp::BinaryOpType::Greater, "", ">"); setup_op(N64Recomp::BinaryOpType::GreaterEq, "", ">="); setup_op(N64Recomp::BinaryOpType::LD, "LD", ""); @@ -72,22 +78,22 @@ std::vector c_op_fields = []() { return ret; }(); -std::string gpr_to_string(int gpr_index) { +static std::string gpr_to_string(int gpr_index) { if (gpr_index == 0) { return "0"; } return fmt::format("ctx->r{}", gpr_index); } -std::string fpr_to_string(int fpr_index) { +static std::string fpr_to_string(int fpr_index) { return fmt::format("ctx->f{}.fl", fpr_index); } -std::string fpr_double_to_string(int fpr_index) { +static std::string fpr_double_to_string(int fpr_index) { return fmt::format("ctx->f{}.d", fpr_index); } -std::string fpr_u32l_to_string(int fpr_index) { +static std::string fpr_u32l_to_string(int fpr_index) { if (fpr_index & 1) { return fmt::format("ctx->f_odd[({} - 1) * 2]", fpr_index); } @@ -96,11 +102,11 @@ std::string fpr_u32l_to_string(int fpr_index) { } } -std::string fpr_u64_to_string(int fpr_index) { +static std::string fpr_u64_to_string(int fpr_index) { return fmt::format("ctx->f{}.u64", fpr_index); } -std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { +static std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { switch (context.reloc_type) { case N64Recomp::RelocType::R_MIPS_HI16: return fmt::format("{}RELOC_HI16({}, {:#X})", @@ -113,7 +119,7 @@ std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { } } -std::string signed_reloc(const N64Recomp::InstructionContext& context) { +static std::string signed_reloc(const N64Recomp::InstructionContext& context) { return "(int16_t)" + unsigned_reloc(context); } @@ -223,12 +229,6 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::ToU64: // Nothing to do here, they're already U64 break; - case UnaryOpType::NegateS32: - assert(false); - break; - case UnaryOpType::NegateS64: - assert(false); - break; case UnaryOpType::Lui: operand_string = "S32(" + operand_string + " << 16)"; break; @@ -241,7 +241,10 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::ToInt32: operand_string = "(int32_t)" + operand_string; break; - case UnaryOpType::Negate: + case UnaryOpType::NegateFloat: + operand_string = "-" + operand_string; + break; + case UnaryOpType::NegateDouble: operand_string = "-" + operand_string; break; case UnaryOpType::AbsFloat: @@ -292,24 +295,48 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::TruncateWFromD: operand_string = "TRUNC_W_D(" + operand_string + ")"; break; + case UnaryOpType::TruncateLFromS: + operand_string = "TRUNC_L_S(" + operand_string + ")"; + break; + case UnaryOpType::TruncateLFromD: + operand_string = "TRUNC_L_D(" + operand_string + ")"; + break; case UnaryOpType::RoundWFromS: operand_string = "lroundf(" + operand_string + ")"; break; case UnaryOpType::RoundWFromD: 
operand_string = "lround(" + operand_string + ")"; break; + case UnaryOpType::RoundLFromS: + operand_string = "llroundf(" + operand_string + ")"; + break; + case UnaryOpType::RoundLFromD: + operand_string = "llround(" + operand_string + ")"; + break; case UnaryOpType::CeilWFromS: operand_string = "S32(ceilf(" + operand_string + "))"; break; case UnaryOpType::CeilWFromD: operand_string = "S32(ceil(" + operand_string + "))"; break; + case UnaryOpType::CeilLFromS: + operand_string = "S64(ceilf(" + operand_string + "))"; + break; + case UnaryOpType::CeilLFromD: + operand_string = "S64(ceil(" + operand_string + "))"; + break; case UnaryOpType::FloorWFromS: operand_string = "S32(floorf(" + operand_string + "))"; break; case UnaryOpType::FloorWFromD: operand_string = "S32(floor(" + operand_string + "))"; break; + case UnaryOpType::FloorLFromS: + operand_string = "S64(floorf(" + operand_string + "))"; + break; + case UnaryOpType::FloorLFromD: + operand_string = "S64(floor(" + operand_string + "))"; + break; } } @@ -333,10 +360,10 @@ void N64Recomp::CGenerator::get_binary_expr_string(BinaryOpType type, const Bina expr_string = fmt::format("{} {} {} ? 1 : 0", input_a, infix_string, input_b); } else if (type == BinaryOpType::Equal && operands.operands[1] == Operand::Zero && operands.operand_operations[1] == UnaryOpType::None) { - expr_string = input_a; + expr_string = "!" + input_a; } else if (type == BinaryOpType::NotEqual && operands.operands[1] == Operand::Zero && operands.operand_operations[1] == UnaryOpType::None) { - expr_string = "!" + input_a; + expr_string = input_a; } // End unnecessary cases. @@ -365,7 +392,57 @@ void N64Recomp::CGenerator::get_binary_expr_string(BinaryOpType type, const Bina } } -void N64Recomp::CGenerator::emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::emit_function_start(const std::string& function_name, size_t func_index) const { + fmt::print(output_file, + "RECOMP_FUNC void {}(uint8_t* rdram, recomp_context* ctx) {{\n" + // these variables shouldn't need to be preserved across function boundaries, so make them local for more efficient output + " uint64_t hi = 0, lo = 0, result = 0;\n" + " int c1cs = 0;\n", // cop1 conditional signal + function_name); +} + +void N64Recomp::CGenerator::emit_function_end() const { + fmt::print(output_file, ";}}\n"); +} + +void N64Recomp::CGenerator::emit_function_call_lookup(uint32_t addr) const { + fmt::print(output_file, "LOOKUP_FUNC(0x{:08X})(rdram, ctx);\n", addr); +} + +void N64Recomp::CGenerator::emit_function_call_by_register(int reg) const { + fmt::print(output_file, "LOOKUP_FUNC({})(rdram, ctx);\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const { + (void)target_section_offset; + const N64Recomp::ReferenceSymbol& sym = context.get_reference_symbol(section_index, symbol_index); + fmt::print(output_file, "{}(rdram, ctx);\n", sym.name); +} + +void N64Recomp::CGenerator::emit_function_call(const Context& context, size_t function_index) const { + fmt::print(output_file, "{}(rdram, ctx);\n", context.functions[function_index].name); +} + +void N64Recomp::CGenerator::emit_named_function_call(const std::string& function_name) const { + fmt::print(output_file, "{}(rdram, ctx);\n", function_name); +} + +void N64Recomp::CGenerator::emit_goto(const std::string& target) const { + 
fmt::print(output_file, + " goto {};\n", target); +} + +void N64Recomp::CGenerator::emit_label(const std::string& label_name) const { + fmt::print(output_file, + "{}:\n", label_name); +} + +void N64Recomp::CGenerator::emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const { + std::string jump_variable = fmt::format("jr_addend_{:08X}", jtbl.jr_vram); + fmt::print(output_file, "gpr {} = {};\n", jump_variable, gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string expr_string{}; @@ -373,19 +450,114 @@ void N64Recomp::CGenerator::emit_branch_condition(std::ostream& output_file, con fmt::print(output_file, "if ({}) {{\n", expr_string); } -void N64Recomp::CGenerator::emit_branch_close(std::ostream& output_file) const { - fmt::print(output_file, " }}\n"); +void N64Recomp::CGenerator::emit_branch_close() const { + fmt::print(output_file, "}}\n"); } -void N64Recomp::CGenerator::emit_check_fr(std::ostream& output_file, int fpr) const { +void N64Recomp::CGenerator::emit_switch_close() const { + fmt::print(output_file, "}}\n"); +} + +void N64Recomp::CGenerator::emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const { + (void)recompiler_context; + (void)reg; + // TODO generate code to subtract the jump table address from the register's value instead. + // Once that's done, the addend temp can be deleted to simplify the generator interface. + std::string jump_variable = fmt::format("jr_addend_{:08X}", jtbl.jr_vram); + + fmt::print(output_file, "switch ({} >> 2) {{\n", jump_variable); +} + +void N64Recomp::CGenerator::emit_case(int case_index, const std::string& target_label) const { + fmt::print(output_file, "case {}: goto {}; break;\n", case_index, target_label); +} + +void N64Recomp::CGenerator::emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const { + fmt::print(output_file, "default: switch_error(__func__, 0x{:08X}, 0x{:08X});\n", instr_vram, jtbl_vram); +} + +void N64Recomp::CGenerator::emit_return() const { + fmt::print(output_file, "return;\n"); +} + +void N64Recomp::CGenerator::emit_check_fr(int fpr) const { fmt::print(output_file, "CHECK_FR(ctx, {});\n ", fpr); } -void N64Recomp::CGenerator::emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const { +void N64Recomp::CGenerator::emit_check_nan(int fpr, bool is_double) const { fmt::print(output_file, "NAN_CHECK(ctx->f{}.{}); ", fpr, is_double ? 
"d" : "fl"); } -void N64Recomp::CGenerator::process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::emit_cop0_status_read(int reg) const { + fmt::print(output_file, "{} = cop0_status_read(ctx);\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop0_status_write(int reg) const { + fmt::print(output_file, "cop0_status_write(ctx, {});", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop1_cs_read(int reg) const { + fmt::print(output_file, "{} = get_cop1_cs();\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop1_cs_write(int reg) const { + fmt::print(output_file, "set_cop1_cs({});\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_muldiv(InstrId instr_id, int reg1, int reg2) const { + switch (instr_id) { + case InstrId::cpu_mult: + fmt::print(output_file, "result = S64(S32({})) * S64(S32({})); lo = S32(result >> 0); hi = S32(result >> 32);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_dmult: + fmt::print(output_file, "DMULT(S64({}), S64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_multu: + fmt::print(output_file, "result = U64(U32({})) * U64(U32({})); lo = S32(result >> 0); hi = S32(result >> 32);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_dmultu: + fmt::print(output_file, "DMULTU(U64({}), U64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_div: + // Cast to 64-bits before division to prevent artihmetic exception for s32(0x80000000) / -1 + fmt::print(output_file, "lo = S32(S64(S32({0})) / S64(S32({1}))); hi = S32(S64(S32({0})) % S64(S32({1})));\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_ddiv: + fmt::print(output_file, "DDIV(S64({}), S64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_divu: + fmt::print(output_file, "lo = S32(U32({0}) / U32({1})); hi = S32(U32({0}) % U32({1}));\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_ddivu: + fmt::print(output_file, "DDIVU(U64({}), U64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + default: + assert(false); + break; + } +} + +void N64Recomp::CGenerator::emit_syscall(uint32_t instr_vram) const { + fmt::print(output_file, "recomp_syscall_handler(rdram, ctx, 0x{:08X});\n", instr_vram); +} + +void N64Recomp::CGenerator::emit_do_break(uint32_t instr_vram) const { + fmt::print(output_file, "do_break({});\n", instr_vram); +} + +void N64Recomp::CGenerator::emit_pause_self() const { + fmt::print(output_file, "pause_self(rdram);\n"); +} + +void N64Recomp::CGenerator::emit_trigger_event(uint32_t event_index) const { + fmt::print(output_file, "recomp_trigger_event(rdram, ctx, base_event_index + {});\n", event_index); +} + +void N64Recomp::CGenerator::emit_comment(const std::string& comment) const { + fmt::print(output_file, "// {}\n", comment); +} + +void N64Recomp::CGenerator::process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. 
thread_local std::string output{}; @@ -395,7 +567,7 @@ void N64Recomp::CGenerator::process_binary_op(std::ostream& output_file, const B fmt::print(output_file, "{} = {};\n", output, expression); } -void N64Recomp::CGenerator::process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string output{}; @@ -406,7 +578,7 @@ void N64Recomp::CGenerator::process_unary_op(std::ostream& output_file, const Un fmt::print(output_file, "{} = {};\n", output, input); } -void N64Recomp::CGenerator::process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::process_store_op(const StoreOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string base_str{}; diff --git a/src/config.cpp b/src/config.cpp index d3b236f..f191ba5 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -3,7 +3,7 @@ #include #include "fmt/format.h" #include "config.h" -#include "n64recomp.h" +#include "recompiler/context.h" std::filesystem::path concat_if_not_empty(const std::filesystem::path& parent, const std::filesystem::path& child) { if (!child.empty()) { @@ -375,7 +375,7 @@ N64Recomp::Config::Config(const char* path) { recomp_include = recomp_include_opt.value(); } else { - recomp_include = "#include \"librecomp/recomp.h\""; + recomp_include = "#include \"recomp.h\""; } std::optional funcs_per_file_opt = input_data["functions_per_output_file"].value(); diff --git a/src/elf.cpp b/src/elf.cpp index a18fdbd..d83908c 100644 --- a/src/elf.cpp +++ b/src/elf.cpp @@ -3,7 +3,7 @@ #include "fmt/format.h" // #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "elfio/elfio.hpp" bool read_symbols(N64Recomp::Context& context, const ELFIO::elfio& elf_file, ELFIO::section* symtab_section, const N64Recomp::ElfParsingConfig& elf_config, bool dumping_context, std::unordered_map>& data_syms) { diff --git a/src/main.cpp b/src/main.cpp index a2ccdc1..8a8fe91 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,7 +9,7 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "config.h" #include @@ -111,7 +111,7 @@ bool compare_files(const std::filesystem::path& file1_path, const std::filesyste return std::equal(begin1, std::istreambuf_iterator(), begin2); //Second argument is end-of-range iterator } -bool recompile_single_function(const N64Recomp::Context& context, const N64Recomp::Function& func, const std::string& recomp_include, const std::filesystem::path& output_path, std::span> static_funcs_out) { +bool recompile_single_function(const N64Recomp::Context& context, size_t func_index, const std::string& recomp_include, const std::filesystem::path& output_path, std::span> static_funcs_out) { // Open the temporary output file std::filesystem::path temp_path = output_path; temp_path.replace_extension(".tmp"); @@ -127,7 +127,7 @@ bool recompile_single_function(const N64Recomp::Context& context, const N64Recom "\n", recomp_include); - if 
(!N64Recomp::recompile_function(context, func, output_file, static_funcs_out, false)) { + if (!N64Recomp::recompile_function(context, func_index, output_file, static_funcs_out, false)) { return false; } @@ -725,7 +725,7 @@ int main(int argc, char** argv) { // Recompile the function. if (config.single_file_output || config.functions_per_output_file > 1) { - result = N64Recomp::recompile_function(context, func, current_output_file, static_funcs_by_section, false); + result = N64Recomp::recompile_function(context, i, current_output_file, static_funcs_by_section, false); if (!config.single_file_output) { cur_file_function_count++; if (cur_file_function_count >= config.functions_per_output_file) { @@ -734,7 +734,7 @@ int main(int argc, char** argv) { } } else { - result = recompile_single_function(context, func, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); + result = recompile_single_function(context, i, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); } if (result == false) { fmt::print(stderr, "Error recompiling {}\n", func.name); @@ -797,22 +797,25 @@ int main(int argc, char** argv) { std::vector insn_words((cur_func_end - static_func_addr) / sizeof(uint32_t)); insn_words.assign(func_rom_start, func_rom_start + insn_words.size()); - N64Recomp::Function func { + // Create the new function and add it to the context. + size_t new_func_index = context.functions.size(); + context.functions.emplace_back( static_func_addr, rom_addr, std::move(insn_words), fmt::format("static_{}_{:08X}", section_index, static_func_addr), static_cast(section_index), false - }; + ); + const N64Recomp::Function& new_func = context.functions[new_func_index]; fmt::print(func_header_file, - "void {}(uint8_t* rdram, recomp_context* ctx);\n", func.name); + "void {}(uint8_t* rdram, recomp_context* ctx);\n", new_func.name); bool result; - size_t prev_num_statics = static_funcs_by_section[func.section_index].size(); + size_t prev_num_statics = static_funcs_by_section[new_func.section_index].size(); if (config.single_file_output || config.functions_per_output_file > 1) { - result = N64Recomp::recompile_function(context, func, current_output_file, static_funcs_by_section, false); + result = N64Recomp::recompile_function(context, new_func_index, current_output_file, static_funcs_by_section, false); if (!config.single_file_output) { cur_file_function_count++; if (cur_file_function_count >= config.functions_per_output_file) { @@ -821,14 +824,14 @@ int main(int argc, char** argv) { } } else { - result = recompile_single_function(context, func, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); + result = recompile_single_function(context, new_func_index, config.recomp_include, config.output_func_path / (new_func.name + ".c"), static_funcs_by_section); } // Add any new static functions that were found while recompiling this one. 
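// Background for the index-based signature above (a sketch of the rationale, not from
// the patch itself): generators can now identify a callee by its position in
// context.functions - see emit_function_call(context, function_index) in the generator
// interface - so newly synthesized statics are appended and recompiled by index:
size_t new_index = context.functions.size();        // index the new function will occupy
context.functions.emplace_back(/* vram, rom, words, name, section index, ... */);
N64Recomp::recompile_function(context, new_index, output_file, static_funcs_by_section, false);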
- size_t cur_num_statics = static_funcs_by_section[func.section_index].size(); + size_t cur_num_statics = static_funcs_by_section[new_func.section_index].size(); if (cur_num_statics != prev_num_statics) { for (size_t new_static_index = prev_num_statics; new_static_index < cur_num_statics; new_static_index++) { - uint32_t new_static_vram = static_funcs_by_section[func.section_index][new_static_index]; + uint32_t new_static_vram = static_funcs_by_section[new_func.section_index][new_static_index]; if (!statics_set.contains(new_static_vram)) { statics_set.emplace(new_static_vram); @@ -838,7 +841,7 @@ int main(int argc, char** argv) { } if (result == false) { - fmt::print(stderr, "Error recompiling {}\n", func.name); + fmt::print(stderr, "Error recompiling {}\n", new_func.name); std::exit(EXIT_FAILURE); } } diff --git a/src/mod_symbols.cpp b/src/mod_symbols.cpp index 24675fe..fcfdead 100644 --- a/src/mod_symbols.cpp +++ b/src/mod_symbols.cpp @@ -1,6 +1,6 @@ #include -#include "n64recomp.h" +#include "recompiler/context.h" struct FileHeader { char magic[8]; // N64RSYMS diff --git a/src/operations.cpp b/src/operations.cpp index d73b278..70201d3 100644 --- a/src/operations.cpp +++ b/src/operations.cpp @@ -1,4 +1,4 @@ -#include "operations.h" +#include "recompiler/operations.h" namespace N64Recomp { const std::unordered_map unary_ops { @@ -12,8 +12,8 @@ namespace N64Recomp { // Float operations { InstrId::cpu_mov_s, { UnaryOpType::None, Operand::Fd, Operand::Fs, true } }, { InstrId::cpu_mov_d, { UnaryOpType::None, Operand::FdDouble, Operand::FsDouble, true } }, - { InstrId::cpu_neg_s, { UnaryOpType::Negate, Operand::Fd, Operand::Fs, true, true } }, - { InstrId::cpu_neg_d, { UnaryOpType::Negate, Operand::FdDouble, Operand::FsDouble, true, true } }, + { InstrId::cpu_neg_s, { UnaryOpType::NegateFloat, Operand::Fd, Operand::Fs, true, true } }, + { InstrId::cpu_neg_d, { UnaryOpType::NegateDouble, Operand::FdDouble, Operand::FsDouble, true, true } }, { InstrId::cpu_abs_s, { UnaryOpType::AbsFloat, Operand::Fd, Operand::Fs, true, true } }, { InstrId::cpu_abs_d, { UnaryOpType::AbsDouble, Operand::FdDouble, Operand::FsDouble, true, true } }, { InstrId::cpu_sqrt_s, { UnaryOpType::SqrtFloat, Operand::Fd, Operand::Fs, true, true } }, @@ -65,24 +65,22 @@ namespace N64Recomp { { InstrId::cpu_ori, { BinaryOpType::Or64, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rs, Operand::ImmU16 }}} }, { InstrId::cpu_xori, { BinaryOpType::Xor64, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rs, Operand::ImmU16 }}} }, // Shifts - /* BUG Should mask after (change op to Sll32 and input op to ToU32) */ - { InstrId::cpu_sllv, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, + { InstrId::cpu_sllv, { BinaryOpType::Sll32, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsllv, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_srlv, { BinaryOpType::Srl32, Operand::Rd, {{ UnaryOpType::ToU32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsrlv, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, - /* BUG Should mask after (change op to Sra32 and input op to ToS64) */ - { InstrId::cpu_srav, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, + // 
Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. + { InstrId::cpu_srav, { BinaryOpType::Sra32, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsrav, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, // Shifts (immediate) - /* BUG Should mask after (change op to Sll32 and input op to ToU32) */ - { InstrId::cpu_sll, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, + { InstrId::cpu_sll, { BinaryOpType::Sll32, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsll, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsll32, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, { InstrId::cpu_srl, { BinaryOpType::Srl32, Operand::Rd, {{ UnaryOpType::ToU32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsrl, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsrl32, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, - /* BUG should cast after (change op to Sra32 and input op to ToS64) */ - { InstrId::cpu_sra, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, + // Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. 
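// Worked example of the modeled behavior (illustrative): with rt = 0x0000000100000000
// and `sra rd, rt, 1`, the unmasked 64-bit value shifts to 0x0000000080000000 and the
// low 32 bits are then sign-extended, so rd becomes 0xFFFFFFFF80000000 rather than 0 -
// hence the ToS64 input feeding a Sra32 op instead of a 32-bit mask before the shift.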
+ { InstrId::cpu_sra, { BinaryOpType::Sra32, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsra, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsra32, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, // Comparisons @@ -101,47 +99,47 @@ namespace N64Recomp { { InstrId::cpu_div_s, { BinaryOpType::DivFloat, Operand::Fd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true, true } }, { InstrId::cpu_div_d, { BinaryOpType::DivDouble, Operand::FdDouble, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true, true } }, // Float comparisons TODO remaining operations and investigate ordered/unordered and default values - { InstrId::cpu_c_lt_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_nge_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_olt_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ult_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_lt_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_nge_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_olt_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ult_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_lt_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_nge_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_olt_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ult_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_lt_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_nge_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_olt_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ult_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_le_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true 
} }, - { InstrId::cpu_c_ngt_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ole_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ule_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_le_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ngt_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ole_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ule_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_le_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ngt_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ole_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ule_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_le_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ngt_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ole_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ule_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_eq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ueq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ngl_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_seq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_eq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ueq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ngl_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_eq_s, { BinaryOpType::EqualFloat, 
Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ueq_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ngl_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_seq_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_eq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ueq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ngl_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, /* TODO rename to c_seq_d when fixed in rabbitizer */ - { InstrId::cpu_c_deq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_deq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, // Loads - { InstrId::cpu_ld, { BinaryOpType::LD, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lw, { BinaryOpType::LW, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwu, { BinaryOpType::LWU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lh, { BinaryOpType::LH, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lhu, { BinaryOpType::LHU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lb, { BinaryOpType::LB, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lbu, { BinaryOpType::LBU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldl, { BinaryOpType::LDL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldr, { BinaryOpType::LDR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwl, { BinaryOpType::LWL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwr, { BinaryOpType::LWR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwc1, { BinaryOpType::LW, Operand::FtU32L, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldc1, { BinaryOpType::LD, Operand::FtU64, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}, true } }, + { InstrId::cpu_ld, { BinaryOpType::LD, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lw, { BinaryOpType::LW, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { 
InstrId::cpu_lwu, { BinaryOpType::LWU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lh, { BinaryOpType::LH, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lhu, { BinaryOpType::LHU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lb, { BinaryOpType::LB, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lbu, { BinaryOpType::LBU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldl, { BinaryOpType::LDL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldr, { BinaryOpType::LDR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwl, { BinaryOpType::LWL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwr, { BinaryOpType::LWR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwc1, { BinaryOpType::LW, Operand::FtU32L, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldc1, { BinaryOpType::LD, Operand::FtU64, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}, true } }, }; const std::unordered_map conditional_branch_ops { @@ -159,10 +157,12 @@ namespace N64Recomp { { InstrId::cpu_bltzl, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, false, true }}, { InstrId::cpu_bgezal, { BinaryOpType::GreaterEq, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, false }}, { InstrId::cpu_bgezall, { BinaryOpType::GreaterEq, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, true }}, - { InstrId::cpu_bc1f, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, - { InstrId::cpu_bc1fl, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, - { InstrId::cpu_bc1t, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, - { InstrId::cpu_bc1tl, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, + { InstrId::cpu_bltzal, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, false }}, + { InstrId::cpu_bltzall, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, true }}, + { InstrId::cpu_bc1f, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, + { InstrId::cpu_bc1fl, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, + { InstrId::cpu_bc1t, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, + { InstrId::cpu_bc1tl, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, }; const std::unordered_map store_ops { diff --git 
a/src/recompilation.cpp b/src/recompilation.cpp index 1faef25..cf12c49 100644 --- a/src/recompilation.cpp +++ b/src/recompilation.cpp @@ -8,10 +8,10 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "analysis.h" -#include "operations.h" -#include "generator.h" +#include "recompiler/operations.h" +#include "recompiler/generator.h" enum class JalResolutionResult { NoMatch, @@ -28,7 +28,6 @@ JalResolutionResult resolve_jal(const N64Recomp::Context& context, size_t cur_se uint32_t section_vram_start = cur_section.ram_addr; uint32_t section_vram_end = cur_section.ram_addr + cur_section.size; bool in_current_section = target_func_vram >= section_vram_start && target_func_vram < section_vram_end; - bool needs_static = false; bool exact_match_found = false; // Use a thread local to prevent reallocation across runs and to allow multi-threading in the future. @@ -109,8 +108,8 @@ std::string_view ctx_gpr_prefix(int reg) { return ""; } -// Major TODO, this function grew very organically and needs to be cleaned up. Ideally, it'll get split up into some sort of lookup table grouped by similar instruction types. -bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Function& func, const N64Recomp::FunctionStats& stats, const std::unordered_set& skipped_insns, size_t instr_index, const std::vector& instructions, std::ofstream& output_file, bool indent, bool emit_link_branch, int link_branch_index, size_t reloc_index, bool& needs_link_branch, bool& is_branch_likely, bool tag_reference_relocs, std::span> static_funcs_out) { +template +bool process_instruction(GeneratorType& generator, const N64Recomp::Context& context, const N64Recomp::Function& func, const N64Recomp::FunctionStats& stats, const std::unordered_set& jtbl_lw_instructions, size_t instr_index, const std::vector& instructions, std::ostream& output_file, bool indent, bool emit_link_branch, int link_branch_index, size_t reloc_index, bool& needs_link_branch, bool& is_branch_likely, bool tag_reference_relocs, std::span> static_funcs_out) { using namespace N64Recomp; const auto& section = context.sections[func.section_index]; @@ -118,6 +117,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun needs_link_branch = false; is_branch_likely = false; uint32_t instr_vram = instr.getVram(); + InstrId instr_id = instr.getUniqueId(); auto print_indent = [&]() { fmt::print(output_file, " "); @@ -132,16 +132,20 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } // Output a comment with the original instruction - if (instr.isBranch() || instr.getUniqueId() == InstrId::cpu_j) { - fmt::print(output_file, " // 0x{:08X}: {}\n", instr_vram, instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric()))); - } else if (instr.getUniqueId() == InstrId::cpu_jal) { - fmt::print(output_file, " // 0x{:08X}: {}\n", instr_vram, instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric()))); + print_indent(); + if (instr.isBranch() || instr_id == InstrId::cpu_j) { + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric())))); + } else if (instr_id == InstrId::cpu_jal) { + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric())))); } else { - fmt::print(output_file, " // 0x{:08X}: {}\n", 
instr_vram, instr.disassemble(0)); + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0))); } - if (skipped_insns.contains(instr_vram)) { - return true; + // Replace loads of jump table entries with addiu. This leaves the jump table entry's address in the output register + // instead of the entry's value, which can then be used to determine the offset from the start of the jump table. + if (jtbl_lw_instructions.contains(instr_vram)) { + assert(instr_id == InstrId::cpu_lw); + instr_id = InstrId::cpu_addiu; } N64Recomp::RelocType reloc_type = N64Recomp::RelocType::R_MIPS_NONE; @@ -178,9 +182,9 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // Don't try to relocate special section symbols. if (context.is_regular_reference_section(reloc.target_section) || reloc_section == N64Recomp::SectionAbsolute) { bool ref_section_relocatable = context.is_reference_section_relocatable(reloc.target_section); - uint32_t ref_section_vram = context.get_reference_section_vram(reloc.target_section); // Resolve HI16 and LO16 reference symbol relocs to non-relocatable sections by patching the instruction immediate. if (!ref_section_relocatable && (reloc_type == N64Recomp::RelocType::R_MIPS_HI16 || reloc_type == N64Recomp::RelocType::R_MIPS_LO16)) { + uint32_t ref_section_vram = context.get_reference_section_vram(reloc.target_section); uint32_t full_immediate = reloc.target_section_offset + ref_section_vram; if (reloc_type == N64Recomp::RelocType::R_MIPS_HI16) { @@ -206,13 +210,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } } - auto print_line = [&](fmt::format_string fmt_str, Ts ...args) { - print_indent(); - fmt::vprint(output_file, fmt_str, fmt::make_format_args(args...)); - fmt::print(output_file, ";\n"); - }; - - auto print_unconditional_branch = [&](fmt::format_string fmt_str, Ts ...args) { + auto process_delay_slot = [&](bool use_indent) { if (instr_index < instructions.size() - 1) { bool dummy_needs_link_branch; bool dummy_is_branch_likely; size_t next_reloc_index = reloc_index; uint32_t next_vram = instr_vram + 4; if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { next_reloc_index++; } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, false, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { + if (!process_instruction(generator, context, func, stats, jtbl_lw_instructions, instr_index + 1, instructions, output_file, use_indent, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { return false; } } - print_indent(); - fmt::vprint(output_file, fmt_str, fmt::make_format_args(args...)); - if (needs_link_branch) { - fmt::print(output_file, ";\n goto after_{};\n", link_branch_index); - } else { - fmt::print(output_file, ";\n"); - } return true; }; - auto print_func_call = [reloc_target_section_offset, reloc_section, reloc_reference_symbol, reloc_type, &context, &section, &func, &static_funcs_out, &needs_link_branch, &print_unconditional_branch] - (uint32_t target_func_vram, bool link_branch = true, bool indent = false) + auto print_link_branch = [&]() { + if (needs_link_branch) { + print_indent(); + generator.emit_goto(fmt::format("after_{}", link_branch_index)); + } + }; + + auto print_return_with_delay_slot = 
[&]() { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_return(); + print_link_branch(); + return true; + }; + + auto print_goto_with_delay_slot = [&](const std::string& target) { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_goto(target); + print_link_branch(); + return true; + }; + + auto print_func_call_by_register = [&](int reg) { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_function_call_by_register(reg); + print_link_branch(); + return true; + }; + + auto print_func_call_by_address = [&generator, reloc_target_section_offset, reloc_section, reloc_reference_symbol, reloc_type, &context, &func, &static_funcs_out, &needs_link_branch, &print_indent, &process_delay_slot, &print_link_branch] + (uint32_t target_func_vram, bool tail_call = false, bool indent = false) { + bool call_by_lookup = false; + bool call_by_name = false; // Event symbol, emit a call to the runtime to trigger this event. if (reloc_section == N64Recomp::SectionEvent) { - needs_link_branch = link_branch; + needs_link_branch = !tail_call; if (indent) { - if (!print_unconditional_branch(" recomp_trigger_event(rdram, ctx, base_event_index + {})", reloc_reference_symbol)) { - return false; - } - } else { - if (!print_unconditional_branch("recomp_trigger_event(rdram, ctx, base_event_index + {})", reloc_reference_symbol)) { - return false; - } + print_indent(); } + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_trigger_event((uint32_t)reloc_reference_symbol); + print_link_branch(); } // Normal symbol or reference symbol, else { std::string jal_target_name{}; + size_t matched_func_index = (size_t)-1; if (reloc_reference_symbol != (size_t)-1) { - const auto& ref_symbol = context.get_reference_symbol(reloc_section, reloc_reference_symbol); - if (reloc_type != N64Recomp::RelocType::R_MIPS_26) { fmt::print(stderr, "Unsupported reloc type {} on jal instruction in {}\n", (int)reloc_type, func.name); return false; } - if (ref_symbol.section_offset != reloc_target_section_offset) { - fmt::print(stderr, "Function {} uses a MIPS_R_26 addend, which is not supported yet\n", func.name); - return false; + if (!context.skip_validating_reference_symbols) { + const auto& ref_symbol = context.get_reference_symbol(reloc_section, reloc_reference_symbol); + if (ref_symbol.section_offset != reloc_target_section_offset) { + fmt::print(stderr, "Function {} uses a MIPS_R_26 addend, which is not supported yet\n", func.name); + return false; + } } - - jal_target_name = ref_symbol.name; } else { - size_t matched_func_index = 0; JalResolutionResult jal_result = resolve_jal(context, func.section_index, target_func_vram, matched_func_index); switch (jal_result) { @@ -284,65 +313,78 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // Create a static function add it to the static function list for this section. 
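// Hypothetical example of this path: a jal targeting 0x80123456 from section 3 with no
// matching symbol resolves to the synthesized name below, e.g. "static_3_80123456", and
// the target vram is pushed into static_funcs_out so the stub is recompiled afterwards
// (see the static-function handling in main.cpp earlier in this patch).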
jal_target_name = fmt::format("static_{}_{:08X}", func.section_index, target_func_vram); static_funcs_out[func.section_index].push_back(target_func_vram); + call_by_name = true; break; case JalResolutionResult::Ambiguous: fmt::print(stderr, "[Info] Ambiguous jal target 0x{:08X} in function {}, falling back to function lookup\n", target_func_vram, func.name); // Relocation isn't necessary for jumps inside a relocatable section, as this code path will never run if the target vram // is in the current function's section (see the branch for `in_current_section` above). // If a game ever needs to jump between multiple relocatable sections, relocation will be necessary here. - jal_target_name = fmt::format("LOOKUP_FUNC(0x{:08X})", target_func_vram); + call_by_lookup = true; break; case JalResolutionResult::Error: fmt::print(stderr, "Internal error when resolving jal to address 0x{:08X} in function {}. Please report this issue.\n", target_func_vram, func.name); return false; } } - needs_link_branch = link_branch; + needs_link_branch = !tail_call; if (indent) { - if (!print_unconditional_branch(" {}(rdram, ctx)", jal_target_name)) { - return false; - } - } else { - if (!print_unconditional_branch("{}(rdram, ctx)", jal_target_name)) { - return false; - } + print_indent(); } + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + if (reloc_reference_symbol != (size_t)-1) { + generator.emit_function_call_reference_symbol(context, reloc_section, reloc_reference_symbol, reloc_target_section_offset); + } + else if (call_by_lookup) { + generator.emit_function_call_lookup(target_func_vram); + } + else if (call_by_name) { + generator.emit_named_function_call(jal_target_name); + } + else { + generator.emit_function_call(context, matched_func_index); + } + print_link_branch(); } return true; }; auto print_branch = [&](uint32_t branch_target) { + // If the branch target is outside the current function, check if it can be treated as a tail call. if (branch_target < func.vram || branch_target >= func_vram_end) { + // If the branch target is the start of some known function, this can be handled as a tail call. // FIXME: how to deal with static functions? if (context.functions_by_vram.find(branch_target) != context.functions_by_vram.end()) { fmt::print("Tail call in {} to 0x{:08X}\n", func.name, branch_target); - if (!print_func_call(branch_target, false, true)) { + if (!print_func_call_by_address(branch_target, true, true)) { return false; } - print_line(" return"); - fmt::print(output_file, " }}\n"); + print_indent(); + generator.emit_return(); + // TODO check if this branch close should exist. 
+ // print_indent(); + // generator.emit_branch_close(); return true; } fmt::print(stderr, "[Warn] Function {} is branching outside of the function (to 0x{:08X})\n", func.name, branch_target); } - if (instr_index < instructions.size() - 1) { - bool dummy_needs_link_branch; - bool dummy_is_branch_likely; - size_t next_reloc_index = reloc_index; - uint32_t next_vram = instr_vram + 4; - if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { - next_reloc_index++; - } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, true, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { - return false; - } + if (!process_delay_slot(true)) { + return false; } - fmt::print(output_file, " goto L_{:08X};\n", branch_target); + print_indent(); + print_indent(); + generator.emit_goto(fmt::format("L_{:08X}", branch_target)); + // TODO check if this link branch ever exists. if (needs_link_branch) { - fmt::print(output_file, " goto after_{};\n", link_branch_index); + print_indent(); + print_indent(); + generator.emit_goto(fmt::format("after_{}", link_branch_index)); } return true; }; @@ -353,7 +395,6 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun int rd = (int)instr.GetO32_rd(); int rs = (int)instr.GetO32_rs(); - int base = rs; int rt = (int)instr.GetO32_rt(); int sa = (int)instr.Get_sa(); @@ -365,7 +406,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun bool handled = true; - switch (instr.getUniqueId()) { + switch (instr_id) { case InstrId::cpu_nop: fmt::print(output_file, "\n"); break; @@ -375,7 +416,8 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun Cop0Reg reg = instr.Get_cop0d(); switch (reg) { case Cop0Reg::COP0_Status: - print_line("{}{} = cop0_status_read(ctx)", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop0_status_read(rt); break; default: fmt::print(stderr, "Unhandled cop0 register in mfc0: {}\n", (int)reg); @@ -388,7 +430,8 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun Cop0Reg reg = instr.Get_cop0d(); switch (reg) { case Cop0Reg::COP0_Status: - print_line("cop0_status_write(ctx, {}{})", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop0_status_write(rt); break; default: fmt::print(stderr, "Unhandled cop0 register in mtc0: {}\n", (int)reg); @@ -408,38 +451,25 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // If so, create a temp to preserve the addend register's value if (find_result != stats.jump_tables.end()) { const N64Recomp::JumpTable& cur_jtbl = *find_result; - print_line("gpr jr_addend_{:08X} = {}{}", cur_jtbl.jr_vram, ctx_gpr_prefix(cur_jtbl.addend_reg), cur_jtbl.addend_reg); + print_indent(); + generator.emit_jtbl_addend_declaration(cur_jtbl, cur_jtbl.addend_reg); } } break; case InstrId::cpu_mult: - print_line("result = S64(S32({}{})) * S64(S32({}{})); lo = S32(result >> 0); hi = S32(result >> 32)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_dmult: - print_line("DMULT(S64({}{}), S64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_multu: - print_line("result = U64(U32({}{})) * U64(U32({}{})); lo = S32(result >> 0); hi = S32(result >> 32)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_dmultu: - 
print_line("DMULTU(U64({}{}), U64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_div: - // Cast to 64-bits before division to prevent artihmetic exception for s32(0x80000000) / -1 - print_line("lo = S32(S64(S32({}{})) / S64(S32({}{}))); hi = S32(S64(S32({}{})) % S64(S32({}{})))", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_ddiv: - print_line("DDIV(S64({}{}), S64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_divu: - print_line("lo = S32(U32({}{}) / U32({}{})); hi = S32(U32({}{}) % U32({}{}))", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_ddivu: - print_line("DDIVU(U64({}{}), U64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_muldiv(instr_id, rs, rt); break; // Branches case InstrId::cpu_jal: - if (!print_func_call(instr.getBranchVramGeneric())) { + if (!print_func_call_by_address(instr.getBranchVramGeneric())) { return false; } break; @@ -450,18 +480,19 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun return false; } needs_link_branch = true; - print_unconditional_branch("LOOKUP_FUNC({}{})(rdram, ctx)", ctx_gpr_prefix(rs), rs); + print_func_call_by_register(rs); break; case InstrId::cpu_j: case InstrId::cpu_b: { uint32_t branch_target = instr.getBranchVramGeneric(); if (branch_target == instr_vram) { - print_line("pause_self(rdram)"); + print_indent(); + generator.emit_pause_self(); } // Check if the branch is within this function else if (branch_target >= func.vram && branch_target < func_vram_end) { - print_unconditional_branch("goto L_{:08X}", branch_target); + print_goto_with_delay_slot(fmt::format("L_{:08X}", branch_target)); } // This may be a tail call in the middle of the control flow due to a previous check // For example: @@ -476,11 +507,12 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // ``` // FIXME: how to deal with static functions? 
+            print_func_call_by_register(rs);
            break;
        case InstrId::cpu_j:
        case InstrId::cpu_b: {
            uint32_t branch_target = instr.getBranchVramGeneric();
            if (branch_target == instr_vram) {
-                print_line("pause_self(rdram)");
+                print_indent();
+                generator.emit_pause_self();
            }
            // Check if the branch is within this function
            else if (branch_target >= func.vram && branch_target < func_vram_end) {
-                print_unconditional_branch("goto L_{:08X}", branch_target);
+                print_goto_with_delay_slot(fmt::format("L_{:08X}", branch_target));
            }
            // This may be a tail call in the middle of the control flow due to a previous check
            // For example:
@@ -476,11 +507,12 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
            // ```
            // FIXME: how to deal with static functions?
            else if (context.functions_by_vram.find(branch_target) != context.functions_by_vram.end()) {
-                fmt::print("Tail call in {} to 0x{:08X}\n", func.name, branch_target);
-                if (!print_func_call(branch_target, false)) {
+                fmt::print("[Info] Tail call in {} to 0x{:08X}\n", func.name, branch_target);
+                if (!print_func_call_by_address(branch_target, true)) {
                    return false;
                }
-                print_line("return");
+                print_indent();
+                generator.emit_return();
            }
            else {
                fmt::print(stderr, "Unhandled branch in {} at 0x{:08X} to 0x{:08X}\n", func.name, instr_vram, branch_target);
@@ -490,7 +522,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
            break;
        case InstrId::cpu_jr:
            if (rs == (int)rabbitizer::Registers::Cpu::GprO32::GPR_O32_ra) {
-                print_unconditional_branch("return");
+                print_return_with_delay_slot();
            } else {
                auto jtbl_find_result = std::find_if(stats.jump_tables.begin(), stats.jump_tables.end(),
                    [instr_vram](const N64Recomp::JumpTable& jtbl) {
@@ -499,58 +531,41 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
                if (jtbl_find_result != stats.jump_tables.end()) {
                    const N64Recomp::JumpTable& cur_jtbl = *jtbl_find_result;
-                    bool dummy_needs_link_branch, dummy_is_branch_likely;
-                    size_t next_reloc_index = reloc_index;
-                    uint32_t next_vram = instr_vram + 4;
-                    if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) {
-                        next_reloc_index++;
-                    }
-                    if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, false, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) {
+                    if (!process_delay_slot(false)) {
                        return false;
                    }

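+                    // Lower the jr through its jump table into a switch over the table
+                    // index (the addend register captured at the table's lw, shifted by 2).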
                    print_indent();
-                    fmt::print(output_file, "switch (jr_addend_{:08X} >> 2) {{\n", cur_jtbl.jr_vram);
+                    generator.emit_switch(context, cur_jtbl, rs);
                    for (size_t entry_index = 0; entry_index < cur_jtbl.entries.size(); entry_index++) {
                        print_indent();
-                        print_line("case {}: goto L_{:08X}; break", entry_index, cur_jtbl.entries[entry_index]);
+                        print_indent();
+                        generator.emit_case(entry_index, fmt::format("L_{:08X}", cur_jtbl.entries[entry_index]));
                    }
                    print_indent();
-                    print_line("default: switch_error(__func__, 0x{:08X}, 0x{:08X})", instr_vram, cur_jtbl.vram);
                    print_indent();
-                    fmt::print(output_file, "}}\n");
+                    generator.emit_switch_error(instr_vram, cur_jtbl.vram);
+                    print_indent();
+                    generator.emit_switch_close();
                    break;
                }

-                auto jump_find_result = std::find_if(stats.absolute_jumps.begin(), stats.absolute_jumps.end(),
-                    [instr_vram](const N64Recomp::AbsoluteJump& jump) {
-                        return jump.instruction_vram == instr_vram;
-                    });
-
-                if (jump_find_result != stats.absolute_jumps.end()) {
-                    print_unconditional_branch("LOOKUP_FUNC({})(rdram, ctx)", (uint64_t)(int32_t)jump_find_result->jump_target);
-                    // jr doesn't link so it acts like a tail call, meaning we should return directly after the jump returns
-                    print_line("return");
-                    break;
-                }
-
-                bool is_tail_call = instr_vram == func_vram_end - 2 * sizeof(func.words[0]);
-                if (is_tail_call) {
-                    fmt::print("Indirect tail call in {}\n", func.name);
-                    print_unconditional_branch("LOOKUP_FUNC({}{})(rdram, ctx)", ctx_gpr_prefix(rs), rs);
-                    print_line("return");
-                    break;
-                }
-
-                fmt::print(stderr, "No jump table found for jr at 0x{:08X} and not tail call\n", instr_vram);
+                fmt::print("[Info] Indirect tail call in {}\n", func.name);
+                print_func_call_by_register(rs);
+                print_indent();
+                generator.emit_return();
+                break;
            }
            break;
        case InstrId::cpu_syscall:
-            print_line("recomp_syscall_handler(rdram, ctx, 0x{:08X})", instr_vram);
+            print_indent();
+            generator.emit_syscall(instr_vram);
            // syscalls don't link, so treat it like a tail call
-            print_line("return");
+            print_indent();
+            generator.emit_return();
            break;
        case InstrId::cpu_break:
-            print_line("do_break({})", instr_vram);
+            print_indent();
+            generator.emit_do_break(instr_vram);
            break;

        // Cop1 rounding mode
@@ -559,21 +574,22 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
                fmt::print(stderr, "Invalid FP control register for ctc1: {}\n", cop1_cs);
                return false;
            }
-            print_line("rounding_mode = ({}{}) & 0x3", ctx_gpr_prefix(rt), rt);
+            print_indent();
+            generator.emit_cop1_cs_write(rt);
            break;
        case InstrId::cpu_cfc1:
            if (cop1_cs != 31) {
                fmt::print(stderr, "Invalid FP control register for cfc1: {}\n", cop1_cs);
                return false;
            }
-            print_line("{}{} = rounding_mode", ctx_gpr_prefix(rt), rt);
+            print_indent();
+            generator.emit_cop1_cs_read(rt);
            break;
        default:
            handled = false;
            break;
    }

-    CGenerator generator{};
    InstructionContext instruction_context{};
    instruction_context.rd = rd;
    instruction_context.rs = rs;
@@ -589,28 +605,28 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
    instruction_context.reloc_section_index = reloc_section;
    instruction_context.reloc_target_section_offset = reloc_target_section_offset;

-    auto do_check_fr = [](std::ostream& output_file, const CGenerator& generator, const InstructionContext& ctx, Operand operand) {
+    auto do_check_fr = [](const GeneratorType& generator, const InstructionContext& ctx, Operand operand) {
        switch (operand) {
            case Operand::Fd:
            case Operand::FdDouble:
            case Operand::FdU32L:
            case Operand::FdU32H:
            case Operand::FdU64:
-                generator.emit_check_fr(output_file, ctx.fd);
+                generator.emit_check_fr(ctx.fd);
                break;
            case Operand::Fs:
            case Operand::FsDouble:
            case Operand::FsU32L:
            case Operand::FsU32H:
            case Operand::FsU64:
-                generator.emit_check_fr(output_file, ctx.fs);
+                generator.emit_check_fr(ctx.fs);
                break;
            case Operand::Ft:
            case Operand::FtDouble:
            case Operand::FtU32L:
            case Operand::FtU32H:
            case Operand::FtU64:
-                generator.emit_check_fr(output_file, ctx.ft);
+                generator.emit_check_fr(ctx.ft);
                break;
            default:
                // No MIPS3 float check needed for non-float operands.
@@ -618,25 +634,25 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
        }
    };

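+    // Emits a NaN check on a float operand before it is consumed, with separate
+    // single- and double-precision forms.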
-    auto do_check_nan = [](std::ostream& output_file, const CGenerator& generator, const InstructionContext& ctx, Operand operand) {
+    auto do_check_nan = [](const GeneratorType& generator, const InstructionContext& ctx, Operand operand) {
        switch (operand) {
            case Operand::Fd:
-                generator.emit_check_nan(output_file, ctx.fd, false);
+                generator.emit_check_nan(ctx.fd, false);
                break;
            case Operand::Fs:
-                generator.emit_check_nan(output_file, ctx.fs, false);
+                generator.emit_check_nan(ctx.fs, false);
                break;
            case Operand::Ft:
-                generator.emit_check_nan(output_file, ctx.ft, false);
+                generator.emit_check_nan(ctx.ft, false);
                break;
            case Operand::FdDouble:
-                generator.emit_check_nan(output_file, ctx.fd, true);
+                generator.emit_check_nan(ctx.fd, true);
                break;
            case Operand::FsDouble:
-                generator.emit_check_nan(output_file, ctx.fs, true);
+                generator.emit_check_nan(ctx.fs, true);
                break;
            case Operand::FtDouble:
-                generator.emit_check_nan(output_file, ctx.ft, true);
+                generator.emit_check_nan(ctx.ft, true);
                break;
            default:
                // No NaN checks needed for non-float operands.
@@ -644,54 +660,58 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
        }
    };

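+    // Table-driven lowering: most ALU and FPU instructions are described by entries in
+    // the binary_ops/unary_ops maps, so they share these generic emission paths instead
+    // of bespoke switch cases.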
-    auto find_binary_it = binary_ops.find(instr.getUniqueId());
+    auto find_binary_it = binary_ops.find(instr_id);
    if (find_binary_it != binary_ops.end()) {
        print_indent();
        const BinaryOp& op = find_binary_it->second;

        if (op.check_fr) {
-            do_check_fr(output_file, generator, instruction_context, op.output);
-            do_check_fr(output_file, generator, instruction_context, op.operands.operands[0]);
-            do_check_fr(output_file, generator, instruction_context, op.operands.operands[1]);
+            do_check_fr(generator, instruction_context, op.output);
+            do_check_fr(generator, instruction_context, op.operands.operands[0]);
+            do_check_fr(generator, instruction_context, op.operands.operands[1]);
        }

        if (op.check_nan) {
-            do_check_nan(output_file, generator, instruction_context, op.operands.operands[0]);
-            do_check_nan(output_file, generator, instruction_context, op.operands.operands[1]);
-            fmt::print(output_file, "\n    ");
+            do_check_nan(generator, instruction_context, op.operands.operands[0]);
+            do_check_nan(generator, instruction_context, op.operands.operands[1]);
+            fmt::print(output_file, "\n");
+            print_indent();
        }

-        generator.process_binary_op(output_file, op, instruction_context);
+        generator.process_binary_op(op, instruction_context);
        handled = true;
    }

-    auto find_unary_it = unary_ops.find(instr.getUniqueId());
+    auto find_unary_it = unary_ops.find(instr_id);
    if (find_unary_it != unary_ops.end()) {
        print_indent();
        const UnaryOp& op = find_unary_it->second;

        if (op.check_fr) {
-            do_check_fr(output_file, generator, instruction_context, op.output);
-            do_check_fr(output_file, generator, instruction_context, op.input);
+            do_check_fr(generator, instruction_context, op.output);
+            do_check_fr(generator, instruction_context, op.input);
        }

        if (op.check_nan) {
-            do_check_nan(output_file, generator, instruction_context, op.input);
-            fmt::print(output_file, "\n    ");
+            do_check_nan(generator, instruction_context, op.input);
+            fmt::print(output_file, "\n");
+            print_indent();
        }

-        generator.process_unary_op(output_file, op, instruction_context);
+        generator.process_unary_op(op, instruction_context);
        handled = true;
    }

-    auto find_conditional_branch_it = conditional_branch_ops.find(instr.getUniqueId());
+    auto find_conditional_branch_it = conditional_branch_ops.find(instr_id);
    if (find_conditional_branch_it != conditional_branch_ops.end()) {
        print_indent();
-        generator.emit_branch_condition(output_file, find_conditional_branch_it->second, instruction_context);
+        // TODO combining the branch condition and branch target into one generator call would allow better optimization in the runtime's JIT generator.
+        // This would require splitting into a conditional jump method and conditional function call method.
+        generator.emit_branch_condition(find_conditional_branch_it->second, instruction_context);
        print_indent();

        if (find_conditional_branch_it->second.link) {
-            if (!print_func_call(instr.getBranchVramGeneric())) {
+            if (!print_func_call_by_address(instr.getBranchVramGeneric())) {
                return false;
            }
        }
@@ -701,22 +721,23 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
            }
        }

-        generator.emit_branch_close(output_file);
+        print_indent();
+        generator.emit_branch_close();

        is_branch_likely = find_conditional_branch_it->second.likely;
        handled = true;
    }

-    auto find_store_it = store_ops.find(instr.getUniqueId());
+    auto find_store_it = store_ops.find(instr_id);
    if (find_store_it != store_ops.end()) {
        print_indent();
        const StoreOp& op = find_store_it->second;

        if (op.type == StoreOpType::SDC1) {
-            do_check_fr(output_file, generator, instruction_context, op.value_input);
+            do_check_fr(generator, instruction_context, op.value_input);
        }

-        generator.process_store_op(output_file, op, instruction_context);
+        generator.process_store_op(op, instruction_context);
        handled = true;
    }

@@ -727,23 +748,20 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun

    // TODO is this used?
    if (emit_link_branch) {
-        fmt::print(output_file, "    after_{}:\n", link_branch_index);
+        print_indent();
+        generator.emit_label(fmt::format("after_{}", link_branch_index));
    }

    return true;
}

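+// The recompilation body is templated over the generator type so the same
+// instruction-processing logic can drive both the C source generator and custom
+// generators (e.g. the runtime's JIT generator).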
-bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64Recomp::Function& func, std::ofstream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+template <typename GeneratorType>
+bool recompile_function_impl(GeneratorType& generator, const N64Recomp::Context& context, size_t func_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    const N64Recomp::Function& func = context.functions[func_index];
    //fmt::print("Recompiling {}\n", func.name);
    std::vector<rabbitizer::InstructionCpu> instructions;

-    fmt::print(output_file,
-        "RECOMP_FUNC void {}(uint8_t* rdram, recomp_context* ctx) {{\n"
-        // these variables shouldn't need to be preserved across function boundaries, so make them local for more efficient output
-        "    uint64_t hi = 0, lo = 0, result = 0;\n"
-        "    unsigned int rounding_mode = DEFAULT_ROUNDING_MODE;\n"
-        "    int c1cs = 0;\n", // cop1 conditional signal
-        func.name);
+    generator.emit_function_start(func.name, func_index);

    if (context.trace_mode) {
        fmt::print(output_file,
@@ -784,11 +802,11 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        return false;
    }

-    std::unordered_set<uint32_t> skipped_insns{};
+    std::unordered_set<uint32_t> jtbl_lw_instructions{};

    // Add jump table labels into function
    for (const auto& jtbl : stats.jump_tables) {
-        skipped_insns.insert(jtbl.lw_vram);
+        jtbl_lw_instructions.insert(jtbl.lw_vram);
        for (uint32_t jtbl_entry : jtbl.entries) {
            branch_labels.insert(jtbl_entry);
        }
@@ -808,11 +826,11 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        bool is_branch_likely = false;
        // If we're in the delay slot of a likely instruction, emit a goto to skip the instruction before any labels
        if (in_likely_delay_slot) {
-            fmt::print(output_file, "    goto skip_{};\n", num_likely_branches);
+            generator.emit_goto(fmt::format("skip_{}", num_likely_branches));
        }
        // If there are any other branch labels to insert and we're at the next one, insert it
        if (cur_label != branch_labels.end() && vram >= *cur_label) {
-            fmt::print(output_file, "L_{:08X}:\n", *cur_label);
+            generator.emit_label(fmt::format("L_{:08X}", *cur_label));
            ++cur_label;
        }
@@ -822,7 +840,7 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        }

        // Process the current instruction and check for errors
-        if (process_instruction(context, func, stats, skipped_insns, instr_index, instructions, output_file, false, needs_link_branch, num_link_branches, reloc_index, needs_link_branch, is_branch_likely, tag_reference_relocs, static_funcs_out) == false) {
+        if (process_instruction(generator, context, func, stats, jtbl_lw_instructions, instr_index, instructions, output_file, false, needs_link_branch, num_link_branches, reloc_index, needs_link_branch, is_branch_likely, tag_reference_relocs, static_funcs_out) == false) {
            fmt::print(stderr, "Error in recompiling {}, clearing output file\n", func.name);
            output_file.clear();
            return false;
@@ -833,7 +851,8 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        }
        // Now that the instruction has been processed, emit a skip label for the likely branch if needed
        if (in_likely_delay_slot) {
-            fmt::print(output_file, "    skip_{}:\n", num_likely_branches);
+            fmt::print(output_file, "    ");
+            generator.emit_label(fmt::format("skip_{}", num_likely_branches));
            num_likely_branches++;
        }
        // Mark the next instruction as being in a likely delay slot if the
@@ -844,7 +863,17 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
    }

    // Terminate the function
-    fmt::print(output_file, ";}}\n");
+    generator.emit_function_end();

    return true;
}
+
+// Wrap the templated function with CGenerator as the template parameter.
+bool N64Recomp::recompile_function(const N64Recomp::Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    CGenerator generator{output_file};
+    return recompile_function_impl(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs);
+}
+
+bool N64Recomp::recompile_function_custom(Generator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    return recompile_function_impl(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs);
+}
diff --git a/src/symbol_lists.cpp b/src/symbol_lists.cpp
index 4b4eff9..cbe5ff5 100644
--- a/src/symbol_lists.cpp
+++ b/src/symbol_lists.cpp
@@ -1,4 +1,4 @@
-#include "n64recomp.h"
+#include "recompiler/context.h"

 const std::unordered_set<std::string> N64Recomp::reimplemented_funcs {
     // OS initialize functions
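
A minimal sketch of how the two entry points above might be driven, for reference only.
MyGenerator is a hypothetical stand-in for any implementation of the generator interface,
and the surrounding setup (a populated N64Recomp::Context) is assumed to exist in the
caller; none of this is part of the patch itself:

    #include <fstream>
    #include "recompiler/context.h"

    // Recompiles every function in an already-populated context.
    void recompile_all(const N64Recomp::Context& context, MyGenerator& generator) {
        std::ofstream out{"funcs.c"};
        // One static-function list per section, indexed by func.section_index above.
        std::vector<std::vector<uint32_t>> static_funcs(context.sections.size());
        for (size_t func_index = 0; func_index < context.functions.size(); func_index++) {
            // C source output (the pre-existing path, now index-based):
            N64Recomp::recompile_function(context, func_index, out, static_funcs, true);
            // Or drive the same logic with a custom generator (new in this patch):
            N64Recomp::recompile_function_custom(generator, context, func_index, out, static_funcs, true);
        }
    }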