From 66062a06e9ec26f96b47fa4881cd3b55bca43893 Mon Sep 17 00:00:00 2001
From: Wiseguy <68165316+Mr-Wiseguy@users.noreply.github.com>
Date: Tue, 31 Dec 2024 16:11:40 -0500
Subject: [PATCH] Implement live recompiler (#114)

This commit implements the "live recompiler", which is another backend for the recompiler that generates platform-specific assembly at runtime. This is still static recompilation as opposed to dynamic recompilation, as it requires information about the binary to recompile and leverages the same static analysis that the C recompiler uses. However, similarly to dynamic recompilation, it's aimed at recompiling binaries at runtime, mainly for modding purposes.

The live recompiler leverages a library called sljit to generate platform-specific code. This library provides an API that's implemented on several platforms, including the main targets of this component: x86_64 and ARM64. Performance is expected to be slower than the C recompiler, but should still be plenty fast enough for running large amounts of recompiled code without an issue. Considering these ROMs can often be run through an interpreter and still hit their full speed, performance should not be a concern for running native code even if it's less optimal than the C recompiler's codegen.

As mentioned earlier, the main use of the live recompiler will be for loading mods in the N64Recomp runtime. This makes it so that modders don't need to ship platform-specific binaries for their mods, and allows fixing bugs with recompilation down the line without requiring modders to update their binaries.

This PR also includes a utility for testing the live recompiler. It accepts binaries in a custom format which contain the instructions, input data, and target data. Documentation for the test format as well as most of the tests that were used to validate the live recompiler can be found here. The few remaining tests were hacked-together binaries that I put together very hastily, so they need to be cleaned up and will probably be uploaded at a later date.

The only test in that suite that doesn't currently succeed is the div test, due to unknown behavior when the two operands aren't properly sign extended to 64 bits. This has no bearing on practical usage, since the inputs will always be sign extended as expected.
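As a point of reference for the sljit usage throughout live_generator.cpp below, here is a minimal, self-contained sketch of the runtime code generation flow the library provides: create a compiler, emit a prologue and operations, generate host machine code, then call the result through a function pointer. The add2 function is purely illustrative and is not part of this patch; it only mirrors the sljit calls the live generator relies on (sljit_create_compiler, sljit_emit_enter, sljit_generate_code, sljit_free_code).

    // Illustrative only: builds a native "a + b" function at runtime with sljit.
    #include <cstdio>
    #include "sljitLir.h"

    typedef sljit_sw (SLJIT_FUNC *add2_t)(sljit_sw a, sljit_sw b);

    int main() {
        sljit_compiler* compiler = sljit_create_compiler(nullptr);

        // Prologue: two word-sized arguments, placed by sljit into SLJIT_S0 and SLJIT_S1.
        sljit_emit_enter(compiler, 0, SLJIT_ARGS2(W, W, W), 1, 2, 0);
        // R0 = S0 + S1
        sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, SLJIT_S0, 0, SLJIT_S1, 0);
        // Return R0.
        sljit_emit_return(compiler, SLJIT_MOV, SLJIT_R0, 0);

        // Emit machine code for the host (x86_64, ARM64, ...) and release the compiler.
        void* code = sljit_generate_code(compiler, 0, nullptr);
        sljit_free_compiler(compiler);

        // Call the freshly generated code.
        add2_t add2 = reinterpret_cast<add2_t>(code);
        std::printf("%ld\n", (long)add2(2, 3)); // prints 5

        sljit_free_code(code, nullptr);
        return 0;
    }

The live generator follows the same shape, but emits a prologue matching the recompiled function signature (rdram and ctx pointers), translates each MIPS operation into sljit ops, and resolves calls and jump tables after code generation.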
--- .gitignore | 10 +- .gitmodules | 3 + CMakeLists.txt | 29 + LiveRecomp/live_generator.cpp | 1865 +++++++++++++++++ LiveRecomp/live_recompiler_test.cpp | 364 ++++ OfflineModRecomp/main.cpp | 5 +- RecompModTool/main.cpp | 2 +- include/generator.h | 56 - include/recomp.h | 397 ++++ include/{n64recomp.h => recompiler/context.h} | 29 +- include/recompiler/generator.h | 109 + include/recompiler/live_recompiler.h | 141 ++ include/{ => recompiler}/operations.h | 21 +- lib/sljit | 1 + src/analysis.cpp | 17 +- src/analysis.h | 16 +- src/cgenerator.cpp | 224 +- src/config.cpp | 4 +- src/elf.cpp | 2 +- src/main.cpp | 31 +- src/mod_symbols.cpp | 2 +- src/operations.cpp | 104 +- src/recompilation.cpp | 403 ++-- src/symbol_lists.cpp | 2 +- 24 files changed, 3452 insertions(+), 385 deletions(-) create mode 100644 LiveRecomp/live_generator.cpp create mode 100644 LiveRecomp/live_recompiler_test.cpp delete mode 100644 include/generator.h create mode 100644 include/recomp.h rename include/{n64recomp.h => recompiler/context.h} (93%) create mode 100644 include/recompiler/generator.h create mode 100644 include/recompiler/live_recompiler.h rename include/{ => recompiler}/operations.h (92%) create mode 160000 lib/sljit diff --git a/.gitignore b/.gitignore index 13749d1..014e033 100644 --- a/.gitignore +++ b/.gitignore @@ -6,8 +6,8 @@ *.elf *.z64 -# Output C files -test/funcs +# Local working data +tests # Linux build output build/ @@ -42,12 +42,6 @@ bld/ # Visual Studio 2015/2017 cache/options directory .vs/ -# Libraries (binaries that aren't in the repo) -test/Lib - -# RT64 (since it's not public yet) -test/RT64 - # Runtime files imgui.ini rt64.log diff --git a/.gitmodules b/.gitmodules index 2d7b930..1369f13 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "lib/tomlplusplus"] path = lib/tomlplusplus url = https://github.com/marzer/tomlplusplus +[submodule "lib/sljit"] + path = lib/sljit + url = https://github.com/zherczeg/sljit diff --git a/CMakeLists.txt b/CMakeLists.txt index 2733666..7fc7581 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,3 +164,32 @@ target_sources(OfflineModRecomp PRIVATE ) target_link_libraries(OfflineModRecomp fmt rabbitizer tomlplusplus::tomlplusplus N64Recomp) + +# Live recompiler +project(LiveRecomp) +add_library(LiveRecomp) + +target_sources(LiveRecomp PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/LiveRecomp/live_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src/sljitLir.c +) + +target_include_directories(LiveRecomp PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src +) + +target_link_libraries(LiveRecomp N64Recomp) + +# Live recompiler test +project(LiveRecompTest) +add_executable(LiveRecompTest) + +target_sources(LiveRecompTest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/LiveRecomp/live_recompiler_test.cpp +) + +target_include_directories(LiveRecompTest PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src +) + +target_link_libraries(LiveRecompTest LiveRecomp) diff --git a/LiveRecomp/live_generator.cpp b/LiveRecomp/live_generator.cpp new file mode 100644 index 0000000..48c5dc6 --- /dev/null +++ b/LiveRecomp/live_generator.cpp @@ -0,0 +1,1865 @@ +#include +#include +#include +#include + +#include "fmt/format.h" +#include "fmt/ostream.h" + +#include "recompiler/live_recompiler.h" +#include "recomp.h" + +#include "sljitLir.h" + +static_assert(sizeof(void*) >= sizeof(sljit_uw), "`void*` must be able to hold a `sljit_uw` value for rewritable jumps!"); + +constexpr uint64_t rdram_offset = 0xFFFFFFFF80000000ULL; + +void 
N64Recomp::live_recompiler_init() { + RabbitizerConfig_Cfg.pseudos.pseudoMove = false; + RabbitizerConfig_Cfg.pseudos.pseudoBeqz = false; + RabbitizerConfig_Cfg.pseudos.pseudoBnez = false; + RabbitizerConfig_Cfg.pseudos.pseudoNot = false; + RabbitizerConfig_Cfg.pseudos.pseudoBal = false; +} + +namespace Registers { + constexpr int rdram = SLJIT_S0; // stores (rdram - rdram_offset) + constexpr int ctx = SLJIT_S1; // stores ctx + constexpr int c1cs = SLJIT_S2; // stores ctx + constexpr int hi = SLJIT_S3; // stores ctx + constexpr int lo = SLJIT_S4; // stores ctx + constexpr int arithmetic_temp1 = SLJIT_R0; + constexpr int arithmetic_temp2 = SLJIT_R1; + constexpr int arithmetic_temp3 = SLJIT_R2; + constexpr int arithmetic_temp4 = SLJIT_R3; +} + +struct InnerCall { + size_t target_func_index; + sljit_jump* jump; +}; + +struct ReferenceSymbolCall { + N64Recomp::SymbolReference reference; + sljit_jump* jump; +}; + +struct SwitchErrorJump { + uint32_t instr_vram; + uint32_t jtbl_vram; + sljit_jump* jump; +}; + +struct N64Recomp::LiveGeneratorContext { + std::string function_name; + std::unordered_map labels; + std::unordered_map> pending_jumps; + std::vector func_labels; + std::vector inner_calls; + std::vector> switch_jump_labels; + // See LiveGeneratorOutput::jump_tables for info. Contains sljit labels so they can be linked after recompilation. + std::vector, std::unique_ptr>> unlinked_jump_tables; + // Jump tables for the current function being recompiled. + std::vector> pending_jump_tables; + // See LiveGeneratorOutput::reference_symbol_jumps for info. + std::vector> reference_symbol_jumps; + // See LiveGeneratorOutput::import_jumps_by_index for info. + std::unordered_multimap import_jumps_by_index; + std::vector switch_error_jumps; + sljit_jump* cur_branch_jump; +}; + +N64Recomp::LiveGenerator::LiveGenerator(size_t num_funcs, const LiveGeneratorInputs& inputs) : inputs(inputs) { + compiler = sljit_create_compiler(nullptr); + context = std::make_unique(); + context->func_labels.resize(num_funcs); + errored = false; +} + +N64Recomp::LiveGenerator::~LiveGenerator() { + if (compiler != nullptr) { + sljit_free_compiler(compiler); + compiler = nullptr; + } +} + +N64Recomp::LiveGeneratorOutput N64Recomp::LiveGenerator::finish() { + LiveGeneratorOutput ret{}; + if (errored) { + ret.good = false; + return ret; + } + + ret.good = true; + + // Populate all the pending inner function calls. + for (const InnerCall& call : context->inner_calls) { + sljit_label* target_func_label = context->func_labels[call.target_func_index]; + + // Generation isn't valid if the target function wasn't recompiled. + if (target_func_label == nullptr) { + return { }; + } + + sljit_set_label(call.jump, target_func_label); + } + + // Generate the switch error jump targets and assign the jump labels. + if (!context->switch_error_jumps.empty()) { + // Allocate the function name and place it in the literals. + char* func_name = new char[context->function_name.size() + 1]; + memcpy(func_name, context->function_name.c_str(), context->function_name.size()); + func_name[context->function_name.size()] = '\x00'; + ret.string_literals.emplace_back(func_name); + + std::vector switch_error_return_jumps{}; + switch_error_return_jumps.resize(context->switch_error_jumps.size()); + + // Generate and assign the labels for the switch error jumps. + for (size_t i = 0; i < context->switch_error_jumps.size(); i++) { + const auto& cur_error_jump = context->switch_error_jumps[i]; + + // Generate a label and assign it to the jump. 
+ sljit_set_label(cur_error_jump.jump, sljit_emit_label(compiler)); + + // Load the arguments (function name, vram, jump table address) + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sljit_sw(func_name)); + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, sljit_sw(cur_error_jump.instr_vram)); + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, sljit_sw(cur_error_jump.jtbl_vram)); + + // Call switch_error. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, 32, 32), SLJIT_IMM, sljit_sw(inputs.switch_error)); + + // Jump to the return statement. + switch_error_return_jumps[i] = sljit_emit_jump(compiler, SLJIT_JUMP); + } + + // Generate the return statement. + sljit_label* return_label = sljit_emit_label(compiler); + sljit_emit_return_void(compiler); + + // Assign the label for all the return jumps. + for (sljit_jump* cur_jump : switch_error_return_jumps) { + sljit_set_label(cur_jump, return_label); + } + } + context->switch_error_jumps.clear(); + + // Generate the code. + ret.code = sljit_generate_code(compiler, 0, NULL); + ret.code_size = sljit_get_generated_code_size(compiler); + ret.functions.resize(context->func_labels.size()); + + // Get the function addresses. + for (size_t func_index = 0; func_index < ret.functions.size(); func_index++) { + sljit_label* func_label = context->func_labels[func_index]; + + // If the function wasn't recompiled, don't populate its address. + if (func_label != nullptr) { + ret.functions[func_index] = reinterpret_cast(sljit_get_label_addr(func_label)); + } + } + context->func_labels.clear(); + + // Get the reference symbol jump instruction addresses. + ret.reference_symbol_jumps.resize(context->reference_symbol_jumps.size()); + for (size_t jump_index = 0; jump_index < context->reference_symbol_jumps.size(); jump_index++) { + ReferenceJumpDetails& details = context->reference_symbol_jumps[jump_index].first; + sljit_jump* jump = context->reference_symbol_jumps[jump_index].second; + + ret.reference_symbol_jumps[jump_index].first = details; + ret.reference_symbol_jumps[jump_index].second = reinterpret_cast(jump->addr); + } + context->reference_symbol_jumps.clear(); + + // Get the import jump instruction addresses. + ret.import_jumps_by_index.reserve(context->import_jumps_by_index.size()); + for (auto& [jump_index, jump] : context->import_jumps_by_index) { + ret.import_jumps_by_index.emplace(jump_index, reinterpret_cast(jump->addr)); + } + context->import_jumps_by_index.clear(); + + // Populate label addresses for the jump tables and place them in the output. 
+ for (auto& [labels, jump_table] : context->unlinked_jump_tables) { + for (size_t entry_index = 0; entry_index < labels.size(); entry_index++) { + sljit_label* cur_label = labels[entry_index]; + jump_table[entry_index] = reinterpret_cast(sljit_get_label_addr(cur_label)); + } + ret.jump_tables.emplace_back(std::move(jump_table)); + } + context->unlinked_jump_tables.clear(); + + ret.executable_offset = sljit_get_executable_offset(compiler); + + sljit_free_compiler(compiler); + compiler = nullptr; + errored = false; + + return ret; +} + +N64Recomp::LiveGeneratorOutput::~LiveGeneratorOutput() { + if (code != nullptr) { + sljit_free_code(code, nullptr); + code = nullptr; + } +} + +size_t N64Recomp::LiveGeneratorOutput::num_reference_symbol_jumps() const { + return reference_symbol_jumps.size(); +} + +void N64Recomp::LiveGeneratorOutput::set_reference_symbol_jump(size_t jump_index, recomp_func_t* func) { + const auto& jump_entry = reference_symbol_jumps[jump_index]; + sljit_set_jump_addr(reinterpret_cast(jump_entry.second), reinterpret_cast(func), executable_offset); +} + +N64Recomp::ReferenceJumpDetails N64Recomp::LiveGeneratorOutput::get_reference_symbol_jump_details(size_t jump_index) { + return reference_symbol_jumps[jump_index].first; +} + +void N64Recomp::LiveGeneratorOutput::populate_import_symbol_jumps(size_t import_index, recomp_func_t* func) { + auto find_range = import_jumps_by_index.equal_range(import_index); + for (auto it = find_range.first; it != find_range.second; ++it) { + sljit_set_jump_addr(reinterpret_cast(it->second), reinterpret_cast(func), executable_offset); + } +} + +constexpr int get_gpr_context_offset(int gpr_index) { + return offsetof(recomp_context, r0) + sizeof(recomp_context::r0) * gpr_index; +} + +constexpr int get_fpr_single_context_offset(int fpr_index) { + return offsetof(recomp_context, f0.fl) + sizeof(recomp_context::f0) * fpr_index; +} + +constexpr int get_fpr_double_context_offset(int fpr_index) { + return offsetof(recomp_context, f0.d) + sizeof(recomp_context::f0) * fpr_index; +} + +constexpr int get_fpr_u32l_context_offset(int fpr_index) { + if (fpr_index & 1) { + // TODO implement odd floats. 
+ assert(false); + return -1; + // return fmt::format("ctx->f_odd[({} - 1) * 2]", fpr_index); + } + else { + return offsetof(recomp_context, f0.u32l) + sizeof(recomp_context::f0) * fpr_index; + } +} + +constexpr int get_fpr_u64_context_offset(int fpr_index) { + return offsetof(recomp_context, f0.u64) + sizeof(recomp_context::f0) * fpr_index; +} + +void get_gpr_values(int gpr, sljit_sw& out, sljit_sw& outw) { + if (gpr == 0) { + out = SLJIT_IMM; + outw = 0; + } + else { + out = SLJIT_MEM1(Registers::ctx); + outw = get_gpr_context_offset(gpr); + } +} + +bool get_operand_values(N64Recomp::Operand operand, const N64Recomp::InstructionContext& context, sljit_sw& out, sljit_sw& outw) { + using namespace N64Recomp; + + switch (operand) { + case Operand::Rd: + get_gpr_values(context.rd, out, outw); + break; + case Operand::Rs: + get_gpr_values(context.rs, out, outw); + break; + case Operand::Rt: + get_gpr_values(context.rt, out, outw); + break; + case Operand::Fd: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_single_context_offset(context.fd); + break; + case Operand::Fs: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_single_context_offset(context.fs); + break; + case Operand::Ft: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_single_context_offset(context.ft); + break; + case Operand::FdDouble: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_double_context_offset(context.fd); + break; + case Operand::FsDouble: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_double_context_offset(context.fs); + break; + case Operand::FtDouble: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_double_context_offset(context.ft); + break; + case Operand::FdU32L: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_u32l_context_offset(context.fd); + break; + case Operand::FsU32L: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_u32l_context_offset(context.fs); + break; + case Operand::FtU32L: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_u32l_context_offset(context.ft); + break; + case Operand::FdU32H: + assert(false); + return false; + case Operand::FsU32H: + assert(false); + return false; + case Operand::FtU32H: + assert(false); + return false; + case Operand::FdU64: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_u64_context_offset(context.fd); + break; + case Operand::FsU64: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_u64_context_offset(context.fs); + break; + case Operand::FtU64: + out = SLJIT_MEM1(Registers::ctx); + outw = get_fpr_u64_context_offset(context.ft); + break; + case Operand::ImmU16: + out = SLJIT_IMM; + outw = (sljit_sw)(uint16_t)context.imm16; + break; + case Operand::ImmS16: + out = SLJIT_IMM; + outw = (sljit_sw)(int16_t)context.imm16; + break; + case Operand::Sa: + out = SLJIT_IMM; + outw = context.sa; + break; + case Operand::Sa32: + out = SLJIT_IMM; + outw = context.sa + 32; + break; + case Operand::Cop1cs: + out = Registers::c1cs; + outw = 0; + break; + case Operand::Hi: + out = Registers::hi; + outw = 0; + break; + case Operand::Lo: + out = Registers::lo; + outw = 0; + break; + case Operand::Zero: + out = SLJIT_IMM; + outw = 0; + break; + } + return true; +} + +bool outputs_to_zero(N64Recomp::Operand output, const N64Recomp::InstructionContext& ctx) { + if (output == N64Recomp::Operand::Rd && ctx.rd == 0) { + return true; + } + if (output == N64Recomp::Operand::Rt && ctx.rt == 0) { + return true; + } + if (output == N64Recomp::Operand::Rs && ctx.rs == 0) { + return true; + } + return false; +} + +void 
N64Recomp::LiveGenerator::process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const { + // Skip instructions that output to $zero + if (outputs_to_zero(op.output, ctx)) { + return; + } + + sljit_sw dst; + sljit_sw dstw; + sljit_sw src1; + sljit_sw src1w; + sljit_sw src2; + sljit_sw src2w; + bool output_good = get_operand_values(op.output, ctx, dst, dstw); + bool input0_good = get_operand_values(op.operands.operands[0], ctx, src1, src1w); + bool input1_good = get_operand_values(op.operands.operands[1], ctx, src2, src2w); + + if (!output_good || !input0_good || !input1_good) { + assert(false); + errored = true; + return; + } + + // If a relocation is present, perform the relocation and change src1/src1w to use the relocated value. + if (ctx.reloc_type != RelocType::R_MIPS_NONE) { + // Only allow LO16 relocations. + if (ctx.reloc_type != RelocType::R_MIPS_LO16) { + assert(false); + errored = true; + return; + } + // Only allow relocations on immediates. + if (src2 != SLJIT_IMM) { + assert(false); + errored = true; + return; + } + // Only allow relocations on loads and adds. + switch (op.type) { + case BinaryOpType::LD: + case BinaryOpType::LW: + case BinaryOpType::LWU: + case BinaryOpType::LH: + case BinaryOpType::LHU: + case BinaryOpType::LB: + case BinaryOpType::LBU: + case BinaryOpType::LDL: + case BinaryOpType::LDR: + case BinaryOpType::LWL: + case BinaryOpType::LWR: + case BinaryOpType::Add64: + case BinaryOpType::Add32: + break; + default: + // Relocations aren't allowed on this instruction. + assert(false); + errored = true; + return; + } + // Load the relocated address into temp2. + load_relocated_address(ctx, Registers::arithmetic_temp1); + // Extract the LO16 value from the full address (sign extended lower 16 bits). + sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0); + // Replace the immediate input (src2) with the LO16 value. + src2 = Registers::arithmetic_temp1; + src2w = 0; + } + + // TODO validate that the unary ops are valid for the current binary op. + if (op.operands.operand_operations[0] != UnaryOpType::None && + op.operands.operand_operations[0] != UnaryOpType::ToU64 && + op.operands.operand_operations[0] != UnaryOpType::ToS64 && + op.operands.operand_operations[0] != UnaryOpType::ToU32) + { + assert(false); + errored = true; + return; + } + + if (op.operands.operand_operations[1] != UnaryOpType::None && + op.operands.operand_operations[1] != UnaryOpType::ToU64 && + op.operands.operand_operations[1] != UnaryOpType::ToS64 && + op.operands.operand_operations[1] != UnaryOpType::Mask5 && // Only for 32-bit shifts + op.operands.operand_operations[1] != UnaryOpType::Mask6) // Only for 64-bit shifts + { + assert(false); + errored = true; + return; + } + + bool cmp_unsigned = op.operands.operand_operations[0] != UnaryOpType::ToS64; + + auto sign_extend_and_store = [dst, dstw, this]() { + // Sign extend the result. + sljit_emit_op1(this->compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0); + // Store the result back into the context. 
+ sljit_emit_op1(this->compiler, SLJIT_MOV_P, dst, dstw, Registers::arithmetic_temp1, 0); + }; + + auto do_op32 = [src1, src1w, src2, src2w, this, &sign_extend_and_store](sljit_s32 op) { + sljit_emit_op2(this->compiler, op, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w); + sign_extend_and_store(); + }; + + auto do_op64 = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op) { + sljit_emit_op2(this->compiler, op, dst, dstw, src1, src1w, src2, src2w); + }; + + auto do_float_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op) { + sljit_emit_fop2(this->compiler, op, dst, dstw, src1, src1w, src2, src2w); + }; + + auto do_load_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op, int address_xor) { + // TODO 0 immediate optimization. + + // Add the base and immediate into the arithemtic temp. + sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w); + + if (address_xor != 0) { + // xor the address with the specified amount + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, address_xor); + } + + // Load the value at rdram + address into the arithemtic temp with the given operation to allow for sign-extension or zero-extension. + sljit_emit_op1(compiler, op, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0); + + // Move the arithmetic temp into the destination. + sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, Registers::arithmetic_temp1, 0); + }; + + auto do_compare_op = [cmp_unsigned, dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op_unsigned, sljit_s32 op_signed) { + // Pick the operation based on the signedness of the comparison. + sljit_s32 op = cmp_unsigned ? op_unsigned : op_signed; + + // Pick the flags to set based on the operation. + sljit_s32 flags; + if (op <= SLJIT_NOT_ZERO) { + flags = SLJIT_SET_Z; + } else + { + flags = SLJIT_SET(op); + } + + // Perform a subtraction with the determined flag. + sljit_emit_op2u(compiler, SLJIT_SUB | flags, src1, src1w, src2, src2w); + + // Move the operation's flag into the destination. + sljit_emit_op_flags(compiler, SLJIT_MOV, dst, dstw, op); + }; + + auto do_float_compare_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 flag_op, sljit_s32 set_op, bool double_precision) { + // Pick the operation based on the signedness of the comparison. + sljit_s32 compare_op = set_op | (double_precision ? SLJIT_CMP_F64 : SLJIT_CMP_F32); + + // Perform the comparison with the determined operation. + // Float comparisons use fop1 and put the left hand side in dst. + sljit_emit_fop1(compiler, compare_op, src1, src1w, src2, src2w); + + // Move the operation's flag into the destination. + sljit_emit_op_flags(compiler, SLJIT_MOV, dst, dstw, flag_op); + }; + + auto do_unaligned_load_op = [dst, dstw, src1, src1w, src2, src2w, this](bool left, bool doubleword) { + // TODO 0 immediate optimization. + + // Determine the shift direction to use for calculating the mask and shifting the loaded value. + sljit_sw shift_op = left ? SLJIT_SHL : SLJIT_LSHR; + // Determine the operation's word size. + sljit_sw word_size = doubleword ? 8 : 4; + + // Add the base and immediate into the temp1. + // addr = base + offset + sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w); + + // Mask the address with the alignment mask to get the misalignment and put it in temp2. 
+ // misalignment = addr & (word_size - 1); + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, word_size - 1); + + // Mask the address with ~alignment_mask to get the aligned address and put it in temp1. + // addr = addr & ~(word_size - 1); + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, ~(word_size - 1)); + + // Load the word at rdram + aligned address into the temp1 with sign-extension. + // loaded_value = *addr + if (doubleword) { + // Rotate the loaded doubleword by 32 bits to swap the two words into the right order. + sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32); + } + else { + // Use MOV_S32 to sign-extend the loaded word. + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0); + } + + // Inverse the misalignment if this is a right load. + if (!left) { + // misalignment = (word_size - 1 - misalignment) * 8 + sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp2, 0, SLJIT_IMM, word_size - 1, Registers::arithmetic_temp2, 0); + } + + // Calculate the misalignment shift and put it into temp2. + // misalignment_shift = misalignment * 8 + sljit_emit_op2(compiler, SLJIT_SHL, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, 3); + + // Calculate the misalignment mask and put it into temp3. Use a 32-bit shift if this is a 32-bit operation. + // misalignment_mask = word(-1) SHIFT misalignment_shift + sljit_emit_op2(compiler, doubleword ? shift_op : (shift_op | SLJIT_32), + Registers::arithmetic_temp3, 0, + SLJIT_IMM, doubleword ? uint64_t(-1) : uint32_t(-1), + Registers::arithmetic_temp2, 0); + + if (!doubleword) { + // Sign extend the misalignment mask. + // misalignment_mask = ((uint64_t)(int32_t)misalignment_mask) + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0); + } + + // Shift the loaded value by the misalignment shift and put it into temp1. + // loaded_value SHIFT misalignment_shift + sljit_emit_op2(compiler, shift_op, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0); + + if (left && !doubleword) { + // Sign extend the loaded value. + // loaded_value = (uint64_t)(int32_t)loaded_value + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0); + } + + // Mask the shifted loaded value by the misalignment mask. + // loaded_value &= misalignment_mask + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp3, 0); + + // Invert the misalignment mask and store it into temp3. + // misalignment_mask = ~misalignment_mask + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, SLJIT_IMM, sljit_sw(-1)); + + // Mask the initial value (stored in the destination) with the misalignment mask and place it into temp3. + // masked_value = initial_value & misalignment_mask + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp3, 0, dst, dstw, Registers::arithmetic_temp3, 0); + + // Combine the masked initial value with the shifted loaded value and store it in the destination. 
+ // out = masked_value | loaded_value + sljit_emit_op2(compiler, SLJIT_OR, dst, dstw, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp1, 0); + }; + + switch (op.type) { + // Addition/subtraction + case BinaryOpType::Add32: + do_op32(SLJIT_ADD32); + break; + case BinaryOpType::Sub32: + do_op32(SLJIT_SUB32); + break; + case BinaryOpType::Add64: + do_op64(SLJIT_ADD); + break; + case BinaryOpType::Sub64: + do_op64(SLJIT_SUB); + break; + + // Float arithmetic + case BinaryOpType::AddFloat: + do_float_op(SLJIT_ADD_F32); + break; + case BinaryOpType::AddDouble: + do_float_op(SLJIT_ADD_F64); + break; + case BinaryOpType::SubFloat: + do_float_op(SLJIT_SUB_F32); + break; + case BinaryOpType::SubDouble: + do_float_op(SLJIT_SUB_F64); + break; + case BinaryOpType::MulFloat: + do_float_op(SLJIT_MUL_F32); + break; + case BinaryOpType::MulDouble: + do_float_op(SLJIT_MUL_F64); + break; + case BinaryOpType::DivFloat: + do_float_op(SLJIT_DIV_F32); + break; + case BinaryOpType::DivDouble: + do_float_op(SLJIT_DIV_F64); + break; + + // Bitwise + case BinaryOpType::And64: + do_op64(SLJIT_AND); + break; + case BinaryOpType::Or64: + do_op64(SLJIT_OR); + break; + case BinaryOpType::Nor64: + // Bitwise or the two registers and move the result into the temp, then invert the result and move it into the destination. + sljit_emit_op2(this->compiler, SLJIT_OR, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w); + sljit_emit_op2(this->compiler, SLJIT_XOR, dst, dstw, Registers::arithmetic_temp1, 0, SLJIT_IMM, sljit_sw(-1)); + break; + case BinaryOpType::Xor64: + do_op64(SLJIT_XOR); + break; + case BinaryOpType::Sll32: + // TODO only mask if the second input's op is Mask5. + do_op32(SLJIT_MSHL32); + break; + case BinaryOpType::Sll64: + // TODO only mask if the second input's op is Mask6. + do_op64(SLJIT_MSHL); + break; + case BinaryOpType::Srl32: + // TODO only mask if the second input's op is Mask5. + do_op32(SLJIT_MLSHR32); + break; + case BinaryOpType::Srl64: + // TODO only mask if the second input's op is Mask6. + do_op64(SLJIT_MLSHR); + break; + case BinaryOpType::Sra32: + // Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. + // This means we have to use a 64-bit shift and manually mask the input before shifting. + // TODO only mask if the second input's op is Mask5. + sljit_emit_op2(this->compiler, SLJIT_AND32, Registers::arithmetic_temp1, 0, src2, src2w, SLJIT_IMM, 0b11111); + sljit_emit_op2(this->compiler, SLJIT_MASHR, Registers::arithmetic_temp1, 0, src1, src1w, Registers::arithmetic_temp1, 0); + sign_extend_and_store(); + break; + case BinaryOpType::Sra64: + // TODO only mask if the second input's op is Mask6. 
+ do_op64(SLJIT_MASHR); + break; + + // Comparisons + case BinaryOpType::Equal: + do_compare_op(SLJIT_EQUAL, SLJIT_EQUAL); + break; + case BinaryOpType::NotEqual: + do_compare_op(SLJIT_NOT_EQUAL, SLJIT_NOT_EQUAL); + break; + case BinaryOpType::Less: + do_compare_op(SLJIT_LESS, SLJIT_SIG_LESS); + break; + case BinaryOpType::LessEq: + do_compare_op(SLJIT_LESS_EQUAL, SLJIT_SIG_LESS_EQUAL); + break; + case BinaryOpType::Greater: + do_compare_op(SLJIT_GREATER, SLJIT_SIG_GREATER); + break; + case BinaryOpType::GreaterEq: + do_compare_op(SLJIT_GREATER_EQUAL, SLJIT_SIG_GREATER_EQUAL); + break; + case BinaryOpType::EqualFloat: + do_float_compare_op(SLJIT_F_EQUAL, SLJIT_SET_F_EQUAL, false); + break; + case BinaryOpType::LessFloat: + do_float_compare_op(SLJIT_F_LESS, SLJIT_SET_F_LESS, false); + break; + case BinaryOpType::LessEqFloat: + do_float_compare_op(SLJIT_F_LESS_EQUAL, SLJIT_SET_F_LESS_EQUAL, false); + break; + case BinaryOpType::EqualDouble: + do_float_compare_op(SLJIT_F_EQUAL, SLJIT_SET_F_EQUAL, true); + break; + case BinaryOpType::LessDouble: + do_float_compare_op(SLJIT_F_LESS, SLJIT_SET_F_LESS, true); + break; + case BinaryOpType::LessEqDouble: + do_float_compare_op(SLJIT_F_LESS_EQUAL, SLJIT_SET_F_LESS_EQUAL, true); + break; + + // Loads + case BinaryOpType::LD: + // Add the base and immediate into the arithemtic temp. + sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w); + + // Load the value at rdram + address into the arithemtic temp and rotate it by 32 bits to swap the two words into the right order. + sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32); + + // Move the arithmetic temp into the destination. + sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, Registers::arithmetic_temp1, 0); + break; + case BinaryOpType::LW: + do_load_op(SLJIT_MOV_S32, 0); + break; + case BinaryOpType::LWU: + do_load_op(SLJIT_MOV_U32, 0); + break; + case BinaryOpType::LH: + do_load_op(SLJIT_MOV_S16, 2); + break; + case BinaryOpType::LHU: + do_load_op(SLJIT_MOV_U16, 2); + break; + case BinaryOpType::LB: + do_load_op(SLJIT_MOV_S8, 3); + break; + case BinaryOpType::LBU: + do_load_op(SLJIT_MOV_U8, 3); + break; + case BinaryOpType::LDL: + do_unaligned_load_op(true, true); + break; + case BinaryOpType::LDR: + do_unaligned_load_op(false, true); + break; + case BinaryOpType::LWL: + do_unaligned_load_op(true, false); + break; + case BinaryOpType::LWR: + do_unaligned_load_op(false, false); + break; + default: + assert(false); + errored = true; + return; + } +} + +int32_t do_round_w_s(float num) { + return lroundf(num); +} + +int32_t do_round_w_d(double num) { + return lround(num); +} + +int64_t do_round_l_s(float num) { + return llroundf(num); +} + +int64_t do_round_l_d(double num) { + return llround(num); +} + +int32_t do_ceil_w_s(float num) { + return (int32_t)ceilf(num); +} + +int32_t do_ceil_w_d(double num) { + return (int32_t)ceil(num); +} + +int64_t do_ceil_l_s(float num) { + return (int64_t)ceilf(num); +} + +int64_t do_ceil_l_d(double num) { + return (int64_t)ceil(num); +} + +int32_t do_floor_w_s(float num) { + return (int32_t)floorf(num); +} + +int32_t do_floor_w_d(double num) { + return (int32_t)floor(num); +} + +int64_t do_floor_l_s(float num) { + return (int64_t)floorf(num); +} + +int64_t do_floor_l_d(double num) { + return (int64_t)floor(num); +} + +void N64Recomp::LiveGenerator::load_relocated_address(const InstructionContext& ctx, int reg) const { + // Get the pointer 
to the section address. + int32_t* section_addr_ptr = (ctx.reloc_tag_as_reference ? inputs.reference_section_addresses : inputs.local_section_addresses) + ctx.reloc_section_index; + + // Load the section's address into the target register. + sljit_emit_op1(compiler, SLJIT_MOV_S32, reg, 0, SLJIT_MEM0(), sljit_sw(section_addr_ptr)); + + // Don't emit the add if the offset is zero (small optimization). + if (ctx.reloc_target_section_offset != 0) { + // Add the reloc section offset to the section's address and put the result in R0. + sljit_emit_op2(compiler, SLJIT_ADD, reg, 0, reg, 0, SLJIT_IMM, ctx.reloc_target_section_offset); + } +} + +void N64Recomp::LiveGenerator::process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const { + // Skip instructions that output to $zero + if (outputs_to_zero(op.output, ctx)) { + return; + } + + sljit_sw dst; + sljit_sw dstw; + sljit_sw src; + sljit_sw srcw; + bool output_good = get_operand_values(op.output, ctx, dst, dstw); + bool input_good = get_operand_values(op.input, ctx, src, srcw); + + if (!output_good || !input_good) { + assert(false); + errored = true; + return; + } + + // If a relocation is needed for the input operand, perform the relocation and store the result directly. + if (ctx.reloc_type != RelocType::R_MIPS_NONE) { + // Only allow relocation of lui with an immediate. + if (op.operation != UnaryOpType::Lui || op.input != Operand::ImmU16) { + assert(false); + errored = true; + return; + } + // Only allow HI16 relocs. + if (ctx.reloc_type != RelocType::R_MIPS_HI16) { + assert(false); + errored = true; + return; + } + // Load the relocated address into temp1. + load_relocated_address(ctx, Registers::arithmetic_temp1); + + // HI16 reloc on a lui + // The 32-bit address (a) is equal to section address + section offset + // The 16-bit immediate is equal to (a - (int16_t)a) >> 16 + // Therefore, the register should be set to (int32_t)(a - (int16_t)a) as the shifts cancel out and the lower 16 bits are zero. + + // Extract a sign extended 16-bit value from the lower half of the relocated address and put it in temp2. + sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0); + + // Subtract the sign extended 16-bit value from the full address to get the HI16 value and place it in the destination. 
+ sljit_emit_op2(compiler, SLJIT_SUB, dst, dstw, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0); + return; + } + + sljit_s32 jit_op = SLJIT_BREAKPOINT; + + bool float_op = false; + bool func_float_op = false; + + auto emit_s_func = [this, src, srcw, dst, dstw, &func_float_op](float (*func)(float)) { + func_float_op = true; + + sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(F32, F32), SLJIT_IMM, sljit_sw(func)); + sljit_emit_fop1(compiler, SLJIT_MOV_F32, dst, dstw, SLJIT_RETURN_FREG, 0); + }; + + auto emit_d_func = [this, src, srcw, dst, dstw, &func_float_op](double (*func)(double)) { + func_float_op = true; + + sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(F64, F64), SLJIT_IMM, sljit_sw(func)); + sljit_emit_fop1(compiler, SLJIT_MOV_F64, dst, dstw, SLJIT_RETURN_FREG, 0); + }; + + auto emit_l_from_s_func = [this, src, srcw, dst, dstw, &func_float_op](int64_t (*func)(float)) { + func_float_op = true; + + sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, F32), SLJIT_IMM, sljit_sw(func)); + sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_RETURN_REG, 0); + }; + + auto emit_w_from_s_func = [this, src, srcw, dst, dstw, &func_float_op](int32_t (*func)(float)) { + func_float_op = true; + + sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(32, F32), SLJIT_IMM, sljit_sw(func)); + sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0); + }; + + auto emit_l_from_d_func = [this, src, srcw, dst, dstw, &func_float_op](int64_t (*func)(double)) { + func_float_op = true; + + sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, F64), SLJIT_IMM, sljit_sw(func)); + sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_RETURN_REG, 0); + }; + + auto emit_w_from_d_func = [this, src, srcw, dst, dstw, &func_float_op](int32_t (*func)(double)) { + func_float_op = true; + + sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw); + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(32, F64), SLJIT_IMM, sljit_sw(func)); + sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0); + }; + + switch (op.operation) { + case UnaryOpType::Lui: + if (src != SLJIT_IMM) { + assert(false); + errored = true; + break; + } + src = SLJIT_IMM; + srcw = (sljit_sw)(int32_t)(srcw << 16); + jit_op = SLJIT_MOV; + break; + case UnaryOpType::NegateFloat: + jit_op = SLJIT_NEG_F32; + float_op = true; + break; + case UnaryOpType::NegateDouble: + jit_op = SLJIT_NEG_F64; + float_op = true; + break; + case UnaryOpType::AbsFloat: + jit_op = SLJIT_ABS_F32; + float_op = true; + break; + case UnaryOpType::AbsDouble: + jit_op = SLJIT_ABS_F64; + float_op = true; + break; + case UnaryOpType::SqrtFloat: + emit_s_func(sqrtf); + break; + case UnaryOpType::SqrtDouble: + emit_d_func(sqrt); + break; + case UnaryOpType::ConvertSFromW: + jit_op = SLJIT_CONV_F32_FROM_S32; + float_op = true; + break; + case UnaryOpType::ConvertWFromS: + emit_w_from_s_func(do_cvt_w_s); + break; + case UnaryOpType::ConvertDFromW: + jit_op = SLJIT_CONV_F64_FROM_S32; + float_op = true; + break; + case UnaryOpType::ConvertWFromD: + emit_w_from_d_func(do_cvt_w_d); + break; + case UnaryOpType::ConvertDFromS: + jit_op = SLJIT_CONV_F64_FROM_F32; + float_op = true; + break; + 
case UnaryOpType::ConvertSFromD: + // SLJIT_CONV_F32_FROM_F64 uses the current rounding mode, just as CVT_S_D does. + jit_op = SLJIT_CONV_F32_FROM_F64; + float_op = true; + break; + case UnaryOpType::ConvertDFromL: + jit_op = SLJIT_CONV_F64_FROM_SW; + float_op = true; + break; + case UnaryOpType::ConvertLFromD: + emit_l_from_d_func(do_cvt_l_d); + break; + case UnaryOpType::ConvertSFromL: + jit_op = SLJIT_CONV_F32_FROM_SW; + float_op = true; + break; + case UnaryOpType::ConvertLFromS: + emit_l_from_s_func(do_cvt_l_s); + break; + case UnaryOpType::TruncateWFromS: + // SLJIT_CONV_S32_FROM_F32 rounds towards zero, just as TRUNC_W_S does. + jit_op = SLJIT_CONV_S32_FROM_F32; + float_op = true; + break; + case UnaryOpType::TruncateWFromD: + // SLJIT_CONV_S32_FROM_F64 rounds towards zero, just as TRUNC_W_D does. + jit_op = SLJIT_CONV_S32_FROM_F64; + float_op = true; + break; + case UnaryOpType::TruncateLFromS: + // SLJIT_CONV_SW_FROM_F32 rounds towards zero, just as TRUNC_L_S does. + jit_op = SLJIT_CONV_SW_FROM_F32; + float_op = true; + break; + case UnaryOpType::TruncateLFromD: + // SLJIT_CONV_SW_FROM_F64 rounds towards zero, just as TRUNC_L_D does. + jit_op = SLJIT_CONV_SW_FROM_F64; + float_op = true; + break; + case UnaryOpType::RoundWFromS: + emit_w_from_s_func(do_round_w_s); + break; + case UnaryOpType::RoundWFromD: + emit_w_from_d_func(do_round_w_d); + break; + case UnaryOpType::RoundLFromS: + emit_l_from_s_func(do_round_l_s); + break; + case UnaryOpType::RoundLFromD: + emit_l_from_d_func(do_round_l_d); + break; + case UnaryOpType::CeilWFromS: + emit_w_from_s_func(do_ceil_w_s); + break; + case UnaryOpType::CeilWFromD: + emit_w_from_d_func(do_ceil_w_d); + break; + case UnaryOpType::CeilLFromS: + emit_l_from_s_func(do_ceil_l_s); + break; + case UnaryOpType::CeilLFromD: + emit_l_from_d_func(do_ceil_l_d); + break; + case UnaryOpType::FloorWFromS: + emit_w_from_s_func(do_floor_w_s); + break; + case UnaryOpType::FloorWFromD: + emit_w_from_d_func(do_floor_w_d); + break; + case UnaryOpType::FloorLFromS: + emit_l_from_s_func(do_floor_l_s); + break; + case UnaryOpType::FloorLFromD: + emit_l_from_d_func(do_floor_l_d); + break; + case UnaryOpType::None: + jit_op = SLJIT_MOV; + break; + case UnaryOpType::ToS32: + case UnaryOpType::ToInt32: + jit_op = SLJIT_MOV_S32; + break; + // Unary ops that can't be used as a standalone operation + case UnaryOpType::ToU32: + case UnaryOpType::ToS64: + case UnaryOpType::ToU64: + case UnaryOpType::Mask5: + case UnaryOpType::Mask6: + assert(false && "Unsupported unary op"); + errored = true; + return; + } + + if (func_float_op) { + // Already handled by the lambda. + } + else if (float_op) { + sljit_emit_fop1(compiler, jit_op, dst, dstw, src, srcw); + } + else { + sljit_emit_op1(compiler, jit_op, dst, dstw, src, srcw); + } +} + +void N64Recomp::LiveGenerator::process_store_op(const StoreOp& op, const InstructionContext& ctx) const { + sljit_sw src; + sljit_sw srcw; + sljit_sw imm = (sljit_sw)(int16_t)ctx.imm16; + + get_operand_values(op.value_input, ctx, src, srcw); + + // Only LO16 relocs are valid on stores. + if (ctx.reloc_type != RelocType::R_MIPS_NONE && ctx.reloc_type != RelocType::R_MIPS_LO16) { + assert(false); + errored = true; + return; + } + + if (ctx.reloc_type == RelocType::R_MIPS_LO16) { + // Load the relocated address into temp1. + load_relocated_address(ctx, Registers::arithmetic_temp1); + // Extract the LO16 value from the full address (sign extended lower 16 bits). 
+ sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0); + // Add the base register (rs) to the LO16 immediate. + sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(ctx.rs)); + } + else { + // TODO 0 immediate optimization. + + // Add the base register (rs) and the immediate to get the address and store it in the arithemtic temp. + sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(ctx.rs), SLJIT_IMM, imm); + } + + auto do_unaligned_store_op = [src, srcw, this](bool left, bool doubleword) { + // Determine the shift direction to use for calculating the mask and shifting the loaded value. + sljit_sw shift_op = left ? SLJIT_LSHR : SLJIT_SHL; + // Determine the operation's word size. + sljit_sw word_size = doubleword ? 8 : 4; + + // Mask the address with the alignment mask to get the misalignment and put it in temp2. + // misalignment = addr & (word_size - 1); + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, word_size - 1); + + // Mask the address with ~alignment_mask to get the aligned address and put it in temp1. + // addr = addr & ~(word_size - 1); + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, ~(word_size - 1)); + + // Load the word at rdram + aligned address into the temp1 with sign-extension. + // loaded_value = *addr + if (doubleword) { + // Rotate the loaded doubleword by 32 bits to swap the two words into the right order. + sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp3, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32); + } + else { + // Use MOV_S32 to sign-extend the loaded word. + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp3, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0); + } + + // Inverse the misalignment if this is a right load. + if (!left) { + // misalignment = (word_size - 1 - misalignment) * 8 + sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp2, 0, SLJIT_IMM, word_size - 1, Registers::arithmetic_temp2, 0); + } + + // Calculate the misalignment shift and put it into temp2. + // misalignment_shift = misalignment * 8 + sljit_emit_op2(compiler, SLJIT_SHL, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, 3); + + // Shift the input value by the misalignment shift and put it into temp4. + // input_value SHIFT= misalignment_shift + sljit_emit_op2(compiler, shift_op, Registers::arithmetic_temp4, 0, src, srcw, Registers::arithmetic_temp2, 0); + + // Calculate the misalignment mask and put it into temp2. Use a 32-bit shift if this is a 32-bit operation. + // misalignment_mask = word(-1) SHIFT misalignment_shift + sljit_emit_op2(compiler, doubleword ? shift_op : (shift_op | SLJIT_32), + Registers::arithmetic_temp2, 0, + SLJIT_IMM, doubleword ? uint64_t(-1) : uint32_t(-1), + Registers::arithmetic_temp2, 0); + + // Mask the input value with the misalignment mask and place it into temp4. + // masked_value = shifted_value & misalignment_mask + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp2, 0); + + // Invert the misalignment mask and store it into temp2. 
+ // misalignment_mask = ~misalignment_mask + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, sljit_sw(-1)); + + // Mask the loaded value by the misalignment mask. + // input_value &= misalignment_mask + sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp2, 0); + + // Combine the masked initial value with the shifted loaded value and store it in the destination. + // out = masked_value | input_value + if (doubleword) { + // Combine the values into a temp so that it can be rotated to the correct word order. + sljit_emit_op2(compiler, SLJIT_OR, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp3, 0); + sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, Registers::arithmetic_temp4, 0, SLJIT_IMM, 32); + } + else { + sljit_emit_op2(compiler, SLJIT_OR32, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp3, 0); + } + }; + + switch (op.type) { + case StoreOpType::SD: + case StoreOpType::SDC1: + // Rotate the arithmetic temp by 32 bits to swap the words and move it into the destination. + sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw, SLJIT_IMM, 32); + break; + case StoreOpType::SDL: + do_unaligned_store_op(true, true); + break; + case StoreOpType::SDR: + do_unaligned_store_op(false, true); + break; + case StoreOpType::SW: + case StoreOpType::SWC1: + // store the 32-bit value at address + rdram + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw); + break; + case StoreOpType::SWL: + do_unaligned_store_op(true, false); + break; + case StoreOpType::SWR: + do_unaligned_store_op(false, false); + break; + case StoreOpType::SH: + // xor the address with 2 + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, 2); + // store the 16-bit value at address + rdram + sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw); + break; + case StoreOpType::SB: + // xor the address with 3 + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, 3); + // store the 8-bit value at address + rdram + sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw); + break; + } +} + +void N64Recomp::LiveGenerator::emit_function_start(const std::string& function_name, size_t func_index) const { + context->function_name = function_name; + context->func_labels[func_index] = sljit_emit_label(compiler); + // sljit_emit_op0(compiler, SLJIT_BREAKPOINT); + sljit_emit_enter(compiler, 0, SLJIT_ARGS2V(P, P), 4 | SLJIT_ENTER_FLOAT(1), 5 | SLJIT_ENTER_FLOAT(0), 0); + sljit_emit_op2(compiler, SLJIT_SUB, Registers::rdram, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); +} + +void N64Recomp::LiveGenerator::emit_function_end() const { + // Check that all jumps have been paired to a label. + if (!context->pending_jumps.empty()) { + assert(false); + errored = true; + } + + // Populate the labels for pending switches and move them into the unlinked jump tables. 
+ bool invalid_switch = false; + for (size_t switch_index = 0; switch_index < context->switch_jump_labels.size(); switch_index++) { + const std::vector& cur_labels = context->switch_jump_labels[switch_index]; + std::vector cur_label_addrs{}; + cur_label_addrs.resize(cur_labels.size()); + for (size_t case_index = 0; case_index < cur_labels.size(); case_index++) { + // Find the label. + auto find_it = context->labels.find(cur_labels[case_index]); + if (find_it == context->labels.end()) { + // Label not found, invalid switch. + // Track this in a variable instead of returning immediately so that the pending labels are still cleared. + invalid_switch = true; + break; + } + cur_label_addrs[case_index] = find_it->second; + } + context->unlinked_jump_tables.emplace_back( + std::make_pair, std::unique_ptr>( + std::move(cur_label_addrs), + std::move(context->pending_jump_tables[switch_index]) + ) + ); + } + context->switch_jump_labels.clear(); + context->pending_jump_tables.clear(); + + // Clear the labels to prevent labels from one function being jumped to by another. + context->labels.clear(); + + if (invalid_switch) { + assert(false); + errored = true; + } +} + +void N64Recomp::LiveGenerator::emit_function_call_lookup(uint32_t addr) const { + // Load the address immediate into the first argument. + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, int32_t(addr)); + + // Call get_function. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, 32), SLJIT_IMM, sljit_sw(inputs.get_function)); + + // Copy the return value into R2 so that it can be used for icall + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_R0, 0); + + // Load rdram and ctx into R0 and R1. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0); + + // Call the function. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P), SLJIT_R2, 0); +} + +void N64Recomp::LiveGenerator::emit_function_call_by_register(int reg) const { + // Load the register's value into the first argument. + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg)); + + // Call get_function. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, 32), SLJIT_IMM, sljit_sw(inputs.get_function)); + + // Copy the return value into R2 so that it can be used for icall + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_R0, 0); + + // Load rdram and ctx into R0 and R1. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0); + + // Call the function. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P), SLJIT_R2, 0); +} + +void N64Recomp::LiveGenerator::emit_function_call_reference_symbol(const Context&, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const { + (void)symbol_index; + + // Load rdram and ctx into R0 and R1. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0); + // sljit_emit_op0(compiler, SLJIT_BREAKPOINT); + // Call the function and save the jump to set its label later on. + sljit_jump* call_jump = sljit_emit_call(compiler, SLJIT_CALL | SLJIT_REWRITABLE_JUMP, SLJIT_ARGS2V(P, P)); + // Set a dummy jump value, this will get replaced during reference/import symbol jump population. 
+ if (section_index == N64Recomp::SectionImport) { + sljit_set_target(call_jump, sljit_uw(-1)); + context->import_jumps_by_index.emplace(symbol_index, call_jump); + } + else { + sljit_set_target(call_jump, sljit_uw(-2)); + context->reference_symbol_jumps.emplace_back(std::make_pair( + ReferenceJumpDetails{ + .section = section_index, + .section_offset = target_section_offset + }, + call_jump + )); + } +} + +void N64Recomp::LiveGenerator::emit_function_call(const Context&, size_t function_index) const { + // Load rdram and ctx into R0 and R1. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0); + // Call the function and save the jump to set its label later on. + sljit_jump* call_jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P)); + context->inner_calls.emplace_back(InnerCall{ .target_func_index = function_index, .jump = call_jump }); +} + +void N64Recomp::LiveGenerator::emit_named_function_call(const std::string& function_name) const { + // The live recompiler can't call functions by name. This is only used for statics, so it's not an issue. + assert(false); + errored = true; +} + +void N64Recomp::LiveGenerator::emit_goto(const std::string& target) const { + sljit_jump* jump = sljit_emit_jump(compiler, SLJIT_JUMP); + // Check if the label already exists. + auto find_it = context->labels.find(target); + if (find_it != context->labels.end()) { + sljit_set_label(jump, find_it->second); + } + // It doesn't, so queue this as a pending jump to be resolved later. + else { + context->pending_jumps[target].push_back(jump); + } +} + +void N64Recomp::LiveGenerator::emit_label(const std::string& label_name) const { + sljit_label* label = sljit_emit_label(compiler); + + // Check if there are any pending jumps for this label and assign them if so. + auto find_it = context->pending_jumps.find(label_name); + if (find_it != context->pending_jumps.end()) { + for (sljit_jump* jump : find_it->second) { + sljit_set_label(jump, label); + } + + // Remove the pending jumps for this label. + context->pending_jumps.erase(find_it); + } + + context->labels.emplace(label_name, label); +} + +void N64Recomp::LiveGenerator::emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const { + (void)jtbl; + (void)reg; + // Nothing to do here, the live recompiler performs a subtraction to get the switch's case. +} + +void N64Recomp::LiveGenerator::emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const { + // Make sure there's no pending jump. + if(context->cur_branch_jump != nullptr) { + assert(false); + errored = true; + return; + } + + // Branch conditions do not allow unary ops, except for ToS64 on the first operand to indicate the branch comparison is signed. + if(op.operands.operand_operations[0] != UnaryOpType::None && op.operands.operand_operations[0] != UnaryOpType::ToS64) { + assert(false); + errored = true; + return; + } + + if (op.operands.operand_operations[1] != UnaryOpType::None) { + assert(false); + errored = true; + return; + } + + sljit_s32 condition_type; + bool cmp_signed = op.operands.operand_operations[0] == UnaryOpType::ToS64; + // Comparisons need to be inverted to account for the fact that the generator is expected to generate a code block that only runs if + // the condition is met, meaning the branch should be taken if the condition isn't met. 
+ switch (op.comparison) { + case BinaryOpType::Equal: + condition_type = SLJIT_NOT_EQUAL; + break; + case BinaryOpType::NotEqual: + condition_type = SLJIT_EQUAL; + break; + case BinaryOpType::GreaterEq: + if (cmp_signed) { + condition_type = SLJIT_SIG_LESS; + } + else { + condition_type = SLJIT_LESS; + } + break; + case BinaryOpType::Greater: + if (cmp_signed) { + condition_type = SLJIT_SIG_LESS_EQUAL; + } + else { + condition_type = SLJIT_LESS_EQUAL; + } + break; + case BinaryOpType::LessEq: + if (cmp_signed) { + condition_type = SLJIT_SIG_GREATER; + } + else { + condition_type = SLJIT_GREATER; + } + break; + case BinaryOpType::Less: + if (cmp_signed) { + condition_type = SLJIT_SIG_GREATER_EQUAL; + } + else { + condition_type = SLJIT_GREATER_EQUAL; + } + break; + default: + assert(false && "Invalid branch condition comparison operation!"); + errored = true; + return; + } + sljit_sw src1; + sljit_sw src1w; + sljit_sw src2; + sljit_sw src2w; + + get_operand_values(op.operands.operands[0], ctx, src1, src1w); + get_operand_values(op.operands.operands[1], ctx, src2, src2w); + + // Relocations aren't valid on conditional branches. + if(ctx.reloc_type != RelocType::R_MIPS_NONE) { + assert(false); + errored = true; + return; + } + + // Create a compare jump and track it as the pending branch jump. + context->cur_branch_jump = sljit_emit_cmp(compiler, condition_type, src1, src1w, src2, src2w); +} + +void N64Recomp::LiveGenerator::emit_branch_close() const { + // Make sure there's a pending branch jump. + if(context->cur_branch_jump == nullptr) { + assert(false); + errored = true; + return; + } + + // Assign a label at this point to the pending branch jump and clear it. + sljit_set_label(context->cur_branch_jump, sljit_emit_label(compiler)); + context->cur_branch_jump = nullptr; +} + +void N64Recomp::LiveGenerator::emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const { + // Populate the switch's labels. + std::vector cur_labels{}; + cur_labels.resize(jtbl.entries.size()); + for (size_t i = 0; i < cur_labels.size(); i++) { + cur_labels[i] = fmt::format("L_{:08X}", jtbl.entries[i]); + } + context->switch_jump_labels.emplace_back(std::move(cur_labels)); + + // Allocate the jump table. + std::unique_ptr cur_jump_table = std::make_unique(jtbl.entries.size()); + + /// Codegen + + // Load the jump target register. The lw instruction was patched into an addiu, so this holds + // the address of the jump table entry instead of the actual jump target. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg)); + // Subtract the jump table's address from the jump target to get the jump table addend. + // Sign extend the jump table address to 64 bits so that the entire register's contents are used instead of just the lower 32 bits. + const auto& jtbl_section = recompiler_context.sections[jtbl.section_index]; + if (jtbl_section.relocatable) { + // Make a dummy instruction context to pass to `load_relocated_address`. + InstructionContext dummy_context{}; + + // Get the relocated address of the jump table. + uint32_t section_offset = jtbl.vram - jtbl_section.ram_addr; + + // Populate the necessary fields of the dummy context and load the relocated address into temp2. 
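+        // (The relocated value is the jump table's runtime base address, so the subtraction below
+        // recovers the original byte offset into the table.)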
+ dummy_context.reloc_section_index = jtbl.section_index; + dummy_context.reloc_target_section_offset = section_offset; + load_relocated_address(dummy_context, Registers::arithmetic_temp2); + + // Subtract the relocated jump table start address from the loaded address. + sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0); + } + else { + sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, (sljit_sw)((int32_t)jtbl.vram)); + } + + // Bounds check the addend. If it's greater than or equal to the jump table size (entries * sizeof(u32)) then jump to the switch error. + sljit_jump* switch_error_jump = sljit_emit_cmp(compiler, SLJIT_GREATER_EQUAL, Registers::arithmetic_temp1, 0, SLJIT_IMM, jtbl.entries.size() * sizeof(uint32_t)); + context->switch_error_jumps.emplace_back(SwitchErrorJump{.instr_vram = jtbl.jr_vram, .jtbl_vram = jtbl.vram, .jump = switch_error_jump}); + + // Multiply the jump table addend by 2 to get the addend for the real jump table. (4 bytes per entry to 8 bytes per entry). + sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0); + // Load the real jump table address. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp2, 0, SLJIT_IMM, (sljit_sw)cur_jump_table.get()); + // Load the real jump entry. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::arithmetic_temp1, Registers::arithmetic_temp2), 0); + // Jump to the loaded entry. + sljit_emit_ijump(compiler, SLJIT_JUMP, Registers::arithmetic_temp1, 0); + + // Move the jump table into the pending jump tables. + context->pending_jump_tables.emplace_back(std::move(cur_jump_table)); +} + +void N64Recomp::LiveGenerator::emit_case(int case_index, const std::string& target_label) const { + (void)case_index; + (void)target_label; + // Nothing to do here, the jump table is built in emit_switch. +} + +void N64Recomp::LiveGenerator::emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const { + (void)instr_vram; + (void)jtbl_vram; + // Nothing to do here, the jump table is built in emit_switch. +} + +void N64Recomp::LiveGenerator::emit_switch_close() const { + // Nothing to do here, the jump table is built in emit_switch. +} + +void N64Recomp::LiveGenerator::emit_return() const { + sljit_emit_return_void(compiler); +} + +void N64Recomp::LiveGenerator::emit_check_fr(int fpr) const { + (void)fpr; + // Nothing to do here. +} + +void N64Recomp::LiveGenerator::emit_check_nan(int fpr, bool is_double) const { + (void)fpr; + (void)is_double; + // Nothing to do here. +} + +void N64Recomp::LiveGenerator::emit_cop0_status_read(int reg) const { + // Skip the read if the target is the zero register. + if (reg != 0) { + // Load ctx into R0. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, Registers::ctx, 0); + + // Call cop0_status_read. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P,32), SLJIT_IMM, sljit_sw(inputs.cop0_status_read)); + + // Store the result in the output register. 
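+        // The return value comes back in SLJIT_R0, which doubles as sljit's return register.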
+ sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg), SLJIT_R0, 0); + } +} + +void N64Recomp::LiveGenerator::emit_cop0_status_write(int reg) const { + sljit_sw src; + sljit_sw srcw; + get_gpr_values(reg, src, srcw); + + // Load ctx and the input register value into R0 and R1 + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, Registers::ctx, 0); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, src, srcw); + + // Call cop0_status_write. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P,32), SLJIT_IMM, sljit_sw(inputs.cop0_status_write)); +} + +void N64Recomp::LiveGenerator::emit_cop1_cs_read(int reg) const { + // Skip the read if the target is the zero register. + if (reg != 0) { + sljit_sw dst; + sljit_sw dstw; + get_gpr_values(reg, dst, dstw); + + // Call get_cop1_cs. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS0(32), SLJIT_IMM, sljit_sw(get_cop1_cs)); + + // Store the result in the output register. + sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0); + } +} + +void N64Recomp::LiveGenerator::emit_cop1_cs_write(int reg) const { + sljit_sw src; + sljit_sw srcw; + get_gpr_values(reg, src, srcw); + + // Load the input register value into R0. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw); + + // Call set_cop1_cs. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(32), SLJIT_IMM, sljit_sw(set_cop1_cs)); +} + +void N64Recomp::LiveGenerator::emit_muldiv(InstrId instr_id, int reg1, int reg2) const { + sljit_sw src1; + sljit_sw src1w; + sljit_sw src2; + sljit_sw src2w; + get_gpr_values(reg1, src1, src1w); + get_gpr_values(reg2, src2, src2w); + + auto do_mul32_op = [src1, src1w, src2, src2w, this](bool is_signed) { + // Load the two inputs into the multiplication input registers (R0/R1). + if (is_signed) { + // 32-bit signed multiplication is really 64 bits * 35 bits, so load accordingly. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src1, src1w); + + // Sign extend to 35 bits by shifting left by 64 - 35 and then shifting right by the same amount. + sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, src2, src2w, SLJIT_IMM, 64 - 35); + sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 64 - 35); + } + else { + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R0, 0, src1, src1w); + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R1, 0, src2, src2w); + } + + // Perform the multiplication. + sljit_emit_op0(compiler, is_signed ? SLJIT_LMUL_SW : SLJIT_LMUL_UW); + + // Move the results into hi and lo with sign extension. + sljit_emit_op2(compiler, SLJIT_ASHR, Registers::hi, 0, SLJIT_R0, 0, SLJIT_IMM, 32); + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::lo, 0, SLJIT_R0, 0); + }; + + auto do_mul64_op = [src1, src1w, src2, src2w, this](bool is_signed) { + // Load the two inputs into the multiplication input registers (R0/R1). + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src1, src1w); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, src2, src2w); + + // Perform the multiplication. + sljit_emit_op0(compiler, is_signed ? SLJIT_LMUL_SW : SLJIT_LMUL_UW); + + // Move the results into hi and lo. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::hi, 0, SLJIT_R1, 0); + sljit_emit_op1(compiler, SLJIT_MOV, Registers::lo, 0, SLJIT_R0, 0); + }; + + auto do_div_op = [src1, src1w, src2, src2w, this](bool doubleword, bool is_signed) { + // Pick the division opcode based on the bit width and signedness. 
+ // Note that the 64-bit division opcode is used for 32-bit signed division to match hardware behavior and prevent overflow. + sljit_sw div_opcode = doubleword ? + (is_signed ? SLJIT_DIVMOD_SW : SLJIT_DIVMOD_UW) : + (is_signed ? SLJIT_DIVMOD_SW : SLJIT_DIVMOD_U32); + + // Pick the move opcode to use for loading the operands. + sljit_sw load_opcode = doubleword ? SLJIT_MOV : + (is_signed ? SLJIT_MOV_S32 : SLJIT_MOV_U32); + + // Pick the move opcode to use for saving the results. + sljit_sw save_opcode = doubleword ? SLJIT_MOV : SLJIT_MOV_S32; + + // Load the two inputs into R0 and R1 (the numerator and denominator). + sljit_emit_op1(compiler, load_opcode, SLJIT_R0, 0, src1, src1w); + + // TODO figure out 32-bit signed division behavior when inputs aren't properly sign extended. + // if (!doubleword && is_signed) { + // // Sign extend to 35 bits by shifting left by 64 - 35 and then shifting right by the same amount. + // sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, src2, src2w, SLJIT_IMM, 64 - 35); + // sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 64 - 35); + // } + // else { + sljit_emit_op1(compiler, load_opcode, SLJIT_R1, 0, src2, src2w); + // } + + // Prevent overflow on 64-bit signed division. + if (doubleword && is_signed) { + // If the numerator is INT64_MIN and the denominator is -1, an overflow will occur. To prevent an exception and + // behave as the original hardware would, check if either of those conditions are false. + // If neither condition is false (i.e. both are true), set the denominator to 1. + + // Xor the numerator with INT64_MIN. This will be zero if they're equal. + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, sljit_sw(INT64_MIN)); + + // Invert the denominator. This will be zero if it's -1. + sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, sljit_sw(-1)); + + // Or the results of the previous two calculations and set the zero flag. This will be zero if both conditions were met. + sljit_emit_op2(compiler, SLJIT_OR | SLJIT_SET_Z, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp4, 0); + + // If the zero flag is 0, meaning both conditions were true, replace the denominator with 1. + // i.e. conditionally move an immediate of 1 into arithmetic temp 2 if the zero flag is 0. + sljit_emit_select(compiler, SLJIT_ZERO, SLJIT_R1, SLJIT_IMM, 1, SLJIT_R1); + } + + // If the denominator is 0, skip the division and jump the special handling for that case. + // Branch past the division if the divisor is 0. + sljit_jump* jump_skip_division = sljit_emit_cmp(compiler, SLJIT_EQUAL, SLJIT_R1, 0, SLJIT_IMM, 0);// sljit_emit_jump(compiler, SLJIT_ZERO); + + // Perform the division. + sljit_emit_op0(compiler, div_opcode); + + // Extract the remainder and quotient into the high and low registers respectively. + sljit_emit_op1(compiler, save_opcode, Registers::hi, 0, SLJIT_R1, 0); + sljit_emit_op1(compiler, save_opcode, Registers::lo, 0, SLJIT_R0, 0); + + // Jump to the end of this routine. + sljit_jump* jump_to_end = sljit_emit_jump(compiler, SLJIT_JUMP); + + // Emit a label and set it as the target of the jump if the denominator was zero. + sljit_label* after_division = sljit_emit_label(compiler); + sljit_set_label(jump_skip_division, after_division); + + // Move the numerator into hi. 
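+        // Divide-by-zero path: HI keeps the numerator and LO is set to -1 (or +1 for a negative
+        // signed numerator), mirroring the behavior expected from the original hardware.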
+ sljit_emit_op1(compiler, save_opcode, Registers::hi, 0, SLJIT_R0, 0); + + if (is_signed) { + // Calculate the negative signum of the numerator and place it in lo. + // neg_signum = ((int64_t)(~x) >> (bit width - 1)) | 1 + sljit_emit_op2(compiler, SLJIT_XOR, Registers::lo, 0, SLJIT_R0, 0, SLJIT_IMM, sljit_sw(-1)); + sljit_emit_op2(compiler, SLJIT_ASHR, Registers::lo, 0, Registers::lo, 0, SLJIT_IMM, 64 - 1); + sljit_emit_op2(compiler, SLJIT_OR, Registers::lo, 0, Registers::lo, 0, SLJIT_IMM, 1); + } + else { + // Move -1 into lo. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::lo, 0, SLJIT_IMM, sljit_sw(-1)); + } + + // Emit a label and set it as the target of the jump after the divison. + sljit_label* end_label = sljit_emit_label(compiler); + sljit_set_label(jump_to_end, end_label); + }; + + + switch (instr_id) { + case InstrId::cpu_mult: + do_mul32_op(true); + break; + case InstrId::cpu_multu: + do_mul32_op(false); + break; + case InstrId::cpu_dmult: + do_mul64_op(true); + break; + case InstrId::cpu_dmultu: + do_mul64_op(false); + break; + case InstrId::cpu_div: + do_div_op(false, true); + break; + case InstrId::cpu_divu: + do_div_op(false, false); + break; + case InstrId::cpu_ddiv: + do_div_op(true, true); + break; + case InstrId::cpu_ddivu: + do_div_op(true, false); + break; + default: + assert(false && "Invalid mul/div instruction id!"); + break; + } +} + +void N64Recomp::LiveGenerator::emit_syscall(uint32_t instr_vram) const { + // Load rdram and ctx into R0 and R1. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0); + // Load the vram into R2. + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, instr_vram); + // Call syscall_handler. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, P, 32), SLJIT_IMM, sljit_sw(inputs.syscall_handler)); +} + +void N64Recomp::LiveGenerator::emit_do_break(uint32_t instr_vram) const { + // Load the vram into R0. + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, instr_vram); + // Call do_break. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(32), SLJIT_IMM, sljit_sw(inputs.do_break)); +} + +void N64Recomp::LiveGenerator::emit_pause_self() const { + // Load rdram into R0. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + // Call pause_self. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(P), SLJIT_IMM, sljit_sw(inputs.pause_self)); +} + +void N64Recomp::LiveGenerator::emit_trigger_event(uint32_t event_index) const { + // Load rdram and ctx into R0 and R1. + sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0); + // Load the global event index into R2. + sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, event_index + inputs.base_event_index); + // Call trigger_event. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(P), SLJIT_IMM, sljit_sw(inputs.trigger_event)); +} + +void N64Recomp::LiveGenerator::emit_comment(const std::string& comment) const { + (void)comment; + // Nothing to do here. 
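+    // (The live generator emits machine code directly, so there is nowhere to place textual comments.)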
+} + +bool N64Recomp::recompile_function_live(LiveGenerator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span> static_funcs_out, bool tag_reference_relocs) { + return recompile_function_custom(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs); +} + diff --git a/LiveRecomp/live_recompiler_test.cpp b/LiveRecomp/live_recompiler_test.cpp new file mode 100644 index 0000000..c5673eb --- /dev/null +++ b/LiveRecomp/live_recompiler_test.cpp @@ -0,0 +1,364 @@ +#include +#include +#include +#include + +#include "sljitLir.h" +#include "recompiler/live_recompiler.h" +#include "recomp.h" + +static std::vector read_file(const std::filesystem::path& path, bool& found) { + std::vector ret; + found = false; + + std::ifstream file{ path, std::ios::binary}; + + if (file.good()) { + file.seekg(0, std::ios::end); + ret.resize(file.tellg()); + file.seekg(0, std::ios::beg); + + file.read(reinterpret_cast(ret.data()), ret.size()); + found = true; + } + + return ret; +} + + +uint32_t read_u32_swap(const std::vector& vec, size_t offset) { + return byteswap(*reinterpret_cast(&vec[offset])); +} + +uint32_t read_u32(const std::vector& vec, size_t offset) { + return *reinterpret_cast(&vec[offset]); +} + +std::vector rdram; + +void byteswap_copy(uint8_t* dst, uint8_t* src, size_t count) { + for (size_t i = 0; i < count; i++) { + dst[i ^ 3] = src[i]; + } +} + +bool byteswap_compare(uint8_t* a, uint8_t* b, size_t count) { + for (size_t i = 0; i < count; i++) { + if (a[i ^ 3] != b[i]) { + return false; + } + } + return true; +} + +enum class TestError { + Success, + FailedToOpenInput, + FailedToRecompile, + UnknownStructType, + DataDifference +}; + +struct TestStats { + TestError error; + uint64_t codegen_microseconds; + uint64_t execution_microseconds; + uint64_t code_size; +}; + +void write1(uint8_t* rdram, recomp_context* ctx) { + MEM_B(0, ctx->r4) = 1; +} + +recomp_func_t* test_get_function(int32_t vram) { + if (vram == 0x80100000) { + return write1; + } + assert(false); + return nullptr; +} + +void test_switch_error(const char* func, uint32_t vram, uint32_t jtbl) { + printf(" Switch-case out of bounds in %s at 0x%08X for jump table at 0x%08X\n", func, vram, jtbl); +} + +TestStats run_test(const std::filesystem::path& tests_dir, const std::string& test_name) { + std::filesystem::path input_path = tests_dir / (test_name + "_data.bin"); + std::filesystem::path data_dump_path = tests_dir / (test_name + "_data_out.bin"); + + bool found; + std::vector file_data = read_file(input_path, found); + + if (!found) { + printf("Failed to open file: %s\n", input_path.string().c_str()); + return { TestError::FailedToOpenInput }; + } + + // Parse the test file. + uint32_t text_offset = read_u32_swap(file_data, 0x00); + uint32_t text_length = read_u32_swap(file_data, 0x04); + uint32_t init_data_offset = read_u32_swap(file_data, 0x08); + uint32_t good_data_offset = read_u32_swap(file_data, 0x0C); + uint32_t data_length = read_u32_swap(file_data, 0x10); + uint32_t text_address = read_u32_swap(file_data, 0x14); + uint32_t data_address = read_u32_swap(file_data, 0x18); + uint32_t next_struct_address = read_u32_swap(file_data, 0x1C); + + recomp_context ctx{}; + + byteswap_copy(&rdram[text_address - 0x80000000], &file_data[text_offset], text_length); + byteswap_copy(&rdram[data_address - 0x80000000], &file_data[init_data_offset], data_length); + + // Build recompiler context. + N64Recomp::Context context{}; + + // Move the file data into the context. 
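+    // All later parsing of descriptor structs and instruction words reads straight out of context.rom,
+    // so no separate copy of the test binary is kept.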
+ context.rom = std::move(file_data); + + context.sections.resize(2); + // Create a section for the function to exist in. + context.sections[0].ram_addr = text_address; + context.sections[0].rom_addr = text_offset; + context.sections[0].size = text_length; + context.sections[0].name = ".text"; + context.sections[0].executable = true; + context.sections[0].relocatable = true; + context.section_functions.resize(context.sections.size()); + // Create a section for .data (used for relocations) + context.sections[1].ram_addr = data_address; + context.sections[1].rom_addr = init_data_offset; + context.sections[1].size = data_length; + context.sections[1].name = ".data"; + context.sections[1].executable = false; + context.sections[1].relocatable = true; + + size_t start_func_index; + uint32_t function_desc_address = 0; + uint32_t reloc_desc_address = 0; + + // Read any extra structs. + while (next_struct_address != 0) { + uint32_t cur_struct_address = next_struct_address; + uint32_t struct_type = read_u32_swap(context.rom, next_struct_address + 0x00); + next_struct_address = read_u32_swap(context.rom, next_struct_address + 0x04); + + switch (struct_type) { + case 1: // Function desc + function_desc_address = cur_struct_address; + break; + case 2: // Relocation + reloc_desc_address = cur_struct_address; + break; + default: + printf("Unknown struct type %u\n", struct_type); + return { TestError::UnknownStructType }; + } + } + + // Check if a function description exists. + if (function_desc_address == 0) { + // No function description, so treat the whole thing as one function. + + // Get the function's instruction words. + std::vector text_words{}; + text_words.resize(text_length / sizeof(uint32_t)); + for (size_t i = 0; i < text_words.size(); i++) { + text_words[i] = read_u32(context.rom, text_offset + i * sizeof(uint32_t)); + } + + // Add the function to the context. + context.functions_by_vram[text_address].emplace_back(context.functions.size()); + context.section_functions.emplace_back(context.functions.size()); + context.sections[0].function_addrs.emplace_back(text_address); + context.functions.emplace_back( + text_address, + text_offset, + text_words, + "test_func", + 0 + ); + start_func_index = 0; + } + else { + // Use the function description. + uint32_t num_funcs = read_u32_swap(context.rom, function_desc_address + 0x08); + start_func_index = read_u32_swap(context.rom, function_desc_address + 0x0C); + + for (size_t func_index = 0; func_index < num_funcs; func_index++) { + uint32_t cur_func_address = read_u32_swap(context.rom, function_desc_address + 0x10 + 0x00 + 0x08 * func_index); + uint32_t cur_func_length = read_u32_swap(context.rom, function_desc_address + 0x10 + 0x04 + 0x08 * func_index); + uint32_t cur_func_offset = cur_func_address - text_address + text_offset; + + // Get the function's instruction words. + std::vector text_words{}; + text_words.resize(cur_func_length / sizeof(uint32_t)); + for (size_t i = 0; i < text_words.size(); i++) { + text_words[i] = read_u32(context.rom, cur_func_offset + i * sizeof(uint32_t)); + } + + // Add the function to the context. 
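+            // The start_func_index read from the descriptor selects which of these functions the
+            // harness calls once code generation finishes.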
+ context.functions_by_vram[cur_func_address].emplace_back(context.functions.size()); + context.section_functions.emplace_back(context.functions.size()); + context.sections[0].function_addrs.emplace_back(cur_func_address); + context.functions.emplace_back( + cur_func_address, + cur_func_offset, + std::move(text_words), + "test_func_" + std::to_string(func_index), + 0 + ); + } + } + + // Check if a relocation description exists. + if (reloc_desc_address != 0) { + uint32_t num_relocs = read_u32_swap(context.rom, reloc_desc_address + 0x08); + for (uint32_t reloc_index = 0; reloc_index < num_relocs; reloc_index++) { + uint32_t cur_desc_address = reloc_desc_address + 0x0C + reloc_index * 4 * sizeof(uint32_t); + uint32_t reloc_type = read_u32_swap(context.rom, cur_desc_address + 0x00); + uint32_t reloc_section = read_u32_swap(context.rom, cur_desc_address + 0x04); + uint32_t reloc_address = read_u32_swap(context.rom, cur_desc_address + 0x08); + uint32_t reloc_target_offset = read_u32_swap(context.rom, cur_desc_address + 0x0C); + + context.sections[0].relocs.emplace_back(N64Recomp::Reloc{ + .address = reloc_address, + .target_section_offset = reloc_target_offset, + .symbol_index = 0, + .target_section = static_cast(reloc_section), + .type = static_cast(reloc_type), + .reference_symbol = false + }); + } + } + + std::vector> dummy_static_funcs{}; + std::vector section_addresses{}; + section_addresses.emplace_back(text_address); + section_addresses.emplace_back(data_address); + + auto before_codegen = std::chrono::system_clock::now(); + + N64Recomp::LiveGeneratorInputs generator_inputs { + .switch_error = test_switch_error, + .get_function = test_get_function, + .reference_section_addresses = nullptr, + .local_section_addresses = section_addresses.data() + }; + + // Create the sljit compiler and the generator. + N64Recomp::LiveGenerator generator{ context.functions.size(), generator_inputs }; + + for (size_t func_index = 0; func_index < context.functions.size(); func_index++) { + std::ostringstream dummy_ostream{}; + + //sljit_emit_op0(compiler, SLJIT_BREAKPOINT); + + if (!N64Recomp::recompile_function_live(generator, context, func_index, dummy_ostream, dummy_static_funcs, true)) { + return { TestError::FailedToRecompile }; + } + } + + // Generate the code. + N64Recomp::LiveGeneratorOutput output = generator.finish(); + + auto after_codegen = std::chrono::system_clock::now(); + + auto before_execution = std::chrono::system_clock::now(); + + int old_rounding = fegetround(); + + // Run the generated code. + ctx.r29 = 0xFFFFFFFF80000000 + rdram.size() - 0x10; // Set the stack pointer. + output.functions[start_func_index](rdram.data(), &ctx); + + fesetround(old_rounding); + + auto after_execution = std::chrono::system_clock::now(); + + // Check the result of running the code. + bool good = byteswap_compare(&rdram[data_address - 0x80000000], &context.rom[good_data_offset], data_length); + + // Dump the data if the results don't match. + if (!good) { + std::ofstream data_dump_file{ data_dump_path, std::ios::binary }; + std::vector data_swapped; + data_swapped.resize(data_length); + byteswap_copy(data_swapped.data(), &rdram[data_address - 0x80000000], data_length); + data_dump_file.write(reinterpret_cast(data_swapped.data()), data_length); + return { TestError::DataDifference }; + } + + // Return the test's stats. 
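+    // Codegen time spans generator construction through finish(); execution time covers the single
+    // call into the generated entry function.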
+ TestStats ret{}; + ret.error = TestError::Success; + ret.codegen_microseconds = std::chrono::duration_cast(after_codegen - before_codegen).count(); + ret.execution_microseconds = std::chrono::duration_cast(after_execution - before_execution).count(); + ret.code_size = output.code_size; + + return ret; +} + +int main(int argc, const char** argv) { + if (argc < 3) { + printf("Usage: %s [test directory] [test 1] ...\n", argv[0]); + return EXIT_SUCCESS; + } + + N64Recomp::live_recompiler_init(); + + rdram.resize(0x8000000); + + // Skip the first argument (program name) and second argument (test directory). + int count = argc - 1 - 1; + int passed_count = 0; + + std::vector failed_tests{}; + + for (size_t test_index = 0; test_index < count; test_index++) { + const char* cur_test_name = argv[2 + test_index]; + printf("Running test: %s\n", cur_test_name); + TestStats stats = run_test(argv[1], cur_test_name); + + switch (stats.error) { + case TestError::Success: + printf(" Success\n"); + printf(" Generated %" PRIu64 " bytes in %" PRIu64 " microseconds and ran in %" PRIu64 " microseconds\n", + stats.code_size, stats.codegen_microseconds, stats.execution_microseconds); + passed_count++; + break; + case TestError::FailedToOpenInput: + printf(" Failed to open input data file\n"); + break; + case TestError::FailedToRecompile: + printf(" Failed to recompile\n"); + break; + case TestError::UnknownStructType: + printf(" Unknown additional data struct type in test data\n"); + break; + case TestError::DataDifference: + printf(" Output data did not match, dumped to file\n"); + break; + } + + if (stats.error != TestError::Success) { + failed_tests.emplace_back(test_index); + } + + printf("\n"); + } + + printf("Passed %d/%d tests\n", passed_count, count); + if (!failed_tests.empty()) { + printf(" Failed: "); + for (size_t i = 0; i < failed_tests.size(); i++) { + size_t test_index = failed_tests[i]; + + printf("%s", argv[2 + test_index]); + if (i != failed_tests.size() - 1) { + printf(", "); + } + } + printf("\n"); + } + return 0; +} diff --git a/OfflineModRecomp/main.cpp b/OfflineModRecomp/main.cpp index aa25dc8..29e5232 100644 --- a/OfflineModRecomp/main.cpp +++ b/OfflineModRecomp/main.cpp @@ -3,7 +3,7 @@ #include #include -#include "n64recomp.h" +#include "recompiler/context.h" #include "rabbitizer.hpp" static std::vector read_file(const std::filesystem::path& path, bool& found) { @@ -221,8 +221,7 @@ int main(int argc, const char** argv) { // Perform a second pass for recompiling all the functions. 
for (size_t func_index = 0; func_index < mod_context.functions.size(); func_index++) { - auto& func = mod_context.functions[func_index]; - if (!N64Recomp::recompile_function(mod_context, func, output_file, static_funcs_by_section, true)) { + if (!N64Recomp::recompile_function(mod_context, func_index, output_file, static_funcs_by_section, true)) { output_file.close(); std::error_code ec; std::filesystem::remove(output_file_path, ec); diff --git a/RecompModTool/main.cpp b/RecompModTool/main.cpp index 78649ef..9fbb7d1 100644 --- a/RecompModTool/main.cpp +++ b/RecompModTool/main.cpp @@ -7,7 +7,7 @@ #include #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include #ifdef _WIN32 diff --git a/include/generator.h b/include/generator.h deleted file mode 100644 index 5afcc57..0000000 --- a/include/generator.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef __GENERATOR_H__ -#define __GENERATOR_H__ - -#include "n64recomp.h" -#include "operations.h" - -namespace N64Recomp { - struct InstructionContext { - int rd; - int rs; - int rt; - int sa; - - int fd; - int fs; - int ft; - - int cop1_cs; - - uint16_t imm16; - - bool reloc_tag_as_reference; - RelocType reloc_type; - uint32_t reloc_section_index; - uint32_t reloc_target_section_offset; - }; - - class Generator { - public: - virtual void process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const = 0; - virtual void process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const = 0; - virtual void process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const = 0; - virtual void emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const = 0; - virtual void emit_branch_close(std::ostream& output_file) const = 0; - virtual void emit_check_fr(std::ostream& output_file, int fpr) const = 0; - virtual void emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const = 0; - }; - - class CGenerator final : Generator { - public: - CGenerator() = default; - void process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const final; - void process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const final; - void process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const final; - void emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const final; - void emit_branch_close(std::ostream& output_file) const final; - void emit_check_fr(std::ostream& output_file, int fpr) const final; - void emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const final; - private: - void get_operand_string(Operand operand, UnaryOpType operation, const InstructionContext& context, std::string& operand_string) const; - void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const; - void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const; - }; -} - -#endif diff --git a/include/recomp.h b/include/recomp.h new file mode 100644 index 0000000..d291eec --- /dev/null +++ b/include/recomp.h @@ -0,0 +1,397 @@ +#ifndef __RECOMP_H__ +#define __RECOMP_H__ + +#include +#include +#include +#include +#include + +// Compiler 
definition to disable inter-procedural optimization, allowing multiple functions to be in a single file without breaking interposition. +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER) + // MSVC's __declspec(noinline) seems to disable inter-procedural optimization entirely, so it's all that's needed. + #define RECOMP_FUNC __declspec(noinline) + + // Use MSVC's fenv_access pragma. + #define SET_FENV_ACCESS() _Pragma("fenv_access(on)") +#elif defined(__clang__) + // Clang has no dedicated IPO attribute, so we use a combination of other attributes to give the desired behavior. + // The inline keyword allows multiple definitions during linking, and extern forces clang to emit an externally visible definition. + // Weak forces Clang to not perform any IPO as the symbol can be interposed, which prevents actual inlining due to the inline keyword. + // Add noinline on for good measure, which doesn't conflict with the inline keyword as they have different meanings. + #define RECOMP_FUNC extern inline __attribute__((weak,noinline)) + + // Use the standard STDC FENV_ACCESS pragma. + #define SET_FENV_ACCESS() _Pragma("STDC FENV_ACCESS ON") +#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) + // Use GCC's attribute for disabling inter-procedural optimizations. Also enable the rounding-math compiler flag to disable + // constant folding so that arithmetic respects the floating point environment. This is needed because gcc doesn't implement + // any FENV_ACCESS pragma. + #define RECOMP_FUNC __attribute__((noipa, optimize("rounding-math"))) + + // There's no FENV_ACCESS pragma in gcc, so this can be empty. + #define SET_FENV_ACCESS() +#else + #error "No RECOMP_FUNC definition for this compiler" +#endif + +// Implementation of 64-bit multiply and divide instructions +#if defined(__SIZEOF_INT128__) + +static inline void DMULT(int64_t a, int64_t b, int64_t* lo64, int64_t* hi64) { + __int128 full128 = ((__int128)a) * ((__int128)b); + + *hi64 = (int64_t)(full128 >> 64); + *lo64 = (int64_t)(full128 >> 0); +} + +static inline void DMULTU(uint64_t a, uint64_t b, uint64_t* lo64, uint64_t* hi64) { + unsigned __int128 full128 = ((unsigned __int128)a) * ((unsigned __int128)b); + + *hi64 = (uint64_t)(full128 >> 64); + *lo64 = (uint64_t)(full128 >> 0); +} + +#elif defined(_MSC_VER) + +#include +#pragma intrinsic(_mul128) +#pragma intrinsic(_umul128) + +static inline void DMULT(int64_t a, int64_t b, int64_t* lo64, int64_t* hi64) { + *lo64 = _mul128(a, b, hi64); +} + +static inline void DMULTU(uint64_t a, uint64_t b, uint64_t* lo64, uint64_t* hi64) { + *lo64 = _umul128(a, b, hi64); +} + +#else +#error "128-bit integer type not found" +#endif + +static inline void DDIV(int64_t a, int64_t b, int64_t* quot, int64_t* rem) { + int overflow = ((uint64_t)a == 0x8000000000000000ull) && (b == -1ll); + *quot = overflow ? a : (a / b); + *rem = overflow ? 
0 : (a % b); +} + +static inline void DDIVU(uint64_t a, uint64_t b, uint64_t* quot, uint64_t* rem) { + *quot = a / b; + *rem = a % b; +} + +typedef uint64_t gpr; + +#define SIGNED(val) \ + ((int64_t)(val)) + +#define ADD32(a, b) \ + ((gpr)(int32_t)((a) + (b))) + +#define SUB32(a, b) \ + ((gpr)(int32_t)((a) - (b))) + +#define MEM_W(offset, reg) \ + (*(int32_t*)(rdram + ((((reg) + (offset))) - 0xFFFFFFFF80000000))) + +#define MEM_H(offset, reg) \ + (*(int16_t*)(rdram + ((((reg) + (offset)) ^ 2) - 0xFFFFFFFF80000000))) + +#define MEM_B(offset, reg) \ + (*(int8_t*)(rdram + ((((reg) + (offset)) ^ 3) - 0xFFFFFFFF80000000))) + +#define MEM_HU(offset, reg) \ + (*(uint16_t*)(rdram + ((((reg) + (offset)) ^ 2) - 0xFFFFFFFF80000000))) + +#define MEM_BU(offset, reg) \ + (*(uint8_t*)(rdram + ((((reg) + (offset)) ^ 3) - 0xFFFFFFFF80000000))) + +#define SD(val, offset, reg) { \ + *(uint32_t*)(rdram + ((((reg) + (offset) + 4)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 0); \ + *(uint32_t*)(rdram + ((((reg) + (offset) + 0)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 32); \ +} + +static inline uint64_t load_doubleword(uint8_t* rdram, gpr reg, gpr offset) { + uint64_t ret = 0; + uint64_t lo = (uint64_t)(uint32_t)MEM_W(reg, offset + 4); + uint64_t hi = (uint64_t)(uint32_t)MEM_W(reg, offset + 0); + ret = (lo << 0) | (hi << 32); + return ret; +} + +#define LD(offset, reg) \ + load_doubleword(rdram, offset, reg) + +static inline gpr do_lwl(uint8_t* rdram, gpr initial_value, gpr offset, gpr reg) { + // Calculate the overall address + gpr address = (offset + reg); + + // Load the aligned word + gpr word_address = address & ~0x3; + uint32_t loaded_value = MEM_W(0, word_address); + + // Mask the existing value and shift the loaded value appropriately + gpr misalignment = address & 0x3; + gpr masked_value = initial_value & (gpr)(uint32_t)~(0xFFFFFFFFu << (misalignment * 8)); + loaded_value <<= (misalignment * 8); + + // Cast to int32_t to sign extend first + return (gpr)(int32_t)(masked_value | loaded_value); +} + +static inline gpr do_lwr(uint8_t* rdram, gpr initial_value, gpr offset, gpr reg) { + // Calculate the overall address + gpr address = (offset + reg); + + // Load the aligned word + gpr word_address = address & ~0x3; + uint32_t loaded_value = MEM_W(0, word_address); + + // Mask the existing value and shift the loaded value appropriately + gpr misalignment = address & 0x3; + gpr masked_value = initial_value & (gpr)(uint32_t)~(0xFFFFFFFFu >> (24 - misalignment * 8)); + loaded_value >>= (24 - misalignment * 8); + + // Cast to int32_t to sign extend first + return (gpr)(int32_t)(masked_value | loaded_value); +} + +static inline void do_swl(uint8_t* rdram, gpr offset, gpr reg, gpr val) { + // Calculate the overall address + gpr address = (offset + reg); + + // Get the initial value of the aligned word + gpr word_address = address & ~0x3; + uint32_t initial_value = MEM_W(0, word_address); + + // Mask the initial value and shift the input value appropriately + gpr misalignment = address & 0x3; + uint32_t masked_initial_value = initial_value & ~(0xFFFFFFFFu >> (misalignment * 8)); + uint32_t shifted_input_value = ((uint32_t)val) >> (misalignment * 8); + MEM_W(0, word_address) = masked_initial_value | shifted_input_value; +} + +static inline void do_swr(uint8_t* rdram, gpr offset, gpr reg, gpr val) { + // Calculate the overall address + gpr address = (offset + reg); + + // Get the initial value of the aligned word + gpr word_address = address & ~0x3; + uint32_t initial_value = MEM_W(0, word_address); + 
+ // Mask the initial value and shift the input value appropriately + gpr misalignment = address & 0x3; + uint32_t masked_initial_value = initial_value & ~(0xFFFFFFFFu << (24 - misalignment * 8)); + uint32_t shifted_input_value = ((uint32_t)val) << (24 - misalignment * 8); + MEM_W(0, word_address) = masked_initial_value | shifted_input_value; +} + +static inline uint32_t get_cop1_cs() { + uint32_t rounding_mode = 0; + switch (fegetround()) { + // round to nearest value + case FE_TONEAREST: + default: + rounding_mode = 0; + break; + // round to zero (truncate) + case FE_TOWARDZERO: + rounding_mode = 1; + break; + // round to positive infinity (ceil) + case FE_UPWARD: + rounding_mode = 2; + break; + // round to negative infinity (floor) + case FE_DOWNWARD: + rounding_mode = 3; + break; + } + return rounding_mode; +} + +static inline void set_cop1_cs(uint32_t val) { + uint32_t rounding_mode = val & 0x3; + int round = FE_TONEAREST; + switch (rounding_mode) { + case 0: // round to nearest value + round = FE_TONEAREST; + break; + case 1: // round to zero (truncate) + round = FE_TOWARDZERO; + break; + case 2: // round to positive infinity (ceil) + round = FE_UPWARD; + break; + case 3: // round to negative infinity (floor) + round = FE_DOWNWARD; + break; + } + fesetround(round); +} + +#define S32(val) \ + ((int32_t)(val)) + +#define U32(val) \ + ((uint32_t)(val)) + +#define S64(val) \ + ((int64_t)(val)) + +#define U64(val) \ + ((uint64_t)(val)) + +#define MUL_S(val1, val2) \ + ((val1) * (val2)) + +#define MUL_D(val1, val2) \ + ((val1) * (val2)) + +#define DIV_S(val1, val2) \ + ((val1) / (val2)) + +#define DIV_D(val1, val2) \ + ((val1) / (val2)) + +#define CVT_S_W(val) \ + ((float)((int32_t)(val))) + +#define CVT_D_W(val) \ + ((double)((int32_t)(val))) + +#define CVT_D_L(val) \ + ((double)((int64_t)(val))) + +#define CVT_S_L(val) \ + ((float)((int64_t)(val))) + +#define CVT_D_S(val) \ + ((double)(val)) + +#define CVT_S_D(val) \ + ((float)(val)) + +#define TRUNC_W_S(val) \ + ((int32_t)(val)) + +#define TRUNC_W_D(val) \ + ((int32_t)(val)) + +#define TRUNC_L_S(val) \ + ((int64_t)(val)) + +#define TRUNC_L_D(val) \ + ((int64_t)(val)) + +#define DEFAULT_ROUNDING_MODE 0 + +static inline int32_t do_cvt_w_s(float val) { + // Rounding mode aware float to 32-bit int conversion. + return (int32_t)lrintf(val); +} + +#define CVT_W_S(val) \ + do_cvt_w_s(val) + +static inline int64_t do_cvt_l_s(float val) { + // Rounding mode aware float to 64-bit int conversion. + return (int64_t)llrintf(val); +} + +#define CVT_L_S(val) \ + do_cvt_l_s(val); + +static inline int32_t do_cvt_w_d(double val) { + // Rounding mode aware double to 32-bit int conversion. + return (int32_t)lrint(val); +} + +#define CVT_W_D(val) \ + do_cvt_w_d(val) + +static inline int64_t do_cvt_l_d(double val) { + // Rounding mode aware double to 64-bit int conversion. 
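+    // llrint rounds according to the mode installed via fesetround, so this follows whatever rounding
+    // mode the guest selected through set_cop1_cs.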
+ return (int64_t)llrint(val); +} + +#define CVT_L_D(val) \ + do_cvt_l_d(val) + +#define NAN_CHECK(val) \ + assert(val == val) + +//#define NAN_CHECK(val) + +typedef union { + double d; + struct { + float fl; + float fh; + }; + struct { + uint32_t u32l; + uint32_t u32h; + }; + uint64_t u64; +} fpr; + +typedef struct { + gpr r0, r1, r2, r3, r4, r5, r6, r7, + r8, r9, r10, r11, r12, r13, r14, r15, + r16, r17, r18, r19, r20, r21, r22, r23, + r24, r25, r26, r27, r28, r29, r30, r31; + fpr f0, f1, f2, f3, f4, f5, f6, f7, + f8, f9, f10, f11, f12, f13, f14, f15, + f16, f17, f18, f19, f20, f21, f22, f23, + f24, f25, f26, f27, f28, f29, f30, f31; + uint64_t hi, lo; + uint32_t* f_odd; + uint32_t status_reg; + uint8_t mips3_float_mode; +} recomp_context; + +// Checks if the target is an even float register or that mips3 float mode is enabled +#define CHECK_FR(ctx, idx) \ + assert(((idx) & 1) == 0 || (ctx)->mips3_float_mode) + +#ifdef __cplusplus +extern "C" { +#endif + +void cop0_status_write(recomp_context* ctx, gpr value); +gpr cop0_status_read(recomp_context* ctx); +void switch_error(const char* func, uint32_t vram, uint32_t jtbl); +void do_break(uint32_t vram); + +typedef void (recomp_func_t)(uint8_t* rdram, recomp_context* ctx); + +recomp_func_t* get_function(int32_t vram); + +#define LOOKUP_FUNC(val) \ + get_function((int32_t)(val)) + +extern int32_t* section_addresses; + +#define LO16(x) \ + ((x) & 0xFFFF) + +#define HI16(x) \ + (((x) >> 16) + (((x) >> 15) & 1)) + +#define RELOC_HI16(section_index, offset) \ + HI16(section_addresses[section_index] + (offset)) + +#define RELOC_LO16(section_index, offset) \ + LO16(section_addresses[section_index] + (offset)) + +void recomp_syscall_handler(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram); + +void pause_self(uint8_t *rdram); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/n64recomp.h b/include/recompiler/context.h similarity index 93% rename from include/n64recomp.h rename to include/recompiler/context.h index c214ac7..df5224d 100644 --- a/include/n64recomp.h +++ b/include/recompiler/context.h @@ -36,6 +36,20 @@ namespace N64Recomp { : vram(vram), rom(rom), words(std::move(words)), name(std::move(name)), section_index(section_index), ignored(ignored), reimplemented(reimplemented), stubbed(stubbed) {} Function() = default; }; + + struct JumpTable { + uint32_t vram; + uint32_t addend_reg; + uint32_t rom; + uint32_t lw_vram; + uint32_t addu_vram; + uint32_t jr_vram; + uint16_t section_index; + std::vector entries; + + JumpTable(uint32_t vram, uint32_t addend_reg, uint32_t rom, uint32_t lw_vram, uint32_t addu_vram, uint32_t jr_vram, uint16_t section_index, std::vector&& entries) + : vram(vram), addend_reg(addend_reg), rom(rom), lw_vram(lw_vram), addu_vram(addu_vram), jr_vram(jr_vram), section_index(section_index), entries(std::move(entries)) {} + }; enum class RelocType : uint8_t { R_MIPS_NONE = 0, @@ -175,6 +189,8 @@ namespace N64Recomp { std::vector reference_symbols; // Mapping of symbol name to reference symbol index. std::unordered_map reference_symbols_by_name; + // Whether all reference sections should be treated as relocatable (used in live recompilation). + bool all_reference_sections_relocatable = false; public: std::vector
sections; std::vector functions; @@ -187,6 +203,8 @@ namespace N64Recomp { // The target ROM being recompiled, TODO move this outside of the context to avoid making a copy for mod contexts. // Used for reading relocations and for the output binary feature. std::vector rom; + // Whether reference symbols should be validated when emitting function calls during recompilation. + bool skip_validating_reference_symbols = true; //// Only used by the CLI, TODO move this to a struct in the internal headers. // A mapping of function name to index in the functions vector @@ -359,6 +377,9 @@ namespace N64Recomp { } bool is_reference_section_relocatable(uint16_t section_index) const { + if (all_reference_sections_relocatable) { + return true; + } if (section_index == SectionAbsolute) { return false; } @@ -518,9 +539,15 @@ namespace N64Recomp { void copy_reference_sections_from(const Context& rhs) { reference_sections = rhs.reference_sections; } + + void set_all_reference_sections_relocatable() { + all_reference_sections_relocatable = true; + } }; - bool recompile_function(const Context& context, const Function& func, std::ofstream& output_file, std::span> static_funcs, bool tag_reference_relocs); + class Generator; + bool recompile_function(const Context& context, size_t function_index, std::ostream& output_file, std::span> static_funcs, bool tag_reference_relocs); + bool recompile_function_custom(Generator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span> static_funcs_out, bool tag_reference_relocs); enum class ModSymbolsError { Good, diff --git a/include/recompiler/generator.h b/include/recompiler/generator.h new file mode 100644 index 0000000..0ffde0b --- /dev/null +++ b/include/recompiler/generator.h @@ -0,0 +1,109 @@ +#ifndef __GENERATOR_H__ +#define __GENERATOR_H__ + +#include "recompiler/context.h" +#include "operations.h" + +namespace N64Recomp { + struct InstructionContext { + int rd; + int rs; + int rt; + int sa; + + int fd; + int fs; + int ft; + + int cop1_cs; + + uint16_t imm16; + + bool reloc_tag_as_reference; + RelocType reloc_type; + uint32_t reloc_section_index; + uint32_t reloc_target_section_offset; + }; + + class Generator { + public: + virtual void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const = 0; + virtual void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const = 0; + virtual void process_store_op(const StoreOp& op, const InstructionContext& ctx) const = 0; + virtual void emit_function_start(const std::string& function_name, size_t func_index) const = 0; + virtual void emit_function_end() const = 0; + virtual void emit_function_call_lookup(uint32_t addr) const = 0; + virtual void emit_function_call_by_register(int reg) const = 0; + // target_section_offset can each be deduced from symbol_index if the full context is available, + // but for live recompilation the reference symbol list is unavailable so it's still provided. 
+ virtual void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const = 0; + virtual void emit_function_call(const Context& context, size_t function_index) const = 0; + virtual void emit_named_function_call(const std::string& function_name) const = 0; + virtual void emit_goto(const std::string& target) const = 0; + virtual void emit_label(const std::string& label_name) const = 0; + virtual void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const = 0; + virtual void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const = 0; + virtual void emit_branch_close() const = 0; + virtual void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const = 0; + virtual void emit_case(int case_index, const std::string& target_label) const = 0; + virtual void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const = 0; + virtual void emit_switch_close() const = 0; + virtual void emit_return() const = 0; + virtual void emit_check_fr(int fpr) const = 0; + virtual void emit_check_nan(int fpr, bool is_double) const = 0; + virtual void emit_cop0_status_read(int reg) const = 0; + virtual void emit_cop0_status_write(int reg) const = 0; + virtual void emit_cop1_cs_read(int reg) const = 0; + virtual void emit_cop1_cs_write(int reg) const = 0; + virtual void emit_muldiv(InstrId instr_id, int reg1, int reg2) const = 0; + virtual void emit_syscall(uint32_t instr_vram) const = 0; + virtual void emit_do_break(uint32_t instr_vram) const = 0; + virtual void emit_pause_self() const = 0; + virtual void emit_trigger_event(uint32_t event_index) const = 0; + virtual void emit_comment(const std::string& comment) const = 0; + }; + + class CGenerator final : Generator { + public: + CGenerator(std::ostream& output_file) : output_file(output_file) {}; + void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const final; + void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const final; + void process_store_op(const StoreOp& op, const InstructionContext& ctx) const final; + void emit_function_start(const std::string& function_name, size_t func_index) const final; + void emit_function_end() const final; + void emit_function_call_lookup(uint32_t addr) const final; + void emit_function_call_by_register(int reg) const final; + void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const final; + void emit_function_call(const Context& context, size_t function_index) const final; + void emit_named_function_call(const std::string& function_name) const final; + void emit_goto(const std::string& target) const final; + void emit_label(const std::string& label_name) const final; + void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const final; + void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const final; + void emit_branch_close() const final; + void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const final; + void emit_case(int case_index, const std::string& target_label) const final; + void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const final; + void emit_switch_close() const final; + void emit_return() const final; + void emit_check_fr(int fpr) const final; + void emit_check_nan(int fpr, bool is_double) const final; + void 
emit_cop0_status_read(int reg) const final; + void emit_cop0_status_write(int reg) const final; + void emit_cop1_cs_read(int reg) const final; + void emit_cop1_cs_write(int reg) const final; + void emit_muldiv(InstrId instr_id, int reg1, int reg2) const final; + void emit_syscall(uint32_t instr_vram) const final; + void emit_do_break(uint32_t instr_vram) const final; + void emit_pause_self() const final; + void emit_trigger_event(uint32_t event_index) const final; + void emit_comment(const std::string& comment) const final; + private: + void get_operand_string(Operand operand, UnaryOpType operation, const InstructionContext& context, std::string& operand_string) const; + void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const; + void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const; + std::ostream& output_file; + }; +} + +#endif diff --git a/include/recompiler/live_recompiler.h b/include/recompiler/live_recompiler.h new file mode 100644 index 0000000..1b92d95 --- /dev/null +++ b/include/recompiler/live_recompiler.h @@ -0,0 +1,141 @@ +#ifndef __LIVE_RECOMPILER_H__ +#define __LIVE_RECOMPILER_H__ + +#include +#include "recompiler/generator.h" +#include "recomp.h" + +struct sljit_compiler; + +namespace N64Recomp { + struct LiveGeneratorContext; + struct ReferenceJumpDetails { + uint16_t section; + uint32_t section_offset; + }; + struct LiveGeneratorOutput { + LiveGeneratorOutput() = default; + LiveGeneratorOutput(const LiveGeneratorOutput& rhs) = delete; + LiveGeneratorOutput(LiveGeneratorOutput&& rhs) { *this = std::move(rhs); } + LiveGeneratorOutput& operator=(const LiveGeneratorOutput& rhs) = delete; + LiveGeneratorOutput& operator=(LiveGeneratorOutput&& rhs) { + good = rhs.good; + string_literals = std::move(rhs.string_literals); + jump_tables = std::move(rhs.jump_tables); + code = rhs.code; + code_size = rhs.code_size; + functions = std::move(rhs.functions); + reference_symbol_jumps = std::move(rhs.reference_symbol_jumps); + import_jumps_by_index = std::move(rhs.import_jumps_by_index); + executable_offset = rhs.executable_offset; + + rhs.good = false; + rhs.code = nullptr; + rhs.code_size = 0; + rhs.reference_symbol_jumps.clear(); + rhs.executable_offset = 0; + + return *this; + } + ~LiveGeneratorOutput(); + size_t num_reference_symbol_jumps() const; + void set_reference_symbol_jump(size_t jump_index, recomp_func_t* func); + ReferenceJumpDetails get_reference_symbol_jump_details(size_t jump_index); + void populate_import_symbol_jumps(size_t import_index, recomp_func_t* func); + bool good = false; + // Storage for string literals referenced by recompiled code. These are allocated as unique_ptr arrays + // to prevent them from moving, as the referenced address is baked into the recompiled code. + std::vector> string_literals; + // Storage for jump tables referenced by recompiled code (vector of arrays of pointers). These are also + // allocated as unique_ptr arrays for the same reason as strings. + std::vector> jump_tables; + // Recompiled code. + void* code; + // Size of the recompiled code. + size_t code_size; + // Pointers to each individual function within the recompiled code. + std::vector functions; + private: + // List of jump details and the corresponding jump instruction address. These jumps get populated after recompilation is complete + // during dependency resolution. 
+ std::vector> reference_symbol_jumps; + // Mapping of import symbol index to any jumps to that import symbol. + std::unordered_multimap import_jumps_by_index; + // sljit executable offset. + int64_t executable_offset; + + friend class LiveGenerator; + }; + struct LiveGeneratorInputs { + uint32_t base_event_index; + void (*cop0_status_write)(recomp_context* ctx, gpr value); + gpr (*cop0_status_read)(recomp_context* ctx); + void (*switch_error)(const char* func, uint32_t vram, uint32_t jtbl); + void (*do_break)(uint32_t vram); + recomp_func_t* (*get_function)(int32_t vram); + void (*syscall_handler)(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram); + void (*pause_self)(uint8_t* rdram); + void (*trigger_event)(uint8_t* rdram, recomp_context* ctx, uint32_t event_index); + int32_t *reference_section_addresses; + int32_t *local_section_addresses; + }; + class LiveGenerator final : public Generator { + public: + LiveGenerator(size_t num_funcs, const LiveGeneratorInputs& inputs); + ~LiveGenerator(); + // Prevent moving or copying. + LiveGenerator(const LiveGenerator& rhs) = delete; + LiveGenerator(LiveGenerator&& rhs) = delete; + LiveGenerator& operator=(const LiveGenerator& rhs) = delete; + LiveGenerator& operator=(LiveGenerator&& rhs) = delete; + + LiveGeneratorOutput finish(); + void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const final; + void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const final; + void process_store_op(const StoreOp& op, const InstructionContext& ctx) const final; + void emit_function_start(const std::string& function_name, size_t func_index) const final; + void emit_function_end() const final; + void emit_function_call_lookup(uint32_t addr) const final; + void emit_function_call_by_register(int reg) const final; + void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const final; + void emit_function_call(const Context& context, size_t function_index) const final; + void emit_named_function_call(const std::string& function_name) const final; + void emit_goto(const std::string& target) const final; + void emit_label(const std::string& label_name) const final; + void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const final; + void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const final; + void emit_branch_close() const final; + void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const final; + void emit_case(int case_index, const std::string& target_label) const final; + void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const final; + void emit_switch_close() const final; + void emit_return() const final; + void emit_check_fr(int fpr) const final; + void emit_check_nan(int fpr, bool is_double) const final; + void emit_cop0_status_read(int reg) const final; + void emit_cop0_status_write(int reg) const final; + void emit_cop1_cs_read(int reg) const final; + void emit_cop1_cs_write(int reg) const final; + void emit_muldiv(InstrId instr_id, int reg1, int reg2) const final; + void emit_syscall(uint32_t instr_vram) const final; + void emit_do_break(uint32_t instr_vram) const final; + void emit_pause_self() const final; + void emit_trigger_event(uint32_t event_index) const final; + void emit_comment(const std::string& comment) const final; + private: + void get_operand_string(Operand operand, UnaryOpType operation, const 
InstructionContext& context, std::string& operand_string) const; + void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const; + void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const; + // Loads the relocated address specified by the instruction context into the target register. + void load_relocated_address(const InstructionContext& ctx, int reg) const; + sljit_compiler* compiler; + LiveGeneratorInputs inputs; + mutable std::unique_ptr context; + mutable bool errored; + }; + + void live_recompiler_init(); + bool recompile_function_live(LiveGenerator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span> static_funcs_out, bool tag_reference_relocs); +} + +#endif \ No newline at end of file diff --git a/include/operations.h b/include/recompiler/operations.h similarity index 92% rename from include/operations.h rename to include/recompiler/operations.h index 5cb407e..65f2ed7 100644 --- a/include/operations.h +++ b/include/recompiler/operations.h @@ -28,13 +28,12 @@ namespace N64Recomp { ToU32, ToS64, ToU64, - NegateS32, - NegateS64, Lui, Mask5, // Mask to 5 bits Mask6, // Mask to 5 bits ToInt32, // Functionally equivalent to ToS32, only exists for parity with old codegen - Negate, + NegateFloat, + NegateDouble, AbsFloat, AbsDouble, SqrtFloat, @@ -51,12 +50,20 @@ namespace N64Recomp { ConvertLFromS, TruncateWFromS, TruncateWFromD, + TruncateLFromS, + TruncateLFromD, RoundWFromS, RoundWFromD, + RoundLFromS, + RoundLFromD, CeilWFromS, CeilWFromD, + CeilLFromS, + CeilLFromD, FloorWFromS, - FloorWFromD + FloorWFromD, + FloorLFromS, + FloorLFromD }; enum class BinaryOpType { @@ -92,6 +99,12 @@ namespace N64Recomp { LessEq, Greater, GreaterEq, + EqualFloat, + LessFloat, + LessEqFloat, + EqualDouble, + LessDouble, + LessEqDouble, // Loads LD, LW, diff --git a/lib/sljit b/lib/sljit new file mode 160000 index 0000000..f632608 --- /dev/null +++ b/lib/sljit @@ -0,0 +1 @@ +Subproject commit f6326087b3404efb07c6d3deed97b3c3b8098c0c diff --git a/src/analysis.cpp b/src/analysis.cpp index 5dfd955..92a421e 100644 --- a/src/analysis.cpp +++ b/src/analysis.cpp @@ -4,7 +4,7 @@ #include "rabbitizer.hpp" #include "fmt/format.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "analysis.h" extern "C" const char* RabbitizerRegister_getNameGpr(uint8_t regValue); @@ -194,21 +194,11 @@ bool analyze_instruction(const rabbitizer::InstructionCpu& instr, const N64Recom reg_states[rs].loaded_lw_vram, reg_states[rs].loaded_addu_vram, instr.getVram(), + 0, // section index gets filled in later std::vector{} ); - } else if (reg_states[rs].valid_lui && reg_states[rs].valid_addiu && !reg_states[rs].valid_addend && !reg_states[rs].valid_loaded) { - uint32_t address = reg_states[rs].prev_addiu_vram + reg_states[rs].prev_lui; - stats.absolute_jumps.emplace_back( - address, - instr.getVram() - ); - } - // Allow tail calls (TODO account for trailing nops due to bad function splits) - else if (instr.getVram() != func.vram + (func.words.size() - 2) * sizeof(func.words[0])) { - // Inconclusive analysis - fmt::print(stderr, "Failed to to find jump table for `jr {}` at 0x{:08X} in {}\n", RabbitizerRegister_getNameGpr(rs), instr.getVram(), func.name); - return false; } + // TODO stricter validation on tail calls, since not all indirect jumps can be treated as one. 
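When a jr is matched to a jump table this way, the generator later lowers it to a saved addend plus a switch over the table's 4-byte entries (see emit_jtbl_addend_declaration, emit_switch, emit_case, and emit_switch_error elsewhere in this patch). Roughly, in the C backend's output, with every address, label, and case index invented for illustration:

gpr jr_addend_8001234C = ctx->r2;      // addend captured where the table entry is read
switch (jr_addend_8001234C >> 2) {     // one case per 4-byte jump table entry
    case 0: goto L_80012400; break;
    case 1: goto L_80012438; break;
    default: switch_error(__func__, 0x8001234C, 0x800123A0);
}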
break; default: if (instr.modifiesRd()) { @@ -256,6 +246,7 @@ bool N64Recomp::analyze_function(const N64Recomp::Context& context, const N64Rec // TODO this assumes that the jump table is in the same section as the function itself cur_jtbl.rom = cur_jtbl.vram + func.rom - func.vram; + cur_jtbl.section_index = func.section_index; while (vram < end_address) { // Retrieve the current entry of the jump table diff --git a/src/analysis.h b/src/analysis.h index eafd1e7..9e0562e 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -4,22 +4,9 @@ #include #include -#include "n64recomp.h" +#include "recompiler/context.h" namespace N64Recomp { - struct JumpTable { - uint32_t vram; - uint32_t addend_reg; - uint32_t rom; - uint32_t lw_vram; - uint32_t addu_vram; - uint32_t jr_vram; - std::vector entries; - - JumpTable(uint32_t vram, uint32_t addend_reg, uint32_t rom, uint32_t lw_vram, uint32_t addu_vram, uint32_t jr_vram, std::vector&& entries) - : vram(vram), addend_reg(addend_reg), rom(rom), lw_vram(lw_vram), addu_vram(addu_vram), jr_vram(jr_vram), entries(std::move(entries)) {} - }; - struct AbsoluteJump { uint32_t jump_target; uint32_t instruction_vram; @@ -29,7 +16,6 @@ namespace N64Recomp { struct FunctionStats { std::vector jump_tables; - std::vector absolute_jumps; }; bool analyze_function(const Context& context, const Function& function, const std::vector& instructions, FunctionStats& stats); diff --git a/src/cgenerator.cpp b/src/cgenerator.cpp index 7751568..596ad60 100644 --- a/src/cgenerator.cpp +++ b/src/cgenerator.cpp @@ -4,11 +4,11 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "generator.h" +#include "recompiler/generator.h" struct BinaryOpFields { std::string func_string; std::string infix_string; }; -std::vector c_op_fields = []() { +static std::vector c_op_fields = []() { std::vector ret{}; ret.resize(static_cast(N64Recomp::BinaryOpType::COUNT)); std::vector ops_setup{}; @@ -45,9 +45,15 @@ std::vector c_op_fields = []() { setup_op(N64Recomp::BinaryOpType::Sra32, "S32", ">>"); // Arithmetic aspect will be taken care of by unary op for first operand. setup_op(N64Recomp::BinaryOpType::Sra64, "", ">>"); // Arithmetic aspect will be taken care of by unary op for first operand. 
setup_op(N64Recomp::BinaryOpType::Equal, "", "=="); + setup_op(N64Recomp::BinaryOpType::EqualFloat,"", "=="); + setup_op(N64Recomp::BinaryOpType::EqualDouble,"", "=="); setup_op(N64Recomp::BinaryOpType::NotEqual, "", "!="); setup_op(N64Recomp::BinaryOpType::Less, "", "<"); + setup_op(N64Recomp::BinaryOpType::LessFloat, "", "<"); + setup_op(N64Recomp::BinaryOpType::LessDouble,"", "<"); setup_op(N64Recomp::BinaryOpType::LessEq, "", "<="); + setup_op(N64Recomp::BinaryOpType::LessEqFloat,"", "<="); + setup_op(N64Recomp::BinaryOpType::LessEqDouble,"", "<="); setup_op(N64Recomp::BinaryOpType::Greater, "", ">"); setup_op(N64Recomp::BinaryOpType::GreaterEq, "", ">="); setup_op(N64Recomp::BinaryOpType::LD, "LD", ""); @@ -72,22 +78,22 @@ std::vector c_op_fields = []() { return ret; }(); -std::string gpr_to_string(int gpr_index) { +static std::string gpr_to_string(int gpr_index) { if (gpr_index == 0) { return "0"; } return fmt::format("ctx->r{}", gpr_index); } -std::string fpr_to_string(int fpr_index) { +static std::string fpr_to_string(int fpr_index) { return fmt::format("ctx->f{}.fl", fpr_index); } -std::string fpr_double_to_string(int fpr_index) { +static std::string fpr_double_to_string(int fpr_index) { return fmt::format("ctx->f{}.d", fpr_index); } -std::string fpr_u32l_to_string(int fpr_index) { +static std::string fpr_u32l_to_string(int fpr_index) { if (fpr_index & 1) { return fmt::format("ctx->f_odd[({} - 1) * 2]", fpr_index); } @@ -96,11 +102,11 @@ std::string fpr_u32l_to_string(int fpr_index) { } } -std::string fpr_u64_to_string(int fpr_index) { +static std::string fpr_u64_to_string(int fpr_index) { return fmt::format("ctx->f{}.u64", fpr_index); } -std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { +static std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { switch (context.reloc_type) { case N64Recomp::RelocType::R_MIPS_HI16: return fmt::format("{}RELOC_HI16({}, {:#X})", @@ -113,7 +119,7 @@ std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { } } -std::string signed_reloc(const N64Recomp::InstructionContext& context) { +static std::string signed_reloc(const N64Recomp::InstructionContext& context) { return "(int16_t)" + unsigned_reloc(context); } @@ -223,12 +229,6 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::ToU64: // Nothing to do here, they're already U64 break; - case UnaryOpType::NegateS32: - assert(false); - break; - case UnaryOpType::NegateS64: - assert(false); - break; case UnaryOpType::Lui: operand_string = "S32(" + operand_string + " << 16)"; break; @@ -241,7 +241,10 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::ToInt32: operand_string = "(int32_t)" + operand_string; break; - case UnaryOpType::Negate: + case UnaryOpType::NegateFloat: + operand_string = "-" + operand_string; + break; + case UnaryOpType::NegateDouble: operand_string = "-" + operand_string; break; case UnaryOpType::AbsFloat: @@ -292,24 +295,48 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::TruncateWFromD: operand_string = "TRUNC_W_D(" + operand_string + ")"; break; + case UnaryOpType::TruncateLFromS: + operand_string = "TRUNC_L_S(" + operand_string + ")"; + break; + case UnaryOpType::TruncateLFromD: + operand_string = "TRUNC_L_D(" + operand_string + ")"; + break; case UnaryOpType::RoundWFromS: operand_string = "lroundf(" + operand_string + ")"; break; case UnaryOpType::RoundWFromD: 
operand_string = "lround(" + operand_string + ")"; break; + case UnaryOpType::RoundLFromS: + operand_string = "llroundf(" + operand_string + ")"; + break; + case UnaryOpType::RoundLFromD: + operand_string = "llround(" + operand_string + ")"; + break; case UnaryOpType::CeilWFromS: operand_string = "S32(ceilf(" + operand_string + "))"; break; case UnaryOpType::CeilWFromD: operand_string = "S32(ceil(" + operand_string + "))"; break; + case UnaryOpType::CeilLFromS: + operand_string = "S64(ceilf(" + operand_string + "))"; + break; + case UnaryOpType::CeilLFromD: + operand_string = "S64(ceil(" + operand_string + "))"; + break; case UnaryOpType::FloorWFromS: operand_string = "S32(floorf(" + operand_string + "))"; break; case UnaryOpType::FloorWFromD: operand_string = "S32(floor(" + operand_string + "))"; break; + case UnaryOpType::FloorLFromS: + operand_string = "S64(floorf(" + operand_string + "))"; + break; + case UnaryOpType::FloorLFromD: + operand_string = "S64(floor(" + operand_string + "))"; + break; } } @@ -333,10 +360,10 @@ void N64Recomp::CGenerator::get_binary_expr_string(BinaryOpType type, const Bina expr_string = fmt::format("{} {} {} ? 1 : 0", input_a, infix_string, input_b); } else if (type == BinaryOpType::Equal && operands.operands[1] == Operand::Zero && operands.operand_operations[1] == UnaryOpType::None) { - expr_string = input_a; + expr_string = "!" + input_a; } else if (type == BinaryOpType::NotEqual && operands.operands[1] == Operand::Zero && operands.operand_operations[1] == UnaryOpType::None) { - expr_string = "!" + input_a; + expr_string = input_a; } // End unnecessary cases. @@ -365,7 +392,57 @@ void N64Recomp::CGenerator::get_binary_expr_string(BinaryOpType type, const Bina } } -void N64Recomp::CGenerator::emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::emit_function_start(const std::string& function_name, size_t func_index) const { + fmt::print(output_file, + "RECOMP_FUNC void {}(uint8_t* rdram, recomp_context* ctx) {{\n" + // these variables shouldn't need to be preserved across function boundaries, so make them local for more efficient output + " uint64_t hi = 0, lo = 0, result = 0;\n" + " int c1cs = 0;\n", // cop1 conditional signal + function_name); +} + +void N64Recomp::CGenerator::emit_function_end() const { + fmt::print(output_file, ";}}\n"); +} + +void N64Recomp::CGenerator::emit_function_call_lookup(uint32_t addr) const { + fmt::print(output_file, "LOOKUP_FUNC(0x{:08X})(rdram, ctx);\n", addr); +} + +void N64Recomp::CGenerator::emit_function_call_by_register(int reg) const { + fmt::print(output_file, "LOOKUP_FUNC({})(rdram, ctx);\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const { + (void)target_section_offset; + const N64Recomp::ReferenceSymbol& sym = context.get_reference_symbol(section_index, symbol_index); + fmt::print(output_file, "{}(rdram, ctx);\n", sym.name); +} + +void N64Recomp::CGenerator::emit_function_call(const Context& context, size_t function_index) const { + fmt::print(output_file, "{}(rdram, ctx);\n", context.functions[function_index].name); +} + +void N64Recomp::CGenerator::emit_named_function_call(const std::string& function_name) const { + fmt::print(output_file, "{}(rdram, ctx);\n", function_name); +} + +void N64Recomp::CGenerator::emit_goto(const std::string& target) const { + 
fmt::print(output_file, + " goto {};\n", target); +} + +void N64Recomp::CGenerator::emit_label(const std::string& label_name) const { + fmt::print(output_file, + "{}:\n", label_name); +} + +void N64Recomp::CGenerator::emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const { + std::string jump_variable = fmt::format("jr_addend_{:08X}", jtbl.jr_vram); + fmt::print(output_file, "gpr {} = {};\n", jump_variable, gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string expr_string{}; @@ -373,19 +450,114 @@ void N64Recomp::CGenerator::emit_branch_condition(std::ostream& output_file, con fmt::print(output_file, "if ({}) {{\n", expr_string); } -void N64Recomp::CGenerator::emit_branch_close(std::ostream& output_file) const { - fmt::print(output_file, " }}\n"); +void N64Recomp::CGenerator::emit_branch_close() const { + fmt::print(output_file, "}}\n"); } -void N64Recomp::CGenerator::emit_check_fr(std::ostream& output_file, int fpr) const { +void N64Recomp::CGenerator::emit_switch_close() const { + fmt::print(output_file, "}}\n"); +} + +void N64Recomp::CGenerator::emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const { + (void)recompiler_context; + (void)reg; + // TODO generate code to subtract the jump table address from the register's value instead. + // Once that's done, the addend temp can be deleted to simplify the generator interface. + std::string jump_variable = fmt::format("jr_addend_{:08X}", jtbl.jr_vram); + + fmt::print(output_file, "switch ({} >> 2) {{\n", jump_variable); +} + +void N64Recomp::CGenerator::emit_case(int case_index, const std::string& target_label) const { + fmt::print(output_file, "case {}: goto {}; break;\n", case_index, target_label); +} + +void N64Recomp::CGenerator::emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const { + fmt::print(output_file, "default: switch_error(__func__, 0x{:08X}, 0x{:08X});\n", instr_vram, jtbl_vram); +} + +void N64Recomp::CGenerator::emit_return() const { + fmt::print(output_file, "return;\n"); +} + +void N64Recomp::CGenerator::emit_check_fr(int fpr) const { fmt::print(output_file, "CHECK_FR(ctx, {});\n ", fpr); } -void N64Recomp::CGenerator::emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const { +void N64Recomp::CGenerator::emit_check_nan(int fpr, bool is_double) const { fmt::print(output_file, "NAN_CHECK(ctx->f{}.{}); ", fpr, is_double ? 
"d" : "fl"); } -void N64Recomp::CGenerator::process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::emit_cop0_status_read(int reg) const { + fmt::print(output_file, "{} = cop0_status_read(ctx);\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop0_status_write(int reg) const { + fmt::print(output_file, "cop0_status_write(ctx, {});", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop1_cs_read(int reg) const { + fmt::print(output_file, "{} = get_cop1_cs();\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop1_cs_write(int reg) const { + fmt::print(output_file, "set_cop1_cs({});\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_muldiv(InstrId instr_id, int reg1, int reg2) const { + switch (instr_id) { + case InstrId::cpu_mult: + fmt::print(output_file, "result = S64(S32({})) * S64(S32({})); lo = S32(result >> 0); hi = S32(result >> 32);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_dmult: + fmt::print(output_file, "DMULT(S64({}), S64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_multu: + fmt::print(output_file, "result = U64(U32({})) * U64(U32({})); lo = S32(result >> 0); hi = S32(result >> 32);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_dmultu: + fmt::print(output_file, "DMULTU(U64({}), U64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_div: + // Cast to 64-bits before division to prevent artihmetic exception for s32(0x80000000) / -1 + fmt::print(output_file, "lo = S32(S64(S32({0})) / S64(S32({1}))); hi = S32(S64(S32({0})) % S64(S32({1})));\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_ddiv: + fmt::print(output_file, "DDIV(S64({}), S64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_divu: + fmt::print(output_file, "lo = S32(U32({0}) / U32({1})); hi = S32(U32({0}) % U32({1}));\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_ddivu: + fmt::print(output_file, "DDIVU(U64({}), U64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + default: + assert(false); + break; + } +} + +void N64Recomp::CGenerator::emit_syscall(uint32_t instr_vram) const { + fmt::print(output_file, "recomp_syscall_handler(rdram, ctx, 0x{:08X});\n", instr_vram); +} + +void N64Recomp::CGenerator::emit_do_break(uint32_t instr_vram) const { + fmt::print(output_file, "do_break({});\n", instr_vram); +} + +void N64Recomp::CGenerator::emit_pause_self() const { + fmt::print(output_file, "pause_self(rdram);\n"); +} + +void N64Recomp::CGenerator::emit_trigger_event(uint32_t event_index) const { + fmt::print(output_file, "recomp_trigger_event(rdram, ctx, base_event_index + {});\n", event_index); +} + +void N64Recomp::CGenerator::emit_comment(const std::string& comment) const { + fmt::print(output_file, "// {}\n", comment); +} + +void N64Recomp::CGenerator::process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. 
thread_local std::string output{}; @@ -395,7 +567,7 @@ void N64Recomp::CGenerator::process_binary_op(std::ostream& output_file, const B fmt::print(output_file, "{} = {};\n", output, expression); } -void N64Recomp::CGenerator::process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string output{}; @@ -406,7 +578,7 @@ void N64Recomp::CGenerator::process_unary_op(std::ostream& output_file, const Un fmt::print(output_file, "{} = {};\n", output, input); } -void N64Recomp::CGenerator::process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::process_store_op(const StoreOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string base_str{}; diff --git a/src/config.cpp b/src/config.cpp index d3b236f..f191ba5 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -3,7 +3,7 @@ #include #include "fmt/format.h" #include "config.h" -#include "n64recomp.h" +#include "recompiler/context.h" std::filesystem::path concat_if_not_empty(const std::filesystem::path& parent, const std::filesystem::path& child) { if (!child.empty()) { @@ -375,7 +375,7 @@ N64Recomp::Config::Config(const char* path) { recomp_include = recomp_include_opt.value(); } else { - recomp_include = "#include \"librecomp/recomp.h\""; + recomp_include = "#include \"recomp.h\""; } std::optional funcs_per_file_opt = input_data["functions_per_output_file"].value(); diff --git a/src/elf.cpp b/src/elf.cpp index a18fdbd..d83908c 100644 --- a/src/elf.cpp +++ b/src/elf.cpp @@ -3,7 +3,7 @@ #include "fmt/format.h" // #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "elfio/elfio.hpp" bool read_symbols(N64Recomp::Context& context, const ELFIO::elfio& elf_file, ELFIO::section* symtab_section, const N64Recomp::ElfParsingConfig& elf_config, bool dumping_context, std::unordered_map>& data_syms) { diff --git a/src/main.cpp b/src/main.cpp index a2ccdc1..8a8fe91 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,7 +9,7 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "config.h" #include @@ -111,7 +111,7 @@ bool compare_files(const std::filesystem::path& file1_path, const std::filesyste return std::equal(begin1, std::istreambuf_iterator(), begin2); //Second argument is end-of-range iterator } -bool recompile_single_function(const N64Recomp::Context& context, const N64Recomp::Function& func, const std::string& recomp_include, const std::filesystem::path& output_path, std::span> static_funcs_out) { +bool recompile_single_function(const N64Recomp::Context& context, size_t func_index, const std::string& recomp_include, const std::filesystem::path& output_path, std::span> static_funcs_out) { // Open the temporary output file std::filesystem::path temp_path = output_path; temp_path.replace_extension(".tmp"); @@ -127,7 +127,7 @@ bool recompile_single_function(const N64Recomp::Context& context, const N64Recom "\n", recomp_include); - if 
(!N64Recomp::recompile_function(context, func, output_file, static_funcs_out, false)) { + if (!N64Recomp::recompile_function(context, func_index, output_file, static_funcs_out, false)) { return false; } @@ -725,7 +725,7 @@ int main(int argc, char** argv) { // Recompile the function. if (config.single_file_output || config.functions_per_output_file > 1) { - result = N64Recomp::recompile_function(context, func, current_output_file, static_funcs_by_section, false); + result = N64Recomp::recompile_function(context, i, current_output_file, static_funcs_by_section, false); if (!config.single_file_output) { cur_file_function_count++; if (cur_file_function_count >= config.functions_per_output_file) { @@ -734,7 +734,7 @@ int main(int argc, char** argv) { } } else { - result = recompile_single_function(context, func, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); + result = recompile_single_function(context, i, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); } if (result == false) { fmt::print(stderr, "Error recompiling {}\n", func.name); @@ -797,22 +797,25 @@ int main(int argc, char** argv) { std::vector insn_words((cur_func_end - static_func_addr) / sizeof(uint32_t)); insn_words.assign(func_rom_start, func_rom_start + insn_words.size()); - N64Recomp::Function func { + // Create the new function and add it to the context. + size_t new_func_index = context.functions.size(); + context.functions.emplace_back( static_func_addr, rom_addr, std::move(insn_words), fmt::format("static_{}_{:08X}", section_index, static_func_addr), static_cast(section_index), false - }; + ); + const N64Recomp::Function& new_func = context.functions[new_func_index]; fmt::print(func_header_file, - "void {}(uint8_t* rdram, recomp_context* ctx);\n", func.name); + "void {}(uint8_t* rdram, recomp_context* ctx);\n", new_func.name); bool result; - size_t prev_num_statics = static_funcs_by_section[func.section_index].size(); + size_t prev_num_statics = static_funcs_by_section[new_func.section_index].size(); if (config.single_file_output || config.functions_per_output_file > 1) { - result = N64Recomp::recompile_function(context, func, current_output_file, static_funcs_by_section, false); + result = N64Recomp::recompile_function(context, new_func_index, current_output_file, static_funcs_by_section, false); if (!config.single_file_output) { cur_file_function_count++; if (cur_file_function_count >= config.functions_per_output_file) { @@ -821,14 +824,14 @@ int main(int argc, char** argv) { } } else { - result = recompile_single_function(context, func, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); + result = recompile_single_function(context, new_func_index, config.recomp_include, config.output_func_path / (new_func.name + ".c"), static_funcs_by_section); } // Add any new static functions that were found while recompiling this one. 
- size_t cur_num_statics = static_funcs_by_section[func.section_index].size(); + size_t cur_num_statics = static_funcs_by_section[new_func.section_index].size(); if (cur_num_statics != prev_num_statics) { for (size_t new_static_index = prev_num_statics; new_static_index < cur_num_statics; new_static_index++) { - uint32_t new_static_vram = static_funcs_by_section[func.section_index][new_static_index]; + uint32_t new_static_vram = static_funcs_by_section[new_func.section_index][new_static_index]; if (!statics_set.contains(new_static_vram)) { statics_set.emplace(new_static_vram); @@ -838,7 +841,7 @@ int main(int argc, char** argv) { } if (result == false) { - fmt::print(stderr, "Error recompiling {}\n", func.name); + fmt::print(stderr, "Error recompiling {}\n", new_func.name); std::exit(EXIT_FAILURE); } } diff --git a/src/mod_symbols.cpp b/src/mod_symbols.cpp index 24675fe..fcfdead 100644 --- a/src/mod_symbols.cpp +++ b/src/mod_symbols.cpp @@ -1,6 +1,6 @@ #include -#include "n64recomp.h" +#include "recompiler/context.h" struct FileHeader { char magic[8]; // N64RSYMS diff --git a/src/operations.cpp b/src/operations.cpp index d73b278..70201d3 100644 --- a/src/operations.cpp +++ b/src/operations.cpp @@ -1,4 +1,4 @@ -#include "operations.h" +#include "recompiler/operations.h" namespace N64Recomp { const std::unordered_map unary_ops { @@ -12,8 +12,8 @@ namespace N64Recomp { // Float operations { InstrId::cpu_mov_s, { UnaryOpType::None, Operand::Fd, Operand::Fs, true } }, { InstrId::cpu_mov_d, { UnaryOpType::None, Operand::FdDouble, Operand::FsDouble, true } }, - { InstrId::cpu_neg_s, { UnaryOpType::Negate, Operand::Fd, Operand::Fs, true, true } }, - { InstrId::cpu_neg_d, { UnaryOpType::Negate, Operand::FdDouble, Operand::FsDouble, true, true } }, + { InstrId::cpu_neg_s, { UnaryOpType::NegateFloat, Operand::Fd, Operand::Fs, true, true } }, + { InstrId::cpu_neg_d, { UnaryOpType::NegateDouble, Operand::FdDouble, Operand::FsDouble, true, true } }, { InstrId::cpu_abs_s, { UnaryOpType::AbsFloat, Operand::Fd, Operand::Fs, true, true } }, { InstrId::cpu_abs_d, { UnaryOpType::AbsDouble, Operand::FdDouble, Operand::FsDouble, true, true } }, { InstrId::cpu_sqrt_s, { UnaryOpType::SqrtFloat, Operand::Fd, Operand::Fs, true, true } }, @@ -65,24 +65,22 @@ namespace N64Recomp { { InstrId::cpu_ori, { BinaryOpType::Or64, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rs, Operand::ImmU16 }}} }, { InstrId::cpu_xori, { BinaryOpType::Xor64, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rs, Operand::ImmU16 }}} }, // Shifts - /* BUG Should mask after (change op to Sll32 and input op to ToU32) */ - { InstrId::cpu_sllv, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, + { InstrId::cpu_sllv, { BinaryOpType::Sll32, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsllv, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_srlv, { BinaryOpType::Srl32, Operand::Rd, {{ UnaryOpType::ToU32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsrlv, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, - /* BUG Should mask after (change op to Sra32 and input op to ToS64) */ - { InstrId::cpu_srav, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, + // 
Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. + { InstrId::cpu_srav, { BinaryOpType::Sra32, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsrav, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, // Shifts (immediate) - /* BUG Should mask after (change op to Sll32 and input op to ToU32) */ - { InstrId::cpu_sll, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, + { InstrId::cpu_sll, { BinaryOpType::Sll32, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsll, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsll32, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, { InstrId::cpu_srl, { BinaryOpType::Srl32, Operand::Rd, {{ UnaryOpType::ToU32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsrl, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsrl32, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, - /* BUG should cast after (change op to Sra32 and input op to ToS64) */ - { InstrId::cpu_sra, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, + // Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. 
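A standalone check of what this reordering changes (not part of the patch; S32 and S64 here are plain sign-extension stand-ins for the macros the emitted code uses): the old mapping discarded the upper half of the register before shifting, while the new mapping lets bit 32 shift down into bit 31, matching the hardware behavior described in the comment above.

#include <cassert>
#include <cstdint>

static int64_t S32(int64_t v) { return (int32_t)v; }  // sign-extend the low 32 bits
static int64_t S64(int64_t v) { return v; }

int main() {
    int64_t rt = 0x0000000100000000;        // bit 32 set, low word all zero
    int64_t old_sra = S64(S32(rt)) >> 1;    // old mapping: upper half dropped first -> 0
    int64_t new_sra = S32(S64(rt) >> 1);    // new mapping: bit 32 bleeds into bit 31
    assert(old_sra == 0);
    assert(new_sra == -0x80000000LL);       // 0x80000000 sign-extended
    return 0;
}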
+ { InstrId::cpu_sra, { BinaryOpType::Sra32, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsra, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsra32, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, // Comparisons @@ -101,47 +99,47 @@ namespace N64Recomp { { InstrId::cpu_div_s, { BinaryOpType::DivFloat, Operand::Fd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true, true } }, { InstrId::cpu_div_d, { BinaryOpType::DivDouble, Operand::FdDouble, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true, true } }, // Float comparisons TODO remaining operations and investigate ordered/unordered and default values - { InstrId::cpu_c_lt_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_nge_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_olt_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ult_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_lt_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_nge_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_olt_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ult_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_lt_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_nge_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_olt_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ult_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_lt_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_nge_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_olt_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ult_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_le_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true 
} }, - { InstrId::cpu_c_ngt_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ole_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ule_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_le_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ngt_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ole_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ule_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_le_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ngt_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ole_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ule_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_le_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ngt_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ole_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ule_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_eq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ueq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ngl_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_seq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_eq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ueq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ngl_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_eq_s, { BinaryOpType::EqualFloat, 
Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ueq_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ngl_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_seq_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_eq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ueq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ngl_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, /* TODO rename to c_seq_d when fixed in rabbitizer */ - { InstrId::cpu_c_deq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_deq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, // Loads - { InstrId::cpu_ld, { BinaryOpType::LD, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lw, { BinaryOpType::LW, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwu, { BinaryOpType::LWU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lh, { BinaryOpType::LH, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lhu, { BinaryOpType::LHU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lb, { BinaryOpType::LB, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lbu, { BinaryOpType::LBU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldl, { BinaryOpType::LDL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldr, { BinaryOpType::LDR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwl, { BinaryOpType::LWL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwr, { BinaryOpType::LWR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwc1, { BinaryOpType::LW, Operand::FtU32L, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldc1, { BinaryOpType::LD, Operand::FtU64, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}, true } }, + { InstrId::cpu_ld, { BinaryOpType::LD, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lw, { BinaryOpType::LW, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { 
InstrId::cpu_lwu, { BinaryOpType::LWU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lh, { BinaryOpType::LH, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lhu, { BinaryOpType::LHU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lb, { BinaryOpType::LB, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lbu, { BinaryOpType::LBU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldl, { BinaryOpType::LDL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldr, { BinaryOpType::LDR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwl, { BinaryOpType::LWL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwr, { BinaryOpType::LWR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwc1, { BinaryOpType::LW, Operand::FtU32L, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldc1, { BinaryOpType::LD, Operand::FtU64, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}, true } }, }; const std::unordered_map conditional_branch_ops { @@ -159,10 +157,12 @@ namespace N64Recomp { { InstrId::cpu_bltzl, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, false, true }}, { InstrId::cpu_bgezal, { BinaryOpType::GreaterEq, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, false }}, { InstrId::cpu_bgezall, { BinaryOpType::GreaterEq, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, true }}, - { InstrId::cpu_bc1f, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, - { InstrId::cpu_bc1fl, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, - { InstrId::cpu_bc1t, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, - { InstrId::cpu_bc1tl, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, + { InstrId::cpu_bltzal, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, false }}, + { InstrId::cpu_bltzall, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, true }}, + { InstrId::cpu_bc1f, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, + { InstrId::cpu_bc1fl, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, + { InstrId::cpu_bc1t, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, + { InstrId::cpu_bc1tl, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, }; const std::unordered_map store_ops { diff --git 
a/src/recompilation.cpp b/src/recompilation.cpp index 1faef25..cf12c49 100644 --- a/src/recompilation.cpp +++ b/src/recompilation.cpp @@ -8,10 +8,10 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "analysis.h" -#include "operations.h" -#include "generator.h" +#include "recompiler/operations.h" +#include "recompiler/generator.h" enum class JalResolutionResult { NoMatch, @@ -28,7 +28,6 @@ JalResolutionResult resolve_jal(const N64Recomp::Context& context, size_t cur_se uint32_t section_vram_start = cur_section.ram_addr; uint32_t section_vram_end = cur_section.ram_addr + cur_section.size; bool in_current_section = target_func_vram >= section_vram_start && target_func_vram < section_vram_end; - bool needs_static = false; bool exact_match_found = false; // Use a thread local to prevent reallocation across runs and to allow multi-threading in the future. @@ -109,8 +108,8 @@ std::string_view ctx_gpr_prefix(int reg) { return ""; } -// Major TODO, this function grew very organically and needs to be cleaned up. Ideally, it'll get split up into some sort of lookup table grouped by similar instruction types. -bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Function& func, const N64Recomp::FunctionStats& stats, const std::unordered_set& skipped_insns, size_t instr_index, const std::vector& instructions, std::ofstream& output_file, bool indent, bool emit_link_branch, int link_branch_index, size_t reloc_index, bool& needs_link_branch, bool& is_branch_likely, bool tag_reference_relocs, std::span> static_funcs_out) { +template +bool process_instruction(GeneratorType& generator, const N64Recomp::Context& context, const N64Recomp::Function& func, const N64Recomp::FunctionStats& stats, const std::unordered_set& jtbl_lw_instructions, size_t instr_index, const std::vector& instructions, std::ostream& output_file, bool indent, bool emit_link_branch, int link_branch_index, size_t reloc_index, bool& needs_link_branch, bool& is_branch_likely, bool tag_reference_relocs, std::span> static_funcs_out) { using namespace N64Recomp; const auto& section = context.sections[func.section_index]; @@ -118,6 +117,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun needs_link_branch = false; is_branch_likely = false; uint32_t instr_vram = instr.getVram(); + InstrId instr_id = instr.getUniqueId(); auto print_indent = [&]() { fmt::print(output_file, " "); @@ -132,16 +132,20 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } // Output a comment with the original instruction - if (instr.isBranch() || instr.getUniqueId() == InstrId::cpu_j) { - fmt::print(output_file, " // 0x{:08X}: {}\n", instr_vram, instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric()))); - } else if (instr.getUniqueId() == InstrId::cpu_jal) { - fmt::print(output_file, " // 0x{:08X}: {}\n", instr_vram, instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric()))); + print_indent(); + if (instr.isBranch() || instr_id == InstrId::cpu_j) { + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric())))); + } else if (instr_id == InstrId::cpu_jal) { + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric())))); } else { - fmt::print(output_file, " // 0x{:08X}: {}\n", 
instr_vram, instr.disassemble(0)); + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0))); } - if (skipped_insns.contains(instr_vram)) { - return true; + // Replace loads for jump table entries into addiu. This leaves the jump table entry's address in the output register + // instead of the entry's value, which can then be used to determine the offset from the start of the jump table. + if (jtbl_lw_instructions.contains(instr_vram)) { + assert(instr_id == InstrId::cpu_lw); + instr_id = InstrId::cpu_addiu; } N64Recomp::RelocType reloc_type = N64Recomp::RelocType::R_MIPS_NONE; @@ -178,9 +182,9 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // Don't try to relocate special section symbols. if (context.is_regular_reference_section(reloc.target_section) || reloc_section == N64Recomp::SectionAbsolute) { bool ref_section_relocatable = context.is_reference_section_relocatable(reloc.target_section); - uint32_t ref_section_vram = context.get_reference_section_vram(reloc.target_section); // Resolve HI16 and LO16 reference symbol relocs to non-relocatable sections by patching the instruction immediate. if (!ref_section_relocatable && (reloc_type == N64Recomp::RelocType::R_MIPS_HI16 || reloc_type == N64Recomp::RelocType::R_MIPS_LO16)) { + uint32_t ref_section_vram = context.get_reference_section_vram(reloc.target_section); uint32_t full_immediate = reloc.target_section_offset + ref_section_vram; if (reloc_type == N64Recomp::RelocType::R_MIPS_HI16) { @@ -206,13 +210,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } } - auto print_line = [&](fmt::format_string fmt_str, Ts ...args) { - print_indent(); - fmt::vprint(output_file, fmt_str, fmt::make_format_args(args...)); - fmt::print(output_file, ";\n"); - }; - - auto print_unconditional_branch = [&](fmt::format_string fmt_str, Ts ...args) { + auto process_delay_slot = [&](bool use_indent) { if (instr_index < instructions.size() - 1) { bool dummy_needs_link_branch; bool dummy_is_branch_likely; @@ -221,56 +219,87 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { next_reloc_index++; } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, false, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { + if (!process_instruction(generator, context, func, stats, jtbl_lw_instructions, instr_index + 1, instructions, output_file, use_indent, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { return false; } } - print_indent(); - fmt::vprint(output_file, fmt_str, fmt::make_format_args(args...)); - if (needs_link_branch) { - fmt::print(output_file, ";\n goto after_{};\n", link_branch_index); - } else { - fmt::print(output_file, ";\n"); - } return true; }; - auto print_func_call = [reloc_target_section_offset, reloc_section, reloc_reference_symbol, reloc_type, &context, §ion, &func, &static_funcs_out, &needs_link_branch, &print_unconditional_branch] - (uint32_t target_func_vram, bool link_branch = true, bool indent = false) + auto print_link_branch = [&]() { + if (needs_link_branch) { + print_indent(); + generator.emit_goto(fmt::format("after_{}", link_branch_index)); + } + }; + + auto print_return_with_delay_slot = 
[&]() { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_return(); + print_link_branch(); + return true; + }; + + auto print_goto_with_delay_slot = [&](const std::string& target) { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_goto(target); + print_link_branch(); + return true; + }; + + auto print_func_call_by_register = [&](int reg) { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_function_call_by_register(reg); + print_link_branch(); + return true; + }; + + auto print_func_call_by_address = [&generator, reloc_target_section_offset, reloc_section, reloc_reference_symbol, reloc_type, &context, &func, &static_funcs_out, &needs_link_branch, &print_indent, &process_delay_slot, &print_link_branch] + (uint32_t target_func_vram, bool tail_call = false, bool indent = false) { + bool call_by_lookup = false; + bool call_by_name = false; // Event symbol, emit a call to the runtime to trigger this event. if (reloc_section == N64Recomp::SectionEvent) { - needs_link_branch = link_branch; + needs_link_branch = !tail_call; if (indent) { - if (!print_unconditional_branch(" recomp_trigger_event(rdram, ctx, base_event_index + {})", reloc_reference_symbol)) { - return false; - } - } else { - if (!print_unconditional_branch("recomp_trigger_event(rdram, ctx, base_event_index + {})", reloc_reference_symbol)) { - return false; - } + print_indent(); } + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_trigger_event((uint32_t)reloc_reference_symbol); + print_link_branch(); } // Normal symbol or reference symbol, else { std::string jal_target_name{}; + size_t matched_func_index = (size_t)-1; if (reloc_reference_symbol != (size_t)-1) { - const auto& ref_symbol = context.get_reference_symbol(reloc_section, reloc_reference_symbol); - if (reloc_type != N64Recomp::RelocType::R_MIPS_26) { fmt::print(stderr, "Unsupported reloc type {} on jal instruction in {}\n", (int)reloc_type, func.name); return false; } - if (ref_symbol.section_offset != reloc_target_section_offset) { - fmt::print(stderr, "Function {} uses a MIPS_R_26 addend, which is not supported yet\n", func.name); - return false; + if (!context.skip_validating_reference_symbols) { + const auto& ref_symbol = context.get_reference_symbol(reloc_section, reloc_reference_symbol); + if (ref_symbol.section_offset != reloc_target_section_offset) { + fmt::print(stderr, "Function {} uses a MIPS_R_26 addend, which is not supported yet\n", func.name); + return false; + } } - - jal_target_name = ref_symbol.name; } else { - size_t matched_func_index = 0; JalResolutionResult jal_result = resolve_jal(context, func.section_index, target_func_vram, matched_func_index); switch (jal_result) { @@ -284,65 +313,78 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // Create a static function add it to the static function list for this section. 
jal_target_name = fmt::format("static_{}_{:08X}", func.section_index, target_func_vram); static_funcs_out[func.section_index].push_back(target_func_vram); + call_by_name = true; break; case JalResolutionResult::Ambiguous: fmt::print(stderr, "[Info] Ambiguous jal target 0x{:08X} in function {}, falling back to function lookup\n", target_func_vram, func.name); // Relocation isn't necessary for jumps inside a relocatable section, as this code path will never run if the target vram // is in the current function's section (see the branch for `in_current_section` above). // If a game ever needs to jump between multiple relocatable sections, relocation will be necessary here. - jal_target_name = fmt::format("LOOKUP_FUNC(0x{:08X})", target_func_vram); + call_by_lookup = true; break; case JalResolutionResult::Error: fmt::print(stderr, "Internal error when resolving jal to address 0x{:08X} in function {}. Please report this issue.\n", target_func_vram, func.name); return false; } } - needs_link_branch = link_branch; + needs_link_branch = !tail_call; if (indent) { - if (!print_unconditional_branch(" {}(rdram, ctx)", jal_target_name)) { - return false; - } - } else { - if (!print_unconditional_branch("{}(rdram, ctx)", jal_target_name)) { - return false; - } + print_indent(); } + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + if (reloc_reference_symbol != (size_t)-1) { + generator.emit_function_call_reference_symbol(context, reloc_section, reloc_reference_symbol, reloc_target_section_offset); + } + else if (call_by_lookup) { + generator.emit_function_call_lookup(target_func_vram); + } + else if (call_by_name) { + generator.emit_named_function_call(jal_target_name); + } + else { + generator.emit_function_call(context, matched_func_index); + } + print_link_branch(); } return true; }; auto print_branch = [&](uint32_t branch_target) { + // If the branch target is outside the current function, check if it can be treated as a tail call. if (branch_target < func.vram || branch_target >= func_vram_end) { + // If the branch target is the start of some known function, this can be handled as a tail call. // FIXME: how to deal with static functions? if (context.functions_by_vram.find(branch_target) != context.functions_by_vram.end()) { fmt::print("Tail call in {} to 0x{:08X}\n", func.name, branch_target); - if (!print_func_call(branch_target, false, true)) { + if (!print_func_call_by_address(branch_target, true, true)) { return false; } - print_line(" return"); - fmt::print(output_file, " }}\n"); + print_indent(); + generator.emit_return(); + // TODO check if this branch close should exist. 
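For reference, these are the three call shapes the jal resolution above can produce in the C backend's output (the function names and address are invented): an exact match becomes a direct call, an ambiguous target goes through the runtime lookup table, and an unknown target gets a named static placeholder that is recompiled afterwards.

func_80123456(rdram, ctx);             // exact match: emit_function_call
LOOKUP_FUNC(0x80123456)(rdram, ctx);   // ambiguous target: emit_function_call_lookup
static_3_80123456(rdram, ctx);         // no match: emit_named_function_call on the placeholder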
+ // print_indent(); + // generator.emit_branch_close(); return true; } fmt::print(stderr, "[Warn] Function {} is branching outside of the function (to 0x{:08X})\n", func.name, branch_target); } - if (instr_index < instructions.size() - 1) { - bool dummy_needs_link_branch; - bool dummy_is_branch_likely; - size_t next_reloc_index = reloc_index; - uint32_t next_vram = instr_vram + 4; - if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { - next_reloc_index++; - } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, true, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { - return false; - } + if (!process_delay_slot(true)) { + return false; } - fmt::print(output_file, " goto L_{:08X};\n", branch_target); + print_indent(); + print_indent(); + generator.emit_goto(fmt::format("L_{:08X}", branch_target)); + // TODO check if this link branch ever exists. if (needs_link_branch) { - fmt::print(output_file, " goto after_{};\n", link_branch_index); + print_indent(); + print_indent(); + generator.emit_goto(fmt::format("after_{}", link_branch_index)); } return true; }; @@ -353,7 +395,6 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun int rd = (int)instr.GetO32_rd(); int rs = (int)instr.GetO32_rs(); - int base = rs; int rt = (int)instr.GetO32_rt(); int sa = (int)instr.Get_sa(); @@ -365,7 +406,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun bool handled = true; - switch (instr.getUniqueId()) { + switch (instr_id) { case InstrId::cpu_nop: fmt::print(output_file, "\n"); break; @@ -375,7 +416,8 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun Cop0Reg reg = instr.Get_cop0d(); switch (reg) { case Cop0Reg::COP0_Status: - print_line("{}{} = cop0_status_read(ctx)", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop0_status_read(rt); break; default: fmt::print(stderr, "Unhandled cop0 register in mfc0: {}\n", (int)reg); @@ -388,7 +430,8 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun Cop0Reg reg = instr.Get_cop0d(); switch (reg) { case Cop0Reg::COP0_Status: - print_line("cop0_status_write(ctx, {}{})", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop0_status_write(rt); break; default: fmt::print(stderr, "Unhandled cop0 register in mtc0: {}\n", (int)reg); @@ -408,38 +451,25 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // If so, create a temp to preserve the addend register's value if (find_result != stats.jump_tables.end()) { const N64Recomp::JumpTable& cur_jtbl = *find_result; - print_line("gpr jr_addend_{:08X} = {}{}", cur_jtbl.jr_vram, ctx_gpr_prefix(cur_jtbl.addend_reg), cur_jtbl.addend_reg); + print_indent(); + generator.emit_jtbl_addend_declaration(cur_jtbl, cur_jtbl.addend_reg); } } break; case InstrId::cpu_mult: - print_line("result = S64(S32({}{})) * S64(S32({}{})); lo = S32(result >> 0); hi = S32(result >> 32)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_dmult: - print_line("DMULT(S64({}{}), S64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_multu: - print_line("result = U64(U32({}{})) * U64(U32({}{})); lo = S32(result >> 0); hi = S32(result >> 32)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_dmultu: - 
print_line("DMULTU(U64({}{}), U64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_div: - // Cast to 64-bits before division to prevent artihmetic exception for s32(0x80000000) / -1 - print_line("lo = S32(S64(S32({}{})) / S64(S32({}{}))); hi = S32(S64(S32({}{})) % S64(S32({}{})))", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_ddiv: - print_line("DDIV(S64({}{}), S64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_divu: - print_line("lo = S32(U32({}{}) / U32({}{})); hi = S32(U32({}{}) % U32({}{}))", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_ddivu: - print_line("DDIVU(U64({}{}), U64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_muldiv(instr_id, rs, rt); break; // Branches case InstrId::cpu_jal: - if (!print_func_call(instr.getBranchVramGeneric())) { + if (!print_func_call_by_address(instr.getBranchVramGeneric())) { return false; } break; @@ -450,18 +480,19 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun return false; } needs_link_branch = true; - print_unconditional_branch("LOOKUP_FUNC({}{})(rdram, ctx)", ctx_gpr_prefix(rs), rs); + print_func_call_by_register(rs); break; case InstrId::cpu_j: case InstrId::cpu_b: { uint32_t branch_target = instr.getBranchVramGeneric(); if (branch_target == instr_vram) { - print_line("pause_self(rdram)"); + print_indent(); + generator.emit_pause_self(); } // Check if the branch is within this function else if (branch_target >= func.vram && branch_target < func_vram_end) { - print_unconditional_branch("goto L_{:08X}", branch_target); + print_goto_with_delay_slot(fmt::format("L_{:08X}", branch_target)); } // This may be a tail call in the middle of the control flow due to a previous check // For example: @@ -476,11 +507,12 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // ``` // FIXME: how to deal with static functions? 
else if (context.functions_by_vram.find(branch_target) != context.functions_by_vram.end()) { - fmt::print("Tail call in {} to 0x{:08X}\n", func.name, branch_target); - if (!print_func_call(branch_target, false)) { + fmt::print("[Info] Tail call in {} to 0x{:08X}\n", func.name, branch_target); + if (!print_func_call_by_address(branch_target, true)) { return false; } - print_line("return"); + print_indent(); + generator.emit_return(); } else { fmt::print(stderr, "Unhandled branch in {} at 0x{:08X} to 0x{:08X}\n", func.name, instr_vram, branch_target); @@ -490,7 +522,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun break; case InstrId::cpu_jr: if (rs == (int)rabbitizer::Registers::Cpu::GprO32::GPR_O32_ra) { - print_unconditional_branch("return"); + print_return_with_delay_slot(); } else { auto jtbl_find_result = std::find_if(stats.jump_tables.begin(), stats.jump_tables.end(), [instr_vram](const N64Recomp::JumpTable& jtbl) { @@ -499,58 +531,41 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun if (jtbl_find_result != stats.jump_tables.end()) { const N64Recomp::JumpTable& cur_jtbl = *jtbl_find_result; - bool dummy_needs_link_branch, dummy_is_branch_likely; - size_t next_reloc_index = reloc_index; - uint32_t next_vram = instr_vram + 4; - if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { - next_reloc_index++; - } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, false, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { + if (!process_delay_slot(false)) { return false; } print_indent(); - fmt::print(output_file, "switch (jr_addend_{:08X} >> 2) {{\n", cur_jtbl.jr_vram); + generator.emit_switch(context, cur_jtbl, rs); for (size_t entry_index = 0; entry_index < cur_jtbl.entries.size(); entry_index++) { print_indent(); - print_line("case {}: goto L_{:08X}; break", entry_index, cur_jtbl.entries[entry_index]); + print_indent(); + generator.emit_case(entry_index, fmt::format("L_{:08X}", cur_jtbl.entries[entry_index])); } print_indent(); - print_line("default: switch_error(__func__, 0x{:08X}, 0x{:08X})", instr_vram, cur_jtbl.vram); print_indent(); - fmt::print(output_file, "}}\n"); + generator.emit_switch_error(instr_vram, cur_jtbl.vram); + print_indent(); + generator.emit_switch_close(); break; } - auto jump_find_result = std::find_if(stats.absolute_jumps.begin(), stats.absolute_jumps.end(), - [instr_vram](const N64Recomp::AbsoluteJump& jump) { - return jump.instruction_vram == instr_vram; - }); - - if (jump_find_result != stats.absolute_jumps.end()) { - print_unconditional_branch("LOOKUP_FUNC({})(rdram, ctx)", (uint64_t)(int32_t)jump_find_result->jump_target); - // jr doesn't link so it acts like a tail call, meaning we should return directly after the jump returns - print_line("return"); - break; - } - - bool is_tail_call = instr_vram == func_vram_end - 2 * sizeof(func.words[0]); - if (is_tail_call) { - fmt::print("Indirect tail call in {}\n", func.name); - print_unconditional_branch("LOOKUP_FUNC({}{})(rdram, ctx)", ctx_gpr_prefix(rs), rs); - print_line("return"); - break; - } - - fmt::print(stderr, "No jump table found for jr at 0x{:08X} and not tail call\n", instr_vram); + fmt::print("[Info] Indirect tail call in {}\n", func.name); + print_func_call_by_register(rs); + print_indent(); + generator.emit_return(); + break; } break; case 
InstrId::cpu_syscall: - print_line("recomp_syscall_handler(rdram, ctx, 0x{:08X})", instr_vram); + print_indent(); + generator.emit_syscall(instr_vram); // syscalls don't link, so treat it like a tail call - print_line("return"); + print_indent(); + generator.emit_return(); break; case InstrId::cpu_break: - print_line("do_break({})", instr_vram); + print_indent(); + generator.emit_do_break(instr_vram); break; // Cop1 rounding mode @@ -559,21 +574,22 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun fmt::print(stderr, "Invalid FP control register for ctc1: {}\n", cop1_cs); return false; } - print_line("rounding_mode = ({}{}) & 0x3", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop1_cs_write(rt); break; case InstrId::cpu_cfc1: if (cop1_cs != 31) { fmt::print(stderr, "Invalid FP control register for cfc1: {}\n", cop1_cs); return false; } - print_line("{}{} = rounding_mode", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop1_cs_read(rt); break; default: handled = false; break; } - CGenerator generator{}; InstructionContext instruction_context{}; instruction_context.rd = rd; instruction_context.rs = rs; @@ -589,28 +605,28 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun instruction_context.reloc_section_index = reloc_section; instruction_context.reloc_target_section_offset = reloc_target_section_offset; - auto do_check_fr = [](std::ostream& output_file, const CGenerator& generator, const InstructionContext& ctx, Operand operand) { + auto do_check_fr = [](const GeneratorType& generator, const InstructionContext& ctx, Operand operand) { switch (operand) { case Operand::Fd: case Operand::FdDouble: case Operand::FdU32L: case Operand::FdU32H: case Operand::FdU64: - generator.emit_check_fr(output_file, ctx.fd); + generator.emit_check_fr(ctx.fd); break; case Operand::Fs: case Operand::FsDouble: case Operand::FsU32L: case Operand::FsU32H: case Operand::FsU64: - generator.emit_check_fr(output_file, ctx.fs); + generator.emit_check_fr(ctx.fs); break; case Operand::Ft: case Operand::FtDouble: case Operand::FtU32L: case Operand::FtU32H: case Operand::FtU64: - generator.emit_check_fr(output_file, ctx.ft); + generator.emit_check_fr(ctx.ft); break; default: // No MIPS3 float check needed for non-float operands. @@ -618,25 +634,25 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } }; - auto do_check_nan = [](std::ostream& output_file, const CGenerator& generator, const InstructionContext& ctx, Operand operand) { + auto do_check_nan = [](const GeneratorType& generator, const InstructionContext& ctx, Operand operand) { switch (operand) { case Operand::Fd: - generator.emit_check_nan(output_file, ctx.fd, false); + generator.emit_check_nan(ctx.fd, false); break; case Operand::Fs: - generator.emit_check_nan(output_file, ctx.fs, false); + generator.emit_check_nan(ctx.fs, false); break; case Operand::Ft: - generator.emit_check_nan(output_file, ctx.ft, false); + generator.emit_check_nan(ctx.ft, false); break; case Operand::FdDouble: - generator.emit_check_nan(output_file, ctx.fd, true); + generator.emit_check_nan(ctx.fd, true); break; case Operand::FsDouble: - generator.emit_check_nan(output_file, ctx.fs, true); + generator.emit_check_nan(ctx.fs, true); break; case Operand::FtDouble: - generator.emit_check_nan(output_file, ctx.ft, true); + generator.emit_check_nan(ctx.ft, true); break; default: // No NaN checks needed for non-float operands. 
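// --- Illustrative aside (not part of the patch) -----------------------------------
// The jr jump-table path earlier in this hunk now builds its switch through four generator
// calls: emit_switch, emit_case, emit_switch_error and emit_switch_close. Reconstructed
// from the removed inline output, a C-output backend for those calls could look roughly
// like this; the sketch_* names are placeholders and the real CGenerator may differ in
// details such as indentation.
#include <cstddef>
#include <cstdint>
#include <ostream>
#include "fmt/format.h"
#include "fmt/ostream.h"

static void sketch_emit_switch(std::ostream& out, uint32_t jr_vram) {
    // The addend register was saved earlier as jr_addend_<vram>; each table entry is 4 bytes.
    fmt::print(out, "switch (jr_addend_{:08X} >> 2) {{\n", jr_vram);
}

static void sketch_emit_case(std::ostream& out, size_t entry_index, uint32_t entry_vram) {
    fmt::print(out, "case {}: goto L_{:08X}; break;\n", entry_index, entry_vram);
}

static void sketch_emit_switch_error(std::ostream& out, uint32_t instr_vram, uint32_t jtbl_vram) {
    // An out-of-range index reports the faulting jr and the jump table it indexed.
    fmt::print(out, "default: switch_error(__func__, 0x{:08X}, 0x{:08X});\n", instr_vram, jtbl_vram);
}

static void sketch_emit_switch_close(std::ostream& out) {
    fmt::print(out, "}}\n");
}
// -----------------------------------------------------------------------------------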
@@ -644,54 +660,58 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } }; - auto find_binary_it = binary_ops.find(instr.getUniqueId()); + auto find_binary_it = binary_ops.find(instr_id); if (find_binary_it != binary_ops.end()) { print_indent(); const BinaryOp& op = find_binary_it->second; if (op.check_fr) { - do_check_fr(output_file, generator, instruction_context, op.output); - do_check_fr(output_file, generator, instruction_context, op.operands.operands[0]); - do_check_fr(output_file, generator, instruction_context, op.operands.operands[1]); + do_check_fr(generator, instruction_context, op.output); + do_check_fr(generator, instruction_context, op.operands.operands[0]); + do_check_fr(generator, instruction_context, op.operands.operands[1]); } if (op.check_nan) { - do_check_nan(output_file, generator, instruction_context, op.operands.operands[0]); - do_check_nan(output_file, generator, instruction_context, op.operands.operands[1]); - fmt::print(output_file, "\n "); + do_check_nan(generator, instruction_context, op.operands.operands[0]); + do_check_nan(generator, instruction_context, op.operands.operands[1]); + fmt::print(output_file, "\n"); + print_indent(); } - generator.process_binary_op(output_file, op, instruction_context); + generator.process_binary_op(op, instruction_context); handled = true; } - auto find_unary_it = unary_ops.find(instr.getUniqueId()); + auto find_unary_it = unary_ops.find(instr_id); if (find_unary_it != unary_ops.end()) { print_indent(); const UnaryOp& op = find_unary_it->second; if (op.check_fr) { - do_check_fr(output_file, generator, instruction_context, op.output); - do_check_fr(output_file, generator, instruction_context, op.input); + do_check_fr(generator, instruction_context, op.output); + do_check_fr(generator, instruction_context, op.input); } if (op.check_nan) { - do_check_nan(output_file, generator, instruction_context, op.input); - fmt::print(output_file, "\n "); + do_check_nan(generator, instruction_context, op.input); + fmt::print(output_file, "\n"); + print_indent(); } - generator.process_unary_op(output_file, op, instruction_context); + generator.process_unary_op(op, instruction_context); handled = true; } - auto find_conditional_branch_it = conditional_branch_ops.find(instr.getUniqueId()); + auto find_conditional_branch_it = conditional_branch_ops.find(instr_id); if (find_conditional_branch_it != conditional_branch_ops.end()) { print_indent(); - generator.emit_branch_condition(output_file, find_conditional_branch_it->second, instruction_context); + // TODO combining the branch condition and branch target into one generator call would allow better optimization in the runtime's JIT generator. + // This would require splitting into a conditional jump method and conditional function call method. 
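// --- Illustrative aside (not part of the patch) -----------------------------------
// Throughout this hunk the std::ostream& argument disappears from the emit_* and
// process_*_op calls: the output stream is bound to the generator once at construction
// (the wrapper added at the end of this file builds CGenerator{output_file}), so call
// sites only name the operation. A minimal sketch of that ownership pattern, using the
// pause_self output visible in the removed lines; the class and method set here are
// placeholders, not the real generator interface added by this PR.
#include <ostream>
#include "fmt/format.h"
#include "fmt/ostream.h"

class SketchCOutputGenerator {
public:
    explicit SketchCOutputGenerator(std::ostream& out) : output_file_(out) {}

    // Emit methods write to the stream captured at construction instead of taking one.
    void emit_pause_self() const {
        fmt::print(output_file_, "pause_self(rdram);\n");
    }

private:
    std::ostream& output_file_;
};
// -----------------------------------------------------------------------------------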
+ generator.emit_branch_condition(find_conditional_branch_it->second, instruction_context); print_indent(); if (find_conditional_branch_it->second.link) { - if (!print_func_call(instr.getBranchVramGeneric())) { + if (!print_func_call_by_address(instr.getBranchVramGeneric())) { return false; } } @@ -701,22 +721,23 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } } - generator.emit_branch_close(output_file); + print_indent(); + generator.emit_branch_close(); is_branch_likely = find_conditional_branch_it->second.likely; handled = true; } - auto find_store_it = store_ops.find(instr.getUniqueId()); + auto find_store_it = store_ops.find(instr_id); if (find_store_it != store_ops.end()) { print_indent(); const StoreOp& op = find_store_it->second; if (op.type == StoreOpType::SDC1) { - do_check_fr(output_file, generator, instruction_context, op.value_input); + do_check_fr(generator, instruction_context, op.value_input); } - generator.process_store_op(output_file, op, instruction_context); + generator.process_store_op(op, instruction_context); handled = true; } @@ -727,23 +748,20 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // TODO is this used? if (emit_link_branch) { - fmt::print(output_file, " after_{}:\n", link_branch_index); + print_indent(); + generator.emit_label(fmt::format("after_{}", link_branch_index)); } return true; } -bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64Recomp::Function& func, std::ofstream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) { +template <typename GeneratorType> +bool recompile_function_impl(GeneratorType& generator, const N64Recomp::Context& context, size_t func_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) { + const N64Recomp::Function& func = context.functions[func_index]; //fmt::print("Recompiling {}\n", func.name); std::vector<rabbitizer::InstructionCpu> instructions; - fmt::print(output_file, - "RECOMP_FUNC void {}(uint8_t* rdram, recomp_context* ctx) {{\n" - // these variables shouldn't need to be preserved across function boundaries, so make them local for more efficient output - " uint64_t hi = 0, lo = 0, result = 0;\n" - " unsigned int rounding_mode = DEFAULT_ROUNDING_MODE;\n" - " int c1cs = 0;\n", // cop1 conditional signal - func.name); + generator.emit_function_start(func.name, func_index); if (context.trace_mode) { fmt::print(output_file, @@ -784,11 +802,11 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R return false; } - std::unordered_set<uint32_t> skipped_insns{}; + std::unordered_set<uint32_t> jtbl_lw_instructions{}; // Add jump table labels into function for (const auto& jtbl : stats.jump_tables) { - skipped_insns.insert(jtbl.lw_vram); + jtbl_lw_instructions.insert(jtbl.lw_vram); for (uint32_t jtbl_entry : jtbl.entries) { branch_labels.insert(jtbl_entry); } } @@ -808,11 +826,11 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R bool is_branch_likely = false; // If we're in the delay slot of a likely instruction, emit a goto to skip the instruction before any labels if (in_likely_delay_slot) { - fmt::print(output_file, " goto skip_{};\n", num_likely_branches); + generator.emit_goto(fmt::format("skip_{}", num_likely_branches)); } // If there are any other branch labels to insert and we're at the next one, insert it if (cur_label != branch_labels.end() && vram >= *cur_label) { - fmt::print(output_file, "L_{:08X}:\n", *cur_label); + generator.emit_label(fmt::format("L_{:08X}", 
*cur_label)); ++cur_label; } @@ -822,7 +840,7 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R } // Process the current instruction and check for errors - if (process_instruction(context, func, stats, skipped_insns, instr_index, instructions, output_file, false, needs_link_branch, num_link_branches, reloc_index, needs_link_branch, is_branch_likely, tag_reference_relocs, static_funcs_out) == false) { + if (process_instruction(generator, context, func, stats, jtbl_lw_instructions, instr_index, instructions, output_file, false, needs_link_branch, num_link_branches, reloc_index, needs_link_branch, is_branch_likely, tag_reference_relocs, static_funcs_out) == false) { fmt::print(stderr, "Error in recompiling {}, clearing output file\n", func.name); output_file.clear(); return false; @@ -833,7 +851,8 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R } // Now that the instruction has been processed, emit a skip label for the likely branch if needed if (in_likely_delay_slot) { - fmt::print(output_file, " skip_{}:\n", num_likely_branches); + fmt::print(output_file, " "); + generator.emit_label(fmt::format("skip_{}", num_likely_branches)); num_likely_branches++; } // Mark the next instruction as being in a likely delay slot if the @@ -844,7 +863,17 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R } // Terminate the function - fmt::print(output_file, ";}}\n"); + generator.emit_function_end(); return true; } + +// Wrap the templated function with CGenerator as the template parameter. +bool N64Recomp::recompile_function(const N64Recomp::Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) { + CGenerator generator{output_file}; + return recompile_function_impl(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs); +} + +bool N64Recomp::recompile_function_custom(Generator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) { + return recompile_function_impl(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs); +} diff --git a/src/symbol_lists.cpp b/src/symbol_lists.cpp index 4b4eff9..cbe5ff5 100644 --- a/src/symbol_lists.cpp +++ b/src/symbol_lists.cpp @@ -1,4 +1,4 @@ -#include "n64recomp.h" +#include "recompiler/context.h" const std::unordered_set<std::string> N64Recomp::reimplemented_funcs { // OS initialize functions
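// --- Illustrative aside (not part of the patch) -----------------------------------
// Usage sketch for the two entry points added above: recompile_function() drives the
// C backend by function index, while recompile_function_custom() accepts any Generator
// implementation (for example the live recompiler's). sketch_recompile_all and the loop
// body are invented for this example, and passing true for tag_reference_relocs is just
// one possible choice.
#include <cstdint>
#include <fstream>
#include <span>
#include <vector>
#include "recompiler/context.h"

static bool sketch_recompile_all(const N64Recomp::Context& context, std::ofstream& output_file) {
    // One static-function list per section, matching what static_funcs_out expects.
    std::vector<std::vector<uint32_t>> static_funcs(context.sections.size());
    for (size_t func_index = 0; func_index < context.functions.size(); func_index++) {
        if (!N64Recomp::recompile_function(context, func_index, output_file,
                                           std::span{static_funcs}, true)) {
            return false;
        }
    }
    return true;
}
// -----------------------------------------------------------------------------------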