diff --git a/.gitignore b/.gitignore
index 13749d1..014e033 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,8 +6,8 @@
 *.elf
 *.z64
 
-# Output C files
-test/funcs
+# Local working data
+tests
 
 # Linux build output
 build/
@@ -42,12 +42,6 @@ bld/
 # Visual Studio 2015/2017 cache/options directory
 .vs/
 
-# Libraries (binaries that aren't in the repo)
-test/Lib
-
-# RT64 (since it's not public yet)
-test/RT64
-
 # Runtime files
 imgui.ini
 rt64.log
diff --git a/.gitmodules b/.gitmodules
index 2d7b930..1369f13 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "lib/tomlplusplus"]
 	path = lib/tomlplusplus
 	url = https://github.com/marzer/tomlplusplus
+[submodule "lib/sljit"]
+	path = lib/sljit
+	url = https://github.com/zherczeg/sljit
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2733666..7fc7581 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,3 +164,32 @@ target_sources(OfflineModRecomp PRIVATE
 )
 
 target_link_libraries(OfflineModRecomp fmt rabbitizer tomlplusplus::tomlplusplus N64Recomp)
+
+# Live recompiler
+project(LiveRecomp)
+add_library(LiveRecomp)
+
+target_sources(LiveRecomp PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/LiveRecomp/live_generator.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src/sljitLir.c
+)
+
+target_include_directories(LiveRecomp PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src
+)
+
+target_link_libraries(LiveRecomp N64Recomp)
+
+# Live recompiler test
+project(LiveRecompTest)
+add_executable(LiveRecompTest)
+
+target_sources(LiveRecompTest PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/LiveRecomp/live_recompiler_test.cpp
+)
+
+target_include_directories(LiveRecompTest PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/lib/sljit/sljit_src
+)
+
+target_link_libraries(LiveRecompTest LiveRecomp)
diff --git a/LiveRecomp/live_generator.cpp b/LiveRecomp/live_generator.cpp
new file mode 100644
index 0000000..48c5dc6
--- /dev/null
+++ b/LiveRecomp/live_generator.cpp
@@ -0,0 +1,1865 @@
+#include <cassert>
+#include <cstring>
+#include <cmath>
+#include <vector>
+
+#include "fmt/format.h"
+#include "fmt/ostream.h"
+
+#include "recompiler/live_recompiler.h"
+#include "recomp.h"
+
+#include "sljitLir.h"
+
+static_assert(sizeof(void*) >= sizeof(sljit_uw), "`void*` must be able to hold a `sljit_uw` value for rewritable jumps!");
+
+constexpr uint64_t rdram_offset = 0xFFFFFFFF80000000ULL;
+
+void N64Recomp::live_recompiler_init() {
+    RabbitizerConfig_Cfg.pseudos.pseudoMove = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBeqz = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBnez = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoNot = false;
+    RabbitizerConfig_Cfg.pseudos.pseudoBal = false;
+}
+
+namespace Registers {
+    constexpr int rdram = SLJIT_S0; // stores (rdram - rdram_offset)
+    constexpr int ctx = SLJIT_S1; // stores ctx
+    constexpr int c1cs = SLJIT_S2; // stores the cop1 condition signal
+    constexpr int hi = SLJIT_S3; // stores the hi register
+    constexpr int lo = SLJIT_S4; // stores the lo register
+    constexpr int arithmetic_temp1 = SLJIT_R0;
+    constexpr int arithmetic_temp2 = SLJIT_R1;
+    constexpr int arithmetic_temp3 = SLJIT_R2;
+    constexpr int arithmetic_temp4 = SLJIT_R3;
+}
+
+struct InnerCall {
+    size_t target_func_index;
+    sljit_jump* jump;
+};
+
+struct ReferenceSymbolCall {
+    N64Recomp::SymbolReference reference;
+    sljit_jump* jump;
+};
+
+struct SwitchErrorJump {
+    uint32_t instr_vram;
+    uint32_t jtbl_vram;
+    sljit_jump* jump;
+};
+
+struct N64Recomp::LiveGeneratorContext {
+    std::string function_name;
+    std::unordered_map<std::string, sljit_label*> labels;
+    std::unordered_map<std::string, std::vector<sljit_jump*>> pending_jumps;
+    std::vector<sljit_label*> func_labels;
+    std::vector<InnerCall> inner_calls;
+    std::vector<std::vector<std::string>> switch_jump_labels;
+    // See LiveGeneratorOutput::jump_tables for info. Contains sljit labels so they can be linked after recompilation.
+    std::vector<std::pair<std::vector<sljit_label*>, std::unique_ptr<void*[]>>> unlinked_jump_tables;
+    // Jump tables for the current function being recompiled.
+    std::vector<std::unique_ptr<void*[]>> pending_jump_tables;
+    // See LiveGeneratorOutput::reference_symbol_jumps for info.
+    std::vector<std::pair<ReferenceJumpDetails, sljit_jump*>> reference_symbol_jumps;
+    // See LiveGeneratorOutput::import_jumps_by_index for info.
+    std::unordered_multimap<size_t, sljit_jump*> import_jumps_by_index;
+    std::vector<SwitchErrorJump> switch_error_jumps;
+    sljit_jump* cur_branch_jump;
+};
+
+N64Recomp::LiveGenerator::LiveGenerator(size_t num_funcs, const LiveGeneratorInputs& inputs) : inputs(inputs) {
+    compiler = sljit_create_compiler(nullptr);
+    context = std::make_unique<LiveGeneratorContext>();
+    context->func_labels.resize(num_funcs);
+    errored = false;
+}
+
+N64Recomp::LiveGenerator::~LiveGenerator() {
+    if (compiler != nullptr) {
+        sljit_free_compiler(compiler);
+        compiler = nullptr;
+    }
+}
+
+N64Recomp::LiveGeneratorOutput N64Recomp::LiveGenerator::finish() {
+    LiveGeneratorOutput ret{};
+    if (errored) {
+        ret.good = false;
+        return ret;
+    }
+
+    ret.good = true;
+
+    // Populate all the pending inner function calls.
+    for (const InnerCall& call : context->inner_calls) {
+        sljit_label* target_func_label = context->func_labels[call.target_func_index];
+
+        // Generation isn't valid if the target function wasn't recompiled.
+        if (target_func_label == nullptr) {
+            return { };
+        }
+
+        sljit_set_label(call.jump, target_func_label);
+    }
+
+    // Generate the switch error jump targets and assign the jump labels.
+    if (!context->switch_error_jumps.empty()) {
+        // Allocate the function name and place it in the literals.
+        char* func_name = new char[context->function_name.size() + 1];
+        memcpy(func_name, context->function_name.c_str(), context->function_name.size());
+        func_name[context->function_name.size()] = '\x00';
+        ret.string_literals.emplace_back(func_name);
+
+        std::vector<sljit_jump*> switch_error_return_jumps{};
+        switch_error_return_jumps.resize(context->switch_error_jumps.size());
+
+        // Generate and assign the labels for the switch error jumps.
+        for (size_t i = 0; i < context->switch_error_jumps.size(); i++) {
+            const auto& cur_error_jump = context->switch_error_jumps[i];
+
+            // Generate a label and assign it to the jump.
+            sljit_set_label(cur_error_jump.jump, sljit_emit_label(compiler));
+
+            // Load the arguments (function name, vram, jump table address)
+            sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, SLJIT_IMM, sljit_sw(func_name));
+            sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R1, 0, SLJIT_IMM, sljit_sw(cur_error_jump.instr_vram));
+            sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, sljit_sw(cur_error_jump.jtbl_vram));
+
+            // Call switch_error.
+            sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, 32, 32), SLJIT_IMM, sljit_sw(inputs.switch_error));
+
+            // Jump to the return statement.
+            switch_error_return_jumps[i] = sljit_emit_jump(compiler, SLJIT_JUMP);
+        }
+
+        // Generate the return statement.
+        sljit_label* return_label = sljit_emit_label(compiler);
+        sljit_emit_return_void(compiler);
+
+        // Assign the label for all the return jumps.
+        for (sljit_jump* cur_jump : switch_error_return_jumps) {
+            sljit_set_label(cur_jump, return_label);
+        }
+    }
+    context->switch_error_jumps.clear();
+
+    // Generate the code.
+    ret.code = sljit_generate_code(compiler, 0, NULL);
+    ret.code_size = sljit_get_generated_code_size(compiler);
+    ret.functions.resize(context->func_labels.size());
+
+    // Get the function addresses.
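+    // (Label addresses only become valid once sljit_generate_code has run, which is why they're read here instead of at emit time.)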
+    for (size_t func_index = 0; func_index < ret.functions.size(); func_index++) {
+        sljit_label* func_label = context->func_labels[func_index];
+
+        // If the function wasn't recompiled, don't populate its address.
+        if (func_label != nullptr) {
+            ret.functions[func_index] = reinterpret_cast<recomp_func_t*>(sljit_get_label_addr(func_label));
+        }
+    }
+    context->func_labels.clear();
+
+    // Get the reference symbol jump instruction addresses.
+    ret.reference_symbol_jumps.resize(context->reference_symbol_jumps.size());
+    for (size_t jump_index = 0; jump_index < context->reference_symbol_jumps.size(); jump_index++) {
+        ReferenceJumpDetails& details = context->reference_symbol_jumps[jump_index].first;
+        sljit_jump* jump = context->reference_symbol_jumps[jump_index].second;
+
+        ret.reference_symbol_jumps[jump_index].first = details;
+        ret.reference_symbol_jumps[jump_index].second = reinterpret_cast<void*>(jump->addr);
+    }
+    context->reference_symbol_jumps.clear();
+
+    // Get the import jump instruction addresses.
+    ret.import_jumps_by_index.reserve(context->import_jumps_by_index.size());
+    for (auto& [jump_index, jump] : context->import_jumps_by_index) {
+        ret.import_jumps_by_index.emplace(jump_index, reinterpret_cast<void*>(jump->addr));
+    }
+    context->import_jumps_by_index.clear();
+
+    // Populate label addresses for the jump tables and place them in the output.
+    for (auto& [labels, jump_table] : context->unlinked_jump_tables) {
+        for (size_t entry_index = 0; entry_index < labels.size(); entry_index++) {
+            sljit_label* cur_label = labels[entry_index];
+            jump_table[entry_index] = reinterpret_cast<void*>(sljit_get_label_addr(cur_label));
+        }
+        ret.jump_tables.emplace_back(std::move(jump_table));
+    }
+    context->unlinked_jump_tables.clear();
+
+    ret.executable_offset = sljit_get_executable_offset(compiler);
+
+    sljit_free_compiler(compiler);
+    compiler = nullptr;
+    errored = false;
+
+    return ret;
+}
+
+N64Recomp::LiveGeneratorOutput::~LiveGeneratorOutput() {
+    if (code != nullptr) {
+        sljit_free_code(code, nullptr);
+        code = nullptr;
+    }
+}
+
+size_t N64Recomp::LiveGeneratorOutput::num_reference_symbol_jumps() const {
+    return reference_symbol_jumps.size();
+}
+
+void N64Recomp::LiveGeneratorOutput::set_reference_symbol_jump(size_t jump_index, recomp_func_t* func) {
+    const auto& jump_entry = reference_symbol_jumps[jump_index];
+    sljit_set_jump_addr(reinterpret_cast<sljit_uw>(jump_entry.second), reinterpret_cast<sljit_uw>(func), executable_offset);
+}
+
+N64Recomp::ReferenceJumpDetails N64Recomp::LiveGeneratorOutput::get_reference_symbol_jump_details(size_t jump_index) {
+    return reference_symbol_jumps[jump_index].first;
+}
+
+void N64Recomp::LiveGeneratorOutput::populate_import_symbol_jumps(size_t import_index, recomp_func_t* func) {
+    auto find_range = import_jumps_by_index.equal_range(import_index);
+    for (auto it = find_range.first; it != find_range.second; ++it) {
+        sljit_set_jump_addr(reinterpret_cast<sljit_uw>(it->second), reinterpret_cast<sljit_uw>(func), executable_offset);
+    }
+}
+
+constexpr int get_gpr_context_offset(int gpr_index) {
+    return offsetof(recomp_context, r0) + sizeof(recomp_context::r0) * gpr_index;
+}
+
+constexpr int get_fpr_single_context_offset(int fpr_index) {
+    return offsetof(recomp_context, f0.fl) + sizeof(recomp_context::f0) * fpr_index;
+}
+
+constexpr int get_fpr_double_context_offset(int fpr_index) {
+    return offsetof(recomp_context, f0.d) + sizeof(recomp_context::f0) * fpr_index;
+}
+
+constexpr int get_fpr_u32l_context_offset(int fpr_index) {
+    if (fpr_index & 1) {
+        // TODO implement odd floats.
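+        // (When FR is 0, the odd single-precision FPRs alias the upper halves of the even doubles, which is why the
+        // commented-out reference below points at a separate f_odd array rather than a normal context slot.)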
+        assert(false);
+        return -1;
+        // return fmt::format("ctx->f_odd[({} - 1) * 2]", fpr_index);
+    }
+    else {
+        return offsetof(recomp_context, f0.u32l) + sizeof(recomp_context::f0) * fpr_index;
+    }
+}
+
+constexpr int get_fpr_u64_context_offset(int fpr_index) {
+    return offsetof(recomp_context, f0.u64) + sizeof(recomp_context::f0) * fpr_index;
+}
+
+void get_gpr_values(int gpr, sljit_sw& out, sljit_sw& outw) {
+    if (gpr == 0) {
+        out = SLJIT_IMM;
+        outw = 0;
+    }
+    else {
+        out = SLJIT_MEM1(Registers::ctx);
+        outw = get_gpr_context_offset(gpr);
+    }
+}
+
+bool get_operand_values(N64Recomp::Operand operand, const N64Recomp::InstructionContext& context, sljit_sw& out, sljit_sw& outw) {
+    using namespace N64Recomp;
+
+    switch (operand) {
+        case Operand::Rd:
+            get_gpr_values(context.rd, out, outw);
+            break;
+        case Operand::Rs:
+            get_gpr_values(context.rs, out, outw);
+            break;
+        case Operand::Rt:
+            get_gpr_values(context.rt, out, outw);
+            break;
+        case Operand::Fd:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_single_context_offset(context.fd);
+            break;
+        case Operand::Fs:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_single_context_offset(context.fs);
+            break;
+        case Operand::Ft:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_single_context_offset(context.ft);
+            break;
+        case Operand::FdDouble:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_double_context_offset(context.fd);
+            break;
+        case Operand::FsDouble:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_double_context_offset(context.fs);
+            break;
+        case Operand::FtDouble:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_double_context_offset(context.ft);
+            break;
+        case Operand::FdU32L:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u32l_context_offset(context.fd);
+            break;
+        case Operand::FsU32L:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u32l_context_offset(context.fs);
+            break;
+        case Operand::FtU32L:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u32l_context_offset(context.ft);
+            break;
+        case Operand::FdU32H:
+            assert(false);
+            return false;
+        case Operand::FsU32H:
+            assert(false);
+            return false;
+        case Operand::FtU32H:
+            assert(false);
+            return false;
+        case Operand::FdU64:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u64_context_offset(context.fd);
+            break;
+        case Operand::FsU64:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u64_context_offset(context.fs);
+            break;
+        case Operand::FtU64:
+            out = SLJIT_MEM1(Registers::ctx);
+            outw = get_fpr_u64_context_offset(context.ft);
+            break;
+        case Operand::ImmU16:
+            out = SLJIT_IMM;
+            outw = (sljit_sw)(uint16_t)context.imm16;
+            break;
+        case Operand::ImmS16:
+            out = SLJIT_IMM;
+            outw = (sljit_sw)(int16_t)context.imm16;
+            break;
+        case Operand::Sa:
+            out = SLJIT_IMM;
+            outw = context.sa;
+            break;
+        case Operand::Sa32:
+            out = SLJIT_IMM;
+            outw = context.sa + 32;
+            break;
+        case Operand::Cop1cs:
+            out = Registers::c1cs;
+            outw = 0;
+            break;
+        case Operand::Hi:
+            out = Registers::hi;
+            outw = 0;
+            break;
+        case Operand::Lo:
+            out = Registers::lo;
+            outw = 0;
+            break;
+        case Operand::Zero:
+            out = SLJIT_IMM;
+            outw = 0;
+            break;
+    }
+    return true;
+}
+
+bool outputs_to_zero(N64Recomp::Operand output, const N64Recomp::InstructionContext& ctx) {
+    if (output == N64Recomp::Operand::Rd && ctx.rd == 0) {
+        return true;
+    }
+    if (output == N64Recomp::Operand::Rt && ctx.rt == 0) {
+        return true;
+    }
+    if (output == N64Recomp::Operand::Rs && ctx.rs == 0) {
+        return true;
+    }
+    return false;
+}
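+// Note: get_gpr_values turns reads of $zero into an SLJIT_IMM 0 and outputs_to_zero lets callers skip writes to it,
+// matching the hardwired zero register on MIPS.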
+void N64Recomp::LiveGenerator::process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const {
+    // Skip instructions that output to $zero
+    if (outputs_to_zero(op.output, ctx)) {
+        return;
+    }
+
+    sljit_sw dst;
+    sljit_sw dstw;
+    sljit_sw src1;
+    sljit_sw src1w;
+    sljit_sw src2;
+    sljit_sw src2w;
+    bool output_good = get_operand_values(op.output, ctx, dst, dstw);
+    bool input0_good = get_operand_values(op.operands.operands[0], ctx, src1, src1w);
+    bool input1_good = get_operand_values(op.operands.operands[1], ctx, src2, src2w);
+
+    if (!output_good || !input0_good || !input1_good) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // If a relocation is present, perform the relocation and change src2/src2w to use the relocated value.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE) {
+        // Only allow LO16 relocations.
+        if (ctx.reloc_type != RelocType::R_MIPS_LO16) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Only allow relocations on immediates.
+        if (src2 != SLJIT_IMM) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Only allow relocations on loads and adds.
+        switch (op.type) {
+            case BinaryOpType::LD:
+            case BinaryOpType::LW:
+            case BinaryOpType::LWU:
+            case BinaryOpType::LH:
+            case BinaryOpType::LHU:
+            case BinaryOpType::LB:
+            case BinaryOpType::LBU:
+            case BinaryOpType::LDL:
+            case BinaryOpType::LDR:
+            case BinaryOpType::LWL:
+            case BinaryOpType::LWR:
+            case BinaryOpType::Add64:
+            case BinaryOpType::Add32:
+                break;
+            default:
+                // Relocations aren't allowed on this instruction.
+                assert(false);
+                errored = true;
+                return;
+        }
+        // Load the relocated address into temp1.
+        load_relocated_address(ctx, Registers::arithmetic_temp1);
+        // Extract the LO16 value from the full address (sign extended lower 16 bits).
+        sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        // Replace the immediate input (src2) with the LO16 value.
+        src2 = Registers::arithmetic_temp1;
+        src2w = 0;
+    }
+
+    // TODO validate that the unary ops are valid for the current binary op.
+    if (op.operands.operand_operations[0] != UnaryOpType::None &&
+        op.operands.operand_operations[0] != UnaryOpType::ToU64 &&
+        op.operands.operand_operations[0] != UnaryOpType::ToS64 &&
+        op.operands.operand_operations[0] != UnaryOpType::ToU32)
+    {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    if (op.operands.operand_operations[1] != UnaryOpType::None &&
+        op.operands.operand_operations[1] != UnaryOpType::ToU64 &&
+        op.operands.operand_operations[1] != UnaryOpType::ToS64 &&
+        op.operands.operand_operations[1] != UnaryOpType::Mask5 && // Only for 32-bit shifts
+        op.operands.operand_operations[1] != UnaryOpType::Mask6) // Only for 64-bit shifts
+    {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    bool cmp_unsigned = op.operands.operand_operations[0] != UnaryOpType::ToS64;
+
+    auto sign_extend_and_store = [dst, dstw, this]() {
+        // Sign extend the result.
+        sljit_emit_op1(this->compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        // Store the result back into the context.
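+        // (32-bit results are kept sign-extended in the 64-bit context slots, matching MIPS's 64-bit GPR behavior.)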
+        sljit_emit_op1(this->compiler, SLJIT_MOV_P, dst, dstw, Registers::arithmetic_temp1, 0);
+    };
+
+    auto do_op32 = [src1, src1w, src2, src2w, this, &sign_extend_and_store](sljit_s32 op) {
+        sljit_emit_op2(this->compiler, op, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+        sign_extend_and_store();
+    };
+
+    auto do_op64 = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op) {
+        sljit_emit_op2(this->compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    };
+
+    auto do_float_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op) {
+        sljit_emit_fop2(this->compiler, op, dst, dstw, src1, src1w, src2, src2w);
+    };
+
+    auto do_load_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op, int address_xor) {
+        // TODO 0 immediate optimization.
+
+        // Add the base and immediate into the arithmetic temp.
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+
+        if (address_xor != 0) {
+            // xor the address with the specified amount
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, address_xor);
+        }
+
+        // Load the value at rdram + address into the arithmetic temp with the given operation to allow for sign-extension or zero-extension.
+        sljit_emit_op1(compiler, op, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0);
+
+        // Move the arithmetic temp into the destination.
+        sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, Registers::arithmetic_temp1, 0);
+    };
+
+    auto do_compare_op = [cmp_unsigned, dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 op_unsigned, sljit_s32 op_signed) {
+        // Pick the operation based on the signedness of the comparison.
+        sljit_s32 op = cmp_unsigned ? op_unsigned : op_signed;
+
+        // Pick the flags to set based on the operation.
+        sljit_s32 flags;
+        if (op <= SLJIT_NOT_ZERO) {
+            flags = SLJIT_SET_Z;
+        }
+        else {
+            flags = SLJIT_SET(op);
+        }
+
+        // Perform a subtraction with the determined flag.
+        sljit_emit_op2u(compiler, SLJIT_SUB | flags, src1, src1w, src2, src2w);
+
+        // Move the operation's flag into the destination.
+        sljit_emit_op_flags(compiler, SLJIT_MOV, dst, dstw, op);
+    };
+
+    auto do_float_compare_op = [dst, dstw, src1, src1w, src2, src2w, this](sljit_s32 flag_op, sljit_s32 set_op, bool double_precision) {
+        // Pick the comparison operation based on the precision.
+        sljit_s32 compare_op = set_op | (double_precision ? SLJIT_CMP_F64 : SLJIT_CMP_F32);
+
+        // Perform the comparison with the determined operation.
+        // Float comparisons use fop1 and put the left hand side in dst.
+        sljit_emit_fop1(compiler, compare_op, src1, src1w, src2, src2w);
+
+        // Move the operation's flag into the destination.
+        sljit_emit_op_flags(compiler, SLJIT_MOV, dst, dstw, flag_op);
+    };
+
+    auto do_unaligned_load_op = [dst, dstw, src1, src1w, src2, src2w, this](bool left, bool doubleword) {
+        // TODO 0 immediate optimization.
+
+        // Determine the shift direction to use for calculating the mask and shifting the loaded value.
+        sljit_sw shift_op = left ? SLJIT_SHL : SLJIT_LSHR;
+        // Determine the operation's word size.
+        sljit_sw word_size = doubleword ? 8 : 4;
+
+        // Add the base and immediate into the temp1.
+        // addr = base + offset
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+
+        // Mask the address with the alignment mask to get the misalignment and put it in temp2.
+        // misalignment = addr & (word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, word_size - 1);
+
+        // Mask the address with ~alignment_mask to get the aligned address and put it in temp1.
+        // addr = addr & ~(word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, ~(word_size - 1));
+
+        // Load the word at rdram + aligned address into temp1 with sign-extension.
+        // loaded_value = *addr
+        if (doubleword) {
+            // Rotate the loaded doubleword by 32 bits to swap the two words into the right order.
+            sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32);
+        }
+        else {
+            // Use MOV_S32 to sign-extend the loaded word.
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0);
+        }
+
+        // Invert the misalignment if this is a right load.
+        if (!left) {
+            // misalignment = word_size - 1 - misalignment
+            sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp2, 0, SLJIT_IMM, word_size - 1, Registers::arithmetic_temp2, 0);
+        }
+
+        // Calculate the misalignment shift and put it into temp2.
+        // misalignment_shift = misalignment * 8
+        sljit_emit_op2(compiler, SLJIT_SHL, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, 3);
+
+        // Calculate the misalignment mask and put it into temp3. Use a 32-bit shift if this is a 32-bit operation.
+        // misalignment_mask = word(-1) SHIFT misalignment_shift
+        sljit_emit_op2(compiler, doubleword ? shift_op : (shift_op | SLJIT_32),
+            Registers::arithmetic_temp3, 0,
+            SLJIT_IMM, doubleword ? uint64_t(-1) : uint32_t(-1),
+            Registers::arithmetic_temp2, 0);
+
+        if (!doubleword) {
+            // Sign extend the misalignment mask.
+            // misalignment_mask = (uint64_t)(int32_t)misalignment_mask
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0);
+        }
+
+        // Shift the loaded value by the misalignment shift and put it into temp1.
+        // loaded_value SHIFT misalignment_shift
+        sljit_emit_op2(compiler, shift_op, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0);
+
+        if (left && !doubleword) {
+            // Sign extend the loaded value.
+            // loaded_value = (uint64_t)(int32_t)loaded_value
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        }
+
+        // Mask the shifted loaded value by the misalignment mask.
+        // loaded_value &= misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp3, 0);
+
+        // Invert the misalignment mask and store it into temp3.
+        // misalignment_mask = ~misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, SLJIT_IMM, sljit_sw(-1));
+
+        // Mask the initial value (stored in the destination) with the misalignment mask and place it into temp3.
+        // masked_value = initial_value & misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp3, 0, dst, dstw, Registers::arithmetic_temp3, 0);
+
+        // Combine the masked initial value with the shifted loaded value and store it in the destination.
+        // out = masked_value | loaded_value
+        sljit_emit_op2(compiler, SLJIT_OR, dst, dstw, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp1, 0);
+    };
+
+    switch (op.type) {
+        // Addition/subtraction
+        case BinaryOpType::Add32:
+            do_op32(SLJIT_ADD32);
+            break;
+        case BinaryOpType::Sub32:
+            do_op32(SLJIT_SUB32);
+            break;
+        case BinaryOpType::Add64:
+            do_op64(SLJIT_ADD);
+            break;
+        case BinaryOpType::Sub64:
+            do_op64(SLJIT_SUB);
+            break;
+
+        // Float arithmetic
+        case BinaryOpType::AddFloat:
+            do_float_op(SLJIT_ADD_F32);
+            break;
+        case BinaryOpType::AddDouble:
+            do_float_op(SLJIT_ADD_F64);
+            break;
+        case BinaryOpType::SubFloat:
+            do_float_op(SLJIT_SUB_F32);
+            break;
+        case BinaryOpType::SubDouble:
+            do_float_op(SLJIT_SUB_F64);
+            break;
+        case BinaryOpType::MulFloat:
+            do_float_op(SLJIT_MUL_F32);
+            break;
+        case BinaryOpType::MulDouble:
+            do_float_op(SLJIT_MUL_F64);
+            break;
+        case BinaryOpType::DivFloat:
+            do_float_op(SLJIT_DIV_F32);
+            break;
+        case BinaryOpType::DivDouble:
+            do_float_op(SLJIT_DIV_F64);
+            break;
+
+        // Bitwise
+        case BinaryOpType::And64:
+            do_op64(SLJIT_AND);
+            break;
+        case BinaryOpType::Or64:
+            do_op64(SLJIT_OR);
+            break;
+        case BinaryOpType::Nor64:
+            // Bitwise or the two registers and move the result into the temp, then invert the result and move it into the destination.
+            sljit_emit_op2(this->compiler, SLJIT_OR, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+            sljit_emit_op2(this->compiler, SLJIT_XOR, dst, dstw, Registers::arithmetic_temp1, 0, SLJIT_IMM, sljit_sw(-1));
+            break;
+        case BinaryOpType::Xor64:
+            do_op64(SLJIT_XOR);
+            break;
+        case BinaryOpType::Sll32:
+            // TODO only mask if the second input's op is Mask5.
+            do_op32(SLJIT_MSHL32);
+            break;
+        case BinaryOpType::Sll64:
+            // TODO only mask if the second input's op is Mask6.
+            do_op64(SLJIT_MSHL);
+            break;
+        case BinaryOpType::Srl32:
+            // TODO only mask if the second input's op is Mask5.
+            do_op32(SLJIT_MLSHR32);
+            break;
+        case BinaryOpType::Srl64:
+            // TODO only mask if the second input's op is Mask6.
+            do_op64(SLJIT_MLSHR);
+            break;
+        case BinaryOpType::Sra32:
+            // Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half.
+            // This means we have to use a 64-bit shift and manually mask the input before shifting.
+            // TODO only mask if the second input's op is Mask5.
+            sljit_emit_op2(this->compiler, SLJIT_AND32, Registers::arithmetic_temp1, 0, src2, src2w, SLJIT_IMM, 0b11111);
+            sljit_emit_op2(this->compiler, SLJIT_MASHR, Registers::arithmetic_temp1, 0, src1, src1w, Registers::arithmetic_temp1, 0);
+            sign_extend_and_store();
+            break;
+        case BinaryOpType::Sra64:
+            // TODO only mask if the second input's op is Mask6.
+            do_op64(SLJIT_MASHR);
+            break;
+
+        // Comparisons
+        case BinaryOpType::Equal:
+            do_compare_op(SLJIT_EQUAL, SLJIT_EQUAL);
+            break;
+        case BinaryOpType::NotEqual:
+            do_compare_op(SLJIT_NOT_EQUAL, SLJIT_NOT_EQUAL);
+            break;
+        case BinaryOpType::Less:
+            do_compare_op(SLJIT_LESS, SLJIT_SIG_LESS);
+            break;
+        case BinaryOpType::LessEq:
+            do_compare_op(SLJIT_LESS_EQUAL, SLJIT_SIG_LESS_EQUAL);
+            break;
+        case BinaryOpType::Greater:
+            do_compare_op(SLJIT_GREATER, SLJIT_SIG_GREATER);
+            break;
+        case BinaryOpType::GreaterEq:
+            do_compare_op(SLJIT_GREATER_EQUAL, SLJIT_SIG_GREATER_EQUAL);
+            break;
+        case BinaryOpType::EqualFloat:
+            do_float_compare_op(SLJIT_F_EQUAL, SLJIT_SET_F_EQUAL, false);
+            break;
+        case BinaryOpType::LessFloat:
+            do_float_compare_op(SLJIT_F_LESS, SLJIT_SET_F_LESS, false);
+            break;
+        case BinaryOpType::LessEqFloat:
+            do_float_compare_op(SLJIT_F_LESS_EQUAL, SLJIT_SET_F_LESS_EQUAL, false);
+            break;
+        case BinaryOpType::EqualDouble:
+            do_float_compare_op(SLJIT_F_EQUAL, SLJIT_SET_F_EQUAL, true);
+            break;
+        case BinaryOpType::LessDouble:
+            do_float_compare_op(SLJIT_F_LESS, SLJIT_SET_F_LESS, true);
+            break;
+        case BinaryOpType::LessEqDouble:
+            do_float_compare_op(SLJIT_F_LESS_EQUAL, SLJIT_SET_F_LESS_EQUAL, true);
+            break;
+
+        // Loads
+        case BinaryOpType::LD:
+            // Add the base and immediate into the arithmetic temp.
+            sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, src1, src1w, src2, src2w);
+
+            // Load the value at rdram + address into the arithmetic temp and rotate it by 32 bits to swap the two words into the right order.
+            sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32);
+
+            // Move the arithmetic temp into the destination.
+            sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, Registers::arithmetic_temp1, 0);
+            break;
+        case BinaryOpType::LW:
+            do_load_op(SLJIT_MOV_S32, 0);
+            break;
+        case BinaryOpType::LWU:
+            do_load_op(SLJIT_MOV_U32, 0);
+            break;
+        case BinaryOpType::LH:
+            do_load_op(SLJIT_MOV_S16, 2);
+            break;
+        case BinaryOpType::LHU:
+            do_load_op(SLJIT_MOV_U16, 2);
+            break;
+        case BinaryOpType::LB:
+            do_load_op(SLJIT_MOV_S8, 3);
+            break;
+        case BinaryOpType::LBU:
+            do_load_op(SLJIT_MOV_U8, 3);
+            break;
+        case BinaryOpType::LDL:
+            do_unaligned_load_op(true, true);
+            break;
+        case BinaryOpType::LDR:
+            do_unaligned_load_op(false, true);
+            break;
+        case BinaryOpType::LWL:
+            do_unaligned_load_op(true, false);
+            break;
+        case BinaryOpType::LWR:
+            do_unaligned_load_op(false, false);
+            break;
+        default:
+            assert(false);
+            errored = true;
+            return;
+    }
+}
+
+int32_t do_round_w_s(float num) {
+    return lroundf(num);
+}
+
+int32_t do_round_w_d(double num) {
+    return lround(num);
+}
+
+int64_t do_round_l_s(float num) {
+    return llroundf(num);
+}
+
+int64_t do_round_l_d(double num) {
+    return llround(num);
+}
+
+int32_t do_ceil_w_s(float num) {
+    return (int32_t)ceilf(num);
+}
+
+int32_t do_ceil_w_d(double num) {
+    return (int32_t)ceil(num);
+}
+
+int64_t do_ceil_l_s(float num) {
+    return (int64_t)ceilf(num);
+}
+
+int64_t do_ceil_l_d(double num) {
+    return (int64_t)ceil(num);
+}
+
+int32_t do_floor_w_s(float num) {
+    return (int32_t)floorf(num);
+}
+
+int32_t do_floor_w_d(double num) {
+    return (int32_t)floor(num);
+}
+
+int64_t do_floor_l_s(float num) {
+    return (int64_t)floorf(num);
+}
+
+int64_t do_floor_l_d(double num) {
+    return (int64_t)floor(num);
+}
+
+void N64Recomp::LiveGenerator::load_relocated_address(const InstructionContext& ctx, int reg) const {
+    // Get the pointer to the section address.
+    int32_t* section_addr_ptr = (ctx.reloc_tag_as_reference ? inputs.reference_section_addresses : inputs.local_section_addresses) + ctx.reloc_section_index;
+
+    // Load the section's address into the target register.
+    sljit_emit_op1(compiler, SLJIT_MOV_S32, reg, 0, SLJIT_MEM0(), sljit_sw(section_addr_ptr));
+
+    // Don't emit the add if the offset is zero (small optimization).
+    if (ctx.reloc_target_section_offset != 0) {
+        // Add the reloc section offset to the section's address and put the result in the target register.
+        sljit_emit_op2(compiler, SLJIT_ADD, reg, 0, reg, 0, SLJIT_IMM, ctx.reloc_target_section_offset);
+    }
+}
+
+void N64Recomp::LiveGenerator::process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const {
+    // Skip instructions that output to $zero
+    if (outputs_to_zero(op.output, ctx)) {
+        return;
+    }
+
+    sljit_sw dst;
+    sljit_sw dstw;
+    sljit_sw src;
+    sljit_sw srcw;
+    bool output_good = get_operand_values(op.output, ctx, dst, dstw);
+    bool input_good = get_operand_values(op.input, ctx, src, srcw);
+
+    if (!output_good || !input_good) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // If a relocation is needed for the input operand, perform the relocation and store the result directly.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE) {
+        // Only allow relocation of lui with an immediate.
+        if (op.operation != UnaryOpType::Lui || op.input != Operand::ImmU16) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Only allow HI16 relocs.
+        if (ctx.reloc_type != RelocType::R_MIPS_HI16) {
+            assert(false);
+            errored = true;
+            return;
+        }
+        // Load the relocated address into temp1.
+        load_relocated_address(ctx, Registers::arithmetic_temp1);
+
+        // HI16 reloc on a lui
+        // The 32-bit address (a) is equal to section address + section offset
+        // The 16-bit immediate is equal to (a - (int16_t)a) >> 16
+        // Therefore, the register should be set to (int32_t)(a - (int16_t)a) as the shifts cancel out and the lower 16 bits are zero.
+
+        // Extract a sign extended 16-bit value from the lower half of the relocated address and put it in temp2.
+        sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0);
+
+        // Subtract the sign extended 16-bit value from the full address to get the HI16 value and place it in the destination.
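+        // Worked example: for a = 0x80123A24, (int16_t)a is 0x3A24, so the register gets 0x80120000 (imm 0x8012);
+        // for a = 0x8012CA24, (int16_t)a is -0x35DC, so the register gets 0x80130000, compensating for the lo16 sign extension.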
+        sljit_emit_op2(compiler, SLJIT_SUB, dst, dstw, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0);
+        return;
+    }
+
+    sljit_s32 jit_op = SLJIT_BREAKPOINT;
+
+    bool float_op = false;
+    bool func_float_op = false;
+
+    auto emit_s_func = [this, src, srcw, dst, dstw, &func_float_op](float (*func)(float)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(F32, F32), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, dst, dstw, SLJIT_RETURN_FREG, 0);
+    };
+
+    auto emit_d_func = [this, src, srcw, dst, dstw, &func_float_op](double (*func)(double)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(F64, F64), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, dst, dstw, SLJIT_RETURN_FREG, 0);
+    };
+
+    auto emit_l_from_s_func = [this, src, srcw, dst, dstw, &func_float_op](int64_t (*func)(float)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, F32), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    auto emit_w_from_s_func = [this, src, srcw, dst, dstw, &func_float_op](int32_t (*func)(float)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F32, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(32, F32), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    auto emit_l_from_d_func = [this, src, srcw, dst, dstw, &func_float_op](int64_t (*func)(double)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, F64), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    auto emit_w_from_d_func = [this, src, srcw, dst, dstw, &func_float_op](int32_t (*func)(double)) {
+        func_float_op = true;
+
+        sljit_emit_fop1(compiler, SLJIT_MOV_F64, SLJIT_FR0, 0, src, srcw);
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(32, F64), SLJIT_IMM, sljit_sw(func));
+        sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0);
+    };
+
+    switch (op.operation) {
+        case UnaryOpType::Lui:
+            if (src != SLJIT_IMM) {
+                assert(false);
+                errored = true;
+                break;
+            }
+            src = SLJIT_IMM;
+            srcw = (sljit_sw)(int32_t)(srcw << 16);
+            jit_op = SLJIT_MOV;
+            break;
+        case UnaryOpType::NegateFloat:
+            jit_op = SLJIT_NEG_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::NegateDouble:
+            jit_op = SLJIT_NEG_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::AbsFloat:
+            jit_op = SLJIT_ABS_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::AbsDouble:
+            jit_op = SLJIT_ABS_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::SqrtFloat:
+            emit_s_func(sqrtf);
+            break;
+        case UnaryOpType::SqrtDouble:
+            emit_d_func(sqrt);
+            break;
+        case UnaryOpType::ConvertSFromW:
+            jit_op = SLJIT_CONV_F32_FROM_S32;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertWFromS:
+            emit_w_from_s_func(do_cvt_w_s);
+            break;
+        case UnaryOpType::ConvertDFromW:
+            jit_op = SLJIT_CONV_F64_FROM_S32;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertWFromD:
+            emit_w_from_d_func(do_cvt_w_d);
+            break;
+        case UnaryOpType::ConvertDFromS:
+            jit_op = SLJIT_CONV_F64_FROM_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertSFromD:
+            // SLJIT_CONV_F32_FROM_F64 uses the current rounding mode, just as CVT_S_D does.
+            jit_op = SLJIT_CONV_F32_FROM_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertDFromL:
+            jit_op = SLJIT_CONV_F64_FROM_SW;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertLFromD:
+            emit_l_from_d_func(do_cvt_l_d);
+            break;
+        case UnaryOpType::ConvertSFromL:
+            jit_op = SLJIT_CONV_F32_FROM_SW;
+            float_op = true;
+            break;
+        case UnaryOpType::ConvertLFromS:
+            emit_l_from_s_func(do_cvt_l_s);
+            break;
+        case UnaryOpType::TruncateWFromS:
+            // SLJIT_CONV_S32_FROM_F32 rounds towards zero, just as TRUNC_W_S does.
+            jit_op = SLJIT_CONV_S32_FROM_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::TruncateWFromD:
+            // SLJIT_CONV_S32_FROM_F64 rounds towards zero, just as TRUNC_W_D does.
+            jit_op = SLJIT_CONV_S32_FROM_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::TruncateLFromS:
+            // SLJIT_CONV_SW_FROM_F32 rounds towards zero, just as TRUNC_L_S does.
+            jit_op = SLJIT_CONV_SW_FROM_F32;
+            float_op = true;
+            break;
+        case UnaryOpType::TruncateLFromD:
+            // SLJIT_CONV_SW_FROM_F64 rounds towards zero, just as TRUNC_L_D does.
+            jit_op = SLJIT_CONV_SW_FROM_F64;
+            float_op = true;
+            break;
+        case UnaryOpType::RoundWFromS:
+            emit_w_from_s_func(do_round_w_s);
+            break;
+        case UnaryOpType::RoundWFromD:
+            emit_w_from_d_func(do_round_w_d);
+            break;
+        case UnaryOpType::RoundLFromS:
+            emit_l_from_s_func(do_round_l_s);
+            break;
+        case UnaryOpType::RoundLFromD:
+            emit_l_from_d_func(do_round_l_d);
+            break;
+        case UnaryOpType::CeilWFromS:
+            emit_w_from_s_func(do_ceil_w_s);
+            break;
+        case UnaryOpType::CeilWFromD:
+            emit_w_from_d_func(do_ceil_w_d);
+            break;
+        case UnaryOpType::CeilLFromS:
+            emit_l_from_s_func(do_ceil_l_s);
+            break;
+        case UnaryOpType::CeilLFromD:
+            emit_l_from_d_func(do_ceil_l_d);
+            break;
+        case UnaryOpType::FloorWFromS:
+            emit_w_from_s_func(do_floor_w_s);
+            break;
+        case UnaryOpType::FloorWFromD:
+            emit_w_from_d_func(do_floor_w_d);
+            break;
+        case UnaryOpType::FloorLFromS:
+            emit_l_from_s_func(do_floor_l_s);
+            break;
+        case UnaryOpType::FloorLFromD:
+            emit_l_from_d_func(do_floor_l_d);
+            break;
+        case UnaryOpType::None:
+            jit_op = SLJIT_MOV;
+            break;
+        case UnaryOpType::ToS32:
+        case UnaryOpType::ToInt32:
+            jit_op = SLJIT_MOV_S32;
+            break;
+        // Unary ops that can't be used as a standalone operation
+        case UnaryOpType::ToU32:
+        case UnaryOpType::ToS64:
+        case UnaryOpType::ToU64:
+        case UnaryOpType::Mask5:
+        case UnaryOpType::Mask6:
+            assert(false && "Unsupported unary op");
+            errored = true;
+            return;
+    }
+
+    if (func_float_op) {
+        // Already handled by the lambda.
+    }
+    else if (float_op) {
+        sljit_emit_fop1(compiler, jit_op, dst, dstw, src, srcw);
+    }
+    else {
+        sljit_emit_op1(compiler, jit_op, dst, dstw, src, srcw);
+    }
+}
+
+void N64Recomp::LiveGenerator::process_store_op(const StoreOp& op, const InstructionContext& ctx) const {
+    sljit_sw src;
+    sljit_sw srcw;
+    sljit_sw imm = (sljit_sw)(int16_t)ctx.imm16;
+
+    get_operand_values(op.value_input, ctx, src, srcw);
+
+    // Only LO16 relocs are valid on stores.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE && ctx.reloc_type != RelocType::R_MIPS_LO16) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    if (ctx.reloc_type == RelocType::R_MIPS_LO16) {
+        // Load the relocated address into temp1.
+        load_relocated_address(ctx, Registers::arithmetic_temp1);
+        // Extract the LO16 value from the full address (sign extended lower 16 bits).
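+        // (Together with the HI16 half emitted by the paired lui: (hi16 << 16) + (int16_t)lo16 equals the full relocated address.)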
+        sljit_emit_op1(compiler, SLJIT_MOV_S16, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+        // Add the base register (rs) to the LO16 immediate.
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(ctx.rs));
+    }
+    else {
+        // TODO 0 immediate optimization.
+
+        // Add the base register (rs) and the immediate to get the address and store it in the arithmetic temp.
+        sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(ctx.rs), SLJIT_IMM, imm);
+    }
+
+    auto do_unaligned_store_op = [src, srcw, this](bool left, bool doubleword) {
+        // Determine the shift direction to use for calculating the mask and shifting the input value.
+        sljit_sw shift_op = left ? SLJIT_LSHR : SLJIT_SHL;
+        // Determine the operation's word size.
+        sljit_sw word_size = doubleword ? 8 : 4;
+
+        // Mask the address with the alignment mask to get the misalignment and put it in temp2.
+        // misalignment = addr & (word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, word_size - 1);
+
+        // Mask the address with ~alignment_mask to get the aligned address and put it in temp1.
+        // addr = addr & ~(word_size - 1)
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, ~(word_size - 1));
+
+        // Load the word at rdram + aligned address into temp3 with sign-extension.
+        // loaded_value = *addr
+        if (doubleword) {
+            // Rotate the loaded doubleword by 32 bits to swap the two words into the right order.
+            sljit_emit_op2(compiler, SLJIT_ROTL, Registers::arithmetic_temp3, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, SLJIT_IMM, 32);
+        }
+        else {
+            // Use MOV_S32 to sign-extend the loaded word.
+            sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::arithmetic_temp3, 0, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0);
+        }
+
+        // Invert the misalignment if this is a right store.
+        if (!left) {
+            // misalignment = word_size - 1 - misalignment
+            sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp2, 0, SLJIT_IMM, word_size - 1, Registers::arithmetic_temp2, 0);
+        }
+
+        // Calculate the misalignment shift and put it into temp2.
+        // misalignment_shift = misalignment * 8
+        sljit_emit_op2(compiler, SLJIT_SHL, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, 3);
+
+        // Shift the input value by the misalignment shift and put it into temp4.
+        // input_value SHIFT= misalignment_shift
+        sljit_emit_op2(compiler, shift_op, Registers::arithmetic_temp4, 0, src, srcw, Registers::arithmetic_temp2, 0);
+
+        // Calculate the misalignment mask and put it into temp2. Use a 32-bit shift if this is a 32-bit operation.
+        // misalignment_mask = word(-1) SHIFT misalignment_shift
+        sljit_emit_op2(compiler, doubleword ? shift_op : (shift_op | SLJIT_32),
+            Registers::arithmetic_temp2, 0,
+            SLJIT_IMM, doubleword ? uint64_t(-1) : uint32_t(-1),
+            Registers::arithmetic_temp2, 0);
+
+        // Mask the input value with the misalignment mask and place it into temp4.
+        // masked_value = shifted_value & misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp2, 0);
+
+        // Invert the misalignment mask and store it into temp2.
+        // misalignment_mask = ~misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp2, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, sljit_sw(-1));
+
+        // Mask the loaded value by the misalignment mask.
+        // loaded_value &= misalignment_mask
+        sljit_emit_op2(compiler, SLJIT_AND, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp2, 0);
+
+        // Combine the masked loaded value with the shifted input value and store it at the target address.
+        // out = masked_value | loaded_value
+        if (doubleword) {
+            // Combine the values into a temp so that it can be rotated to the correct word order.
+            sljit_emit_op2(compiler, SLJIT_OR, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp3, 0);
+            sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, Registers::arithmetic_temp4, 0, SLJIT_IMM, 32);
+        }
+        else {
+            sljit_emit_op2(compiler, SLJIT_OR32, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp3, 0);
+        }
+    };
+
+    switch (op.type) {
+        case StoreOpType::SD:
+        case StoreOpType::SDC1:
+            // Rotate the input value by 32 bits to swap the words and store it at address + rdram.
+            sljit_emit_op2(compiler, SLJIT_ROTL, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw, SLJIT_IMM, 32);
+            break;
+        case StoreOpType::SDL:
+            do_unaligned_store_op(true, true);
+            break;
+        case StoreOpType::SDR:
+            do_unaligned_store_op(false, true);
+            break;
+        case StoreOpType::SW:
+        case StoreOpType::SWC1:
+            // store the 32-bit value at address + rdram
+            sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw);
+            break;
+        case StoreOpType::SWL:
+            do_unaligned_store_op(true, false);
+            break;
+        case StoreOpType::SWR:
+            do_unaligned_store_op(false, false);
+            break;
+        case StoreOpType::SH:
+            // xor the address with 2
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, 2);
+            // store the 16-bit value at address + rdram
+            sljit_emit_op1(compiler, SLJIT_MOV_U16, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw);
+            break;
+        case StoreOpType::SB:
+            // xor the address with 3
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, 3);
+            // store the 8-bit value at address + rdram
+            sljit_emit_op1(compiler, SLJIT_MOV_U8, SLJIT_MEM2(Registers::rdram, Registers::arithmetic_temp1), 0, src, srcw);
+            break;
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_function_start(const std::string& function_name, size_t func_index) const {
+    context->function_name = function_name;
+    context->func_labels[func_index] = sljit_emit_label(compiler);
+    // sljit_emit_op0(compiler, SLJIT_BREAKPOINT);
+    sljit_emit_enter(compiler, 0, SLJIT_ARGS2V(P, P), 4 | SLJIT_ENTER_FLOAT(1), 5 | SLJIT_ENTER_FLOAT(0), 0);
+    sljit_emit_op2(compiler, SLJIT_SUB, Registers::rdram, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+}
+
+void N64Recomp::LiveGenerator::emit_function_end() const {
+    // Check that all jumps have been paired to a label.
+    if (!context->pending_jumps.empty()) {
+        assert(false);
+        errored = true;
+    }
+
+    // Populate the labels for pending switches and move them into the unlinked jump tables.
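+    // (The entry addresses themselves can't be known yet; finish() fills the tables in with sljit_get_label_addr after code generation.)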
+    bool invalid_switch = false;
+    for (size_t switch_index = 0; switch_index < context->switch_jump_labels.size(); switch_index++) {
+        const std::vector<std::string>& cur_labels = context->switch_jump_labels[switch_index];
+        std::vector<sljit_label*> cur_label_addrs{};
+        cur_label_addrs.resize(cur_labels.size());
+        for (size_t case_index = 0; case_index < cur_labels.size(); case_index++) {
+            // Find the label.
+            auto find_it = context->labels.find(cur_labels[case_index]);
+            if (find_it == context->labels.end()) {
+                // Label not found, invalid switch.
+                // Track this in a variable instead of returning immediately so that the pending labels are still cleared.
+                invalid_switch = true;
+                break;
+            }
+            cur_label_addrs[case_index] = find_it->second;
+        }
+        context->unlinked_jump_tables.emplace_back(
+            std::make_pair<std::vector<sljit_label*>, std::unique_ptr<void*[]>>(
+                std::move(cur_label_addrs),
+                std::move(context->pending_jump_tables[switch_index])
+            )
+        );
+    }
+    context->switch_jump_labels.clear();
+    context->pending_jump_tables.clear();
+
+    // Clear the labels to prevent labels from one function being jumped to by another.
+    context->labels.clear();
+
+    if (invalid_switch) {
+        assert(false);
+        errored = true;
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_function_call_lookup(uint32_t addr) const {
+    // Load the address immediate into the first argument.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, int32_t(addr));
+
+    // Call get_function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, 32), SLJIT_IMM, sljit_sw(inputs.get_function));
+
+    // Copy the return value into R2 so that it can be used for icall
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_R0, 0);
+
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+
+    // Call the function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P), SLJIT_R2, 0);
+}
+
+void N64Recomp::LiveGenerator::emit_function_call_by_register(int reg) const {
+    // Load the register's value into the first argument.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg));
+
+    // Call get_function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(P, 32), SLJIT_IMM, sljit_sw(inputs.get_function));
+
+    // Copy the return value into R2 so that it can be used for icall
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R2, 0, SLJIT_R0, 0);
+
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+
+    // Call the function.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P), SLJIT_R2, 0);
+}
+
+void N64Recomp::LiveGenerator::emit_function_call_reference_symbol(const Context&, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const {
+    (void)symbol_index;
+
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // sljit_emit_op0(compiler, SLJIT_BREAKPOINT);
+    // Call the function and save the jump to set its label later on.
+    sljit_jump* call_jump = sljit_emit_call(compiler, SLJIT_CALL | SLJIT_REWRITABLE_JUMP, SLJIT_ARGS2V(P, P));
+    // Set a dummy jump value, this will get replaced during reference/import symbol jump population.
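+    // (SLJIT_REWRITABLE_JUMP reserves a patchable target for this call, which is what allows sljit_set_jump_addr to redirect it later.)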
+    if (section_index == N64Recomp::SectionImport) {
+        sljit_set_target(call_jump, sljit_uw(-1));
+        context->import_jumps_by_index.emplace(symbol_index, call_jump);
+    }
+    else {
+        sljit_set_target(call_jump, sljit_uw(-2));
+        context->reference_symbol_jumps.emplace_back(std::make_pair(
+            ReferenceJumpDetails{
+                .section = section_index,
+                .section_offset = target_section_offset
+            },
+            call_jump
+        ));
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_function_call(const Context&, size_t function_index) const {
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // Call the function and save the jump to set its label later on.
+    sljit_jump* call_jump = sljit_emit_call(compiler, SLJIT_CALL, SLJIT_ARGS2V(P, P));
+    context->inner_calls.emplace_back(InnerCall{ .target_func_index = function_index, .jump = call_jump });
+}
+
+void N64Recomp::LiveGenerator::emit_named_function_call(const std::string& function_name) const {
+    // The live recompiler can't call functions by name. This is only used for statics, so it's not an issue.
+    assert(false);
+    errored = true;
+}
+
+void N64Recomp::LiveGenerator::emit_goto(const std::string& target) const {
+    sljit_jump* jump = sljit_emit_jump(compiler, SLJIT_JUMP);
+    // Check if the label already exists.
+    auto find_it = context->labels.find(target);
+    if (find_it != context->labels.end()) {
+        sljit_set_label(jump, find_it->second);
+    }
+    // It doesn't, so queue this as a pending jump to be resolved later.
+    else {
+        context->pending_jumps[target].push_back(jump);
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_label(const std::string& label_name) const {
+    sljit_label* label = sljit_emit_label(compiler);
+
+    // Check if there are any pending jumps for this label and assign them if so.
+    auto find_it = context->pending_jumps.find(label_name);
+    if (find_it != context->pending_jumps.end()) {
+        for (sljit_jump* jump : find_it->second) {
+            sljit_set_label(jump, label);
+        }
+
+        // Remove the pending jumps for this label.
+        context->pending_jumps.erase(find_it);
+    }
+
+    context->labels.emplace(label_name, label);
+}
+
+void N64Recomp::LiveGenerator::emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const {
+    (void)jtbl;
+    (void)reg;
+    // Nothing to do here, the live recompiler performs a subtraction to get the switch's case.
+}
+
+void N64Recomp::LiveGenerator::emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const {
+    // Make sure there's no pending jump.
+    if (context->cur_branch_jump != nullptr) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // Branch conditions do not allow unary ops, except for ToS64 on the first operand to indicate the branch comparison is signed.
+    if (op.operands.operand_operations[0] != UnaryOpType::None && op.operands.operand_operations[0] != UnaryOpType::ToS64) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    if (op.operands.operand_operations[1] != UnaryOpType::None) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    sljit_s32 condition_type;
+    bool cmp_signed = op.operands.operand_operations[0] == UnaryOpType::ToS64;
+    // Comparisons need to be inverted to account for the fact that the generator is expected to generate a code block that only runs if
+    // the condition is met, meaning the branch should be taken if the condition isn't met.
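+    // For example, a MIPS beq becomes an SLJIT_NOT_EQUAL compare-jump that skips over the recompiled branch body.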
+    switch (op.comparison) {
+        case BinaryOpType::Equal:
+            condition_type = SLJIT_NOT_EQUAL;
+            break;
+        case BinaryOpType::NotEqual:
+            condition_type = SLJIT_EQUAL;
+            break;
+        case BinaryOpType::GreaterEq:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_LESS;
+            }
+            else {
+                condition_type = SLJIT_LESS;
+            }
+            break;
+        case BinaryOpType::Greater:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_LESS_EQUAL;
+            }
+            else {
+                condition_type = SLJIT_LESS_EQUAL;
+            }
+            break;
+        case BinaryOpType::LessEq:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_GREATER;
+            }
+            else {
+                condition_type = SLJIT_GREATER;
+            }
+            break;
+        case BinaryOpType::Less:
+            if (cmp_signed) {
+                condition_type = SLJIT_SIG_GREATER_EQUAL;
+            }
+            else {
+                condition_type = SLJIT_GREATER_EQUAL;
+            }
+            break;
+        default:
+            assert(false && "Invalid branch condition comparison operation!");
+            errored = true;
+            return;
+    }
+    sljit_sw src1;
+    sljit_sw src1w;
+    sljit_sw src2;
+    sljit_sw src2w;
+
+    get_operand_values(op.operands.operands[0], ctx, src1, src1w);
+    get_operand_values(op.operands.operands[1], ctx, src2, src2w);
+
+    // Relocations aren't valid on conditional branches.
+    if (ctx.reloc_type != RelocType::R_MIPS_NONE) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // Create a compare jump and track it as the pending branch jump.
+    context->cur_branch_jump = sljit_emit_cmp(compiler, condition_type, src1, src1w, src2, src2w);
+}
+
+void N64Recomp::LiveGenerator::emit_branch_close() const {
+    // Make sure there's a pending branch jump.
+    if (context->cur_branch_jump == nullptr) {
+        assert(false);
+        errored = true;
+        return;
+    }
+
+    // Assign a label at this point to the pending branch jump and clear it.
+    sljit_set_label(context->cur_branch_jump, sljit_emit_label(compiler));
+    context->cur_branch_jump = nullptr;
+}
+
+void N64Recomp::LiveGenerator::emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const {
+    // Populate the switch's labels.
+    std::vector<std::string> cur_labels{};
+    cur_labels.resize(jtbl.entries.size());
+    for (size_t i = 0; i < cur_labels.size(); i++) {
+        cur_labels[i] = fmt::format("L_{:08X}", jtbl.entries[i]);
+    }
+    context->switch_jump_labels.emplace_back(std::move(cur_labels));
+
+    // Allocate the jump table.
+    std::unique_ptr<void*[]> cur_jump_table = std::make_unique<void*[]>(jtbl.entries.size());
+
+    /// Codegen
+
+    // Load the jump target register. The lw instruction was patched into an addiu, so this holds
+    // the address of the jump table entry instead of the actual jump target.
+    sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp1, 0, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg));
+    // Subtract the jump table's address from the jump target to get the jump table addend.
+    // Sign extend the jump table address to 64 bits so that the entire register's contents are used instead of just the lower 32 bits.
+    const auto& jtbl_section = recompiler_context.sections[jtbl.section_index];
+    if (jtbl_section.relocatable) {
+        // Make a dummy instruction context to pass to `load_relocated_address`.
+        InstructionContext dummy_context{};
+
+        // Get the relocated address of the jump table.
+        uint32_t section_offset = jtbl.vram - jtbl_section.ram_addr;
+
+        // Populate the necessary fields of the dummy context and load the relocated address into temp2.
+        dummy_context.reloc_section_index = jtbl.section_index;
+        dummy_context.reloc_target_section_offset = section_offset;
+        load_relocated_address(dummy_context, Registers::arithmetic_temp2);
+
+        // Subtract the relocated jump table start address from the loaded address.
+        sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp2, 0);
+    }
+    else {
+        sljit_emit_op2(compiler, SLJIT_SUB, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, (sljit_sw)((int32_t)jtbl.vram));
+    }
+
+    // Bounds check the addend. If it's greater than or equal to the jump table size (entries * sizeof(u32)) then jump to the switch error.
+    sljit_jump* switch_error_jump = sljit_emit_cmp(compiler, SLJIT_GREATER_EQUAL, Registers::arithmetic_temp1, 0, SLJIT_IMM, jtbl.entries.size() * sizeof(uint32_t));
+    context->switch_error_jumps.emplace_back(SwitchErrorJump{.instr_vram = jtbl.jr_vram, .jtbl_vram = jtbl.vram, .jump = switch_error_jump});
+
+    // Multiply the jump table addend by 2 to get the addend for the real jump table. (4 bytes per entry to 8 bytes per entry).
+    sljit_emit_op2(compiler, SLJIT_ADD, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0, Registers::arithmetic_temp1, 0);
+    // Load the real jump table address.
+    sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp2, 0, SLJIT_IMM, (sljit_sw)cur_jump_table.get());
+    // Load the real jump entry.
+    sljit_emit_op1(compiler, SLJIT_MOV, Registers::arithmetic_temp1, 0, SLJIT_MEM2(Registers::arithmetic_temp1, Registers::arithmetic_temp2), 0);
+    // Jump to the loaded entry.
+    sljit_emit_ijump(compiler, SLJIT_JUMP, Registers::arithmetic_temp1, 0);
+
+    // Move the jump table into the pending jump tables.
+    context->pending_jump_tables.emplace_back(std::move(cur_jump_table));
+}
+
+void N64Recomp::LiveGenerator::emit_case(int case_index, const std::string& target_label) const {
+    (void)case_index;
+    (void)target_label;
+    // Nothing to do here, the jump table is built in emit_switch.
+}
+
+void N64Recomp::LiveGenerator::emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const {
+    (void)instr_vram;
+    (void)jtbl_vram;
+    // Nothing to do here, the switch error jump was already emitted in emit_switch.
+}
+
+void N64Recomp::LiveGenerator::emit_switch_close() const {
+    // Nothing to do here, the jump table is built in emit_switch.
+}
+
+void N64Recomp::LiveGenerator::emit_return() const {
+    sljit_emit_return_void(compiler);
+}
+
+void N64Recomp::LiveGenerator::emit_check_fr(int fpr) const {
+    (void)fpr;
+    // Nothing to do here.
+}
+
+void N64Recomp::LiveGenerator::emit_check_nan(int fpr, bool is_double) const {
+    (void)fpr;
+    (void)is_double;
+    // Nothing to do here.
+}
+
+void N64Recomp::LiveGenerator::emit_cop0_status_read(int reg) const {
+    // Skip the read if the target is the zero register.
+    if (reg != 0) {
+        // Load ctx into R0.
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, Registers::ctx, 0);
+
+        // Call cop0_status_read, which takes ctx and returns the status value.
+        sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1(W, P), SLJIT_IMM, sljit_sw(inputs.cop0_status_read));
+
+        // Store the result in the output register.
+ sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(Registers::ctx), get_gpr_context_offset(reg), SLJIT_R0, 0); + } +} + +void N64Recomp::LiveGenerator::emit_cop0_status_write(int reg) const { + sljit_sw src; + sljit_sw srcw; + get_gpr_values(reg, src, srcw); + + // Load ctx and the input register value into R0 and R1 + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, Registers::ctx, 0); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, src, srcw); + + // Call cop0_status_write. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2V(P,32), SLJIT_IMM, sljit_sw(inputs.cop0_status_write)); +} + +void N64Recomp::LiveGenerator::emit_cop1_cs_read(int reg) const { + // Skip the read if the target is the zero register. + if (reg != 0) { + sljit_sw dst; + sljit_sw dstw; + get_gpr_values(reg, dst, dstw); + + // Call get_cop1_cs. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS0(32), SLJIT_IMM, sljit_sw(get_cop1_cs)); + + // Store the result in the output register. + sljit_emit_op1(compiler, SLJIT_MOV_S32, dst, dstw, SLJIT_RETURN_REG, 0); + } +} + +void N64Recomp::LiveGenerator::emit_cop1_cs_write(int reg) const { + sljit_sw src; + sljit_sw srcw; + get_gpr_values(reg, src, srcw); + + // Load the input register value into R0. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src, srcw); + + // Call set_cop1_cs. + sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(32), SLJIT_IMM, sljit_sw(set_cop1_cs)); +} + +void N64Recomp::LiveGenerator::emit_muldiv(InstrId instr_id, int reg1, int reg2) const { + sljit_sw src1; + sljit_sw src1w; + sljit_sw src2; + sljit_sw src2w; + get_gpr_values(reg1, src1, src1w); + get_gpr_values(reg2, src2, src2w); + + auto do_mul32_op = [src1, src1w, src2, src2w, this](bool is_signed) { + // Load the two inputs into the multiplication input registers (R0/R1). + if (is_signed) { + // 32-bit signed multiplication is really 64 bits * 35 bits, so load accordingly. + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src1, src1w); + + // Sign extend to 35 bits by shifting left by 64 - 35 and then shifting right by the same amount. + sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, src2, src2w, SLJIT_IMM, 64 - 35); + sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 64 - 35); + } + else { + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R0, 0, src1, src1w); + sljit_emit_op1(compiler, SLJIT_MOV_U32, SLJIT_R1, 0, src2, src2w); + } + + // Perform the multiplication. + sljit_emit_op0(compiler, is_signed ? SLJIT_LMUL_SW : SLJIT_LMUL_UW); + + // Move the results into hi and lo with sign extension. + sljit_emit_op2(compiler, SLJIT_ASHR, Registers::hi, 0, SLJIT_R0, 0, SLJIT_IMM, 32); + sljit_emit_op1(compiler, SLJIT_MOV_S32, Registers::lo, 0, SLJIT_R0, 0); + }; + + auto do_mul64_op = [src1, src1w, src2, src2w, this](bool is_signed) { + // Load the two inputs into the multiplication input registers (R0/R1). + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R0, 0, src1, src1w); + sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, src2, src2w); + + // Perform the multiplication. + sljit_emit_op0(compiler, is_signed ? SLJIT_LMUL_SW : SLJIT_LMUL_UW); + + // Move the results into hi and lo. + sljit_emit_op1(compiler, SLJIT_MOV, Registers::hi, 0, SLJIT_R1, 0); + sljit_emit_op1(compiler, SLJIT_MOV, Registers::lo, 0, SLJIT_R0, 0); + }; + + auto do_div_op = [src1, src1w, src2, src2w, this](bool doubleword, bool is_signed) { + // Pick the division opcode based on the bit width and signedness. 
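+        // (Added note: sljit's DIVMOD opcodes divide R0 by R1 in place, leaving the quotient
+        // in R0 and the remainder in R1; that is why the operands are loaded into R0/R1 here
+        // and the results are saved from those registers afterwards.)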
+        // Note that the 64-bit division opcode is used for 32-bit signed division to match hardware behavior and prevent overflow.
+        sljit_sw div_opcode = doubleword ?
+            (is_signed ? SLJIT_DIVMOD_SW : SLJIT_DIVMOD_UW) :
+            (is_signed ? SLJIT_DIVMOD_SW : SLJIT_DIVMOD_U32);
+
+        // Pick the move opcode to use for loading the operands.
+        sljit_sw load_opcode = doubleword ? SLJIT_MOV :
+            (is_signed ? SLJIT_MOV_S32 : SLJIT_MOV_U32);
+
+        // Pick the move opcode to use for saving the results.
+        sljit_sw save_opcode = doubleword ? SLJIT_MOV : SLJIT_MOV_S32;
+
+        // Load the two inputs into R0 and R1 (the numerator and denominator).
+        sljit_emit_op1(compiler, load_opcode, SLJIT_R0, 0, src1, src1w);
+
+        // TODO figure out 32-bit signed division behavior when inputs aren't properly sign extended.
+        // if (!doubleword && is_signed) {
+        //     // Sign extend to 35 bits by shifting left by 64 - 35 and then shifting right by the same amount.
+        //     sljit_emit_op2(compiler, SLJIT_SHL, SLJIT_R1, 0, src2, src2w, SLJIT_IMM, 64 - 35);
+        //     sljit_emit_op2(compiler, SLJIT_ASHR, SLJIT_R1, 0, SLJIT_R1, 0, SLJIT_IMM, 64 - 35);
+        // }
+        // else {
+            sljit_emit_op1(compiler, load_opcode, SLJIT_R1, 0, src2, src2w);
+        // }
+
+        // Prevent overflow on 64-bit signed division.
+        if (doubleword && is_signed) {
+            // If the numerator is INT64_MIN and the denominator is -1, an overflow will occur. To prevent an exception and
+            // behave as the original hardware would, check if either of those conditions is false.
+            // If neither condition is false (i.e. both are true), set the denominator to 1.
+
+            // Xor the numerator with INT64_MIN. This will be zero if they're equal.
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp1, 0, SLJIT_IMM, sljit_sw(INT64_MIN));
+
+            // Invert the denominator. This will be zero if it's -1.
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::arithmetic_temp4, 0, Registers::arithmetic_temp2, 0, SLJIT_IMM, sljit_sw(-1));
+
+            // Or the results of the previous two calculations and set the zero flag. This will be zero if both conditions were met.
+            sljit_emit_op2(compiler, SLJIT_OR | SLJIT_SET_Z, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp3, 0, Registers::arithmetic_temp4, 0);
+
+            // If the OR's result was zero (zero flag set), meaning both conditions were true, replace the denominator with 1.
+            // i.e. conditionally move an immediate of 1 into the denominator register (R1) when the zero flag is set.
+            sljit_emit_select(compiler, SLJIT_ZERO, SLJIT_R1, SLJIT_IMM, 1, SLJIT_R1);
+        }
+
+        // If the denominator is 0, skip the division and jump to the special handling for that case.
+        sljit_jump* jump_skip_division = sljit_emit_cmp(compiler, SLJIT_EQUAL, SLJIT_R1, 0, SLJIT_IMM, 0);
+
+        // Perform the division.
+        sljit_emit_op0(compiler, div_opcode);
+
+        // Extract the remainder and quotient into the high and low registers respectively.
+        sljit_emit_op1(compiler, save_opcode, Registers::hi, 0, SLJIT_R1, 0);
+        sljit_emit_op1(compiler, save_opcode, Registers::lo, 0, SLJIT_R0, 0);
+
+        // Jump to the end of this routine.
+        sljit_jump* jump_to_end = sljit_emit_jump(compiler, SLJIT_JUMP);
+
+        // Emit a label and set it as the target of the jump if the denominator was zero.
+        sljit_label* after_division = sljit_emit_label(compiler);
+        sljit_set_label(jump_skip_division, after_division);
+
+        // Move the numerator into hi.
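+        // (Added background: this mirrors the VR4300's observed divide-by-zero behavior.
+        // HI receives the numerator, and LO receives -1 for an unsigned or non-negative
+        // signed numerator, or 1 for a negative one: the negative signum computed below.)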
+        sljit_emit_op1(compiler, save_opcode, Registers::hi, 0, SLJIT_R0, 0);
+
+        if (is_signed) {
+            // Calculate the negative signum of the numerator and place it in lo.
+            // neg_signum = ((int64_t)(~x) >> (bit width - 1)) | 1
+            sljit_emit_op2(compiler, SLJIT_XOR, Registers::lo, 0, SLJIT_R0, 0, SLJIT_IMM, sljit_sw(-1));
+            sljit_emit_op2(compiler, SLJIT_ASHR, Registers::lo, 0, Registers::lo, 0, SLJIT_IMM, 64 - 1);
+            sljit_emit_op2(compiler, SLJIT_OR, Registers::lo, 0, Registers::lo, 0, SLJIT_IMM, 1);
+        }
+        else {
+            // Move -1 into lo.
+            sljit_emit_op1(compiler, SLJIT_MOV, Registers::lo, 0, SLJIT_IMM, sljit_sw(-1));
+        }
+
+        // Emit a label and set it as the target of the jump after the division.
+        sljit_label* end_label = sljit_emit_label(compiler);
+        sljit_set_label(jump_to_end, end_label);
+    };
+
+
+    switch (instr_id) {
+        case InstrId::cpu_mult:
+            do_mul32_op(true);
+            break;
+        case InstrId::cpu_multu:
+            do_mul32_op(false);
+            break;
+        case InstrId::cpu_dmult:
+            do_mul64_op(true);
+            break;
+        case InstrId::cpu_dmultu:
+            do_mul64_op(false);
+            break;
+        case InstrId::cpu_div:
+            do_div_op(false, true);
+            break;
+        case InstrId::cpu_divu:
+            do_div_op(false, false);
+            break;
+        case InstrId::cpu_ddiv:
+            do_div_op(true, true);
+            break;
+        case InstrId::cpu_ddivu:
+            do_div_op(true, false);
+            break;
+        default:
+            assert(false && "Invalid mul/div instruction id!");
+            break;
+    }
+}
+
+void N64Recomp::LiveGenerator::emit_syscall(uint32_t instr_vram) const {
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // Load the vram into R2.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, instr_vram);
+    // Call syscall_handler.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, P, 32), SLJIT_IMM, sljit_sw(inputs.syscall_handler));
+}
+
+void N64Recomp::LiveGenerator::emit_do_break(uint32_t instr_vram) const {
+    // Load the vram into R0.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R0, 0, SLJIT_IMM, instr_vram);
+    // Call do_break.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(32), SLJIT_IMM, sljit_sw(inputs.do_break));
+}
+
+void N64Recomp::LiveGenerator::emit_pause_self() const {
+    // Load rdram into R0.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    // Call pause_self.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS1V(P), SLJIT_IMM, sljit_sw(inputs.pause_self));
+}
+
+void N64Recomp::LiveGenerator::emit_trigger_event(uint32_t event_index) const {
+    // Load rdram and ctx into R0 and R1.
+    sljit_emit_op2(compiler, SLJIT_ADD, SLJIT_R0, 0, Registers::rdram, 0, SLJIT_IMM, rdram_offset);
+    sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_R1, 0, Registers::ctx, 0);
+    // Load the global event index into R2.
+    sljit_emit_op1(compiler, SLJIT_MOV32, SLJIT_R2, 0, SLJIT_IMM, event_index + inputs.base_event_index);
+    // Call trigger_event, which takes rdram, ctx, and the event index.
+    sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS3V(P, P, 32), SLJIT_IMM, sljit_sw(inputs.trigger_event));
+}
+
+void N64Recomp::LiveGenerator::emit_comment(const std::string& comment) const {
+    (void)comment;
+    // Nothing to do here.
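+    // (Added note: comments are only meaningful for the C generator, which writes them into
+    // its text output; there is nowhere to attach them in directly generated machine code.)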
+}
+
+bool N64Recomp::recompile_function_live(LiveGenerator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    return recompile_function_custom(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs);
+}
+
diff --git a/LiveRecomp/live_recompiler_test.cpp b/LiveRecomp/live_recompiler_test.cpp
new file mode 100644
index 0000000..c5673eb
--- /dev/null
+++ b/LiveRecomp/live_recompiler_test.cpp
@@ -0,0 +1,364 @@
+#include <chrono>
+#include <cinttypes>
+#include <filesystem>
+#include <fstream>
+
+#include "sljitLir.h"
+#include "recompiler/live_recompiler.h"
+#include "recomp.h"
+
+static std::vector<uint8_t> read_file(const std::filesystem::path& path, bool& found) {
+    std::vector<uint8_t> ret;
+    found = false;
+
+    std::ifstream file{ path, std::ios::binary};
+
+    if (file.good()) {
+        file.seekg(0, std::ios::end);
+        ret.resize(file.tellg());
+        file.seekg(0, std::ios::beg);
+
+        file.read(reinterpret_cast<char*>(ret.data()), ret.size());
+        found = true;
+    }
+
+    return ret;
+}
+
+
+uint32_t read_u32_swap(const std::vector<uint8_t>& vec, size_t offset) {
+    return byteswap(*reinterpret_cast<const uint32_t*>(&vec[offset]));
+}
+
+uint32_t read_u32(const std::vector<uint8_t>& vec, size_t offset) {
+    return *reinterpret_cast<const uint32_t*>(&vec[offset]);
+}
+
+std::vector<uint8_t> rdram;
+
+void byteswap_copy(uint8_t* dst, uint8_t* src, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        dst[i ^ 3] = src[i];
+    }
+}
+
+bool byteswap_compare(uint8_t* a, uint8_t* b, size_t count) {
+    for (size_t i = 0; i < count; i++) {
+        if (a[i ^ 3] != b[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+enum class TestError {
+    Success,
+    FailedToOpenInput,
+    FailedToRecompile,
+    UnknownStructType,
+    DataDifference
+};
+
+struct TestStats {
+    TestError error;
+    uint64_t codegen_microseconds;
+    uint64_t execution_microseconds;
+    uint64_t code_size;
+};
+
+void write1(uint8_t* rdram, recomp_context* ctx) {
+    MEM_B(0, ctx->r4) = 1;
+}
+
+recomp_func_t* test_get_function(int32_t vram) {
+    if (vram == 0x80100000) {
+        return write1;
+    }
+    assert(false);
+    return nullptr;
+}
+
+void test_switch_error(const char* func, uint32_t vram, uint32_t jtbl) {
+    printf(" Switch-case out of bounds in %s at 0x%08X for jump table at 0x%08X\n", func, vram, jtbl);
+}
+
+TestStats run_test(const std::filesystem::path& tests_dir, const std::string& test_name) {
+    std::filesystem::path input_path = tests_dir / (test_name + "_data.bin");
+    std::filesystem::path data_dump_path = tests_dir / (test_name + "_data_out.bin");
+
+    bool found;
+    std::vector<uint8_t> file_data = read_file(input_path, found);
+
+    if (!found) {
+        printf("Failed to open file: %s\n", input_path.string().c_str());
+        return { TestError::FailedToOpenInput };
+    }
+
+    // Parse the test file.
+    uint32_t text_offset = read_u32_swap(file_data, 0x00);
+    uint32_t text_length = read_u32_swap(file_data, 0x04);
+    uint32_t init_data_offset = read_u32_swap(file_data, 0x08);
+    uint32_t good_data_offset = read_u32_swap(file_data, 0x0C);
+    uint32_t data_length = read_u32_swap(file_data, 0x10);
+    uint32_t text_address = read_u32_swap(file_data, 0x14);
+    uint32_t data_address = read_u32_swap(file_data, 0x18);
+    uint32_t next_struct_address = read_u32_swap(file_data, 0x1C);
+
+    recomp_context ctx{};
+
+    byteswap_copy(&rdram[text_address - 0x80000000], &file_data[text_offset], text_length);
+    byteswap_copy(&rdram[data_address - 0x80000000], &file_data[init_data_offset], data_length);
+
+    // Build recompiler context.
+    N64Recomp::Context context{};
+
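+    // (Added recap of the header parsed above: eight big-endian words at the start of the
+    // test file: 0x00 text_offset, 0x04 text_length, 0x08 init_data_offset,
+    // 0x0C good_data_offset, 0x10 data_length, 0x14 text_address, 0x18 data_address,
+    // 0x1C next_struct_address.)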
+    // Move the file data into the context.
+    context.rom = std::move(file_data);
+
+    context.sections.resize(2);
+    // Create a section for the function to exist in.
+    context.sections[0].ram_addr = text_address;
+    context.sections[0].rom_addr = text_offset;
+    context.sections[0].size = text_length;
+    context.sections[0].name = ".text";
+    context.sections[0].executable = true;
+    context.sections[0].relocatable = true;
+    context.section_functions.resize(context.sections.size());
+    // Create a section for .data (used for relocations)
+    context.sections[1].ram_addr = data_address;
+    context.sections[1].rom_addr = init_data_offset;
+    context.sections[1].size = data_length;
+    context.sections[1].name = ".data";
+    context.sections[1].executable = false;
+    context.sections[1].relocatable = true;
+
+    size_t start_func_index;
+    uint32_t function_desc_address = 0;
+    uint32_t reloc_desc_address = 0;
+
+    // Read any extra structs.
+    while (next_struct_address != 0) {
+        uint32_t cur_struct_address = next_struct_address;
+        uint32_t struct_type = read_u32_swap(context.rom, next_struct_address + 0x00);
+        next_struct_address = read_u32_swap(context.rom, next_struct_address + 0x04);
+
+        switch (struct_type) {
+            case 1: // Function desc
+                function_desc_address = cur_struct_address;
+                break;
+            case 2: // Relocation
+                reloc_desc_address = cur_struct_address;
+                break;
+            default:
+                printf("Unknown struct type %u\n", struct_type);
+                return { TestError::UnknownStructType };
+        }
+    }
+
+    // Check if a function description exists.
+    if (function_desc_address == 0) {
+        // No function description, so treat the whole thing as one function.
+
+        // Get the function's instruction words.
+        std::vector<uint32_t> text_words{};
+        text_words.resize(text_length / sizeof(uint32_t));
+        for (size_t i = 0; i < text_words.size(); i++) {
+            text_words[i] = read_u32(context.rom, text_offset + i * sizeof(uint32_t));
+        }
+
+        // Add the function to the context.
+        context.functions_by_vram[text_address].emplace_back(context.functions.size());
+        context.section_functions.emplace_back(context.functions.size());
+        context.sections[0].function_addrs.emplace_back(text_address);
+        context.functions.emplace_back(
+            text_address,
+            text_offset,
+            text_words,
+            "test_func",
+            0
+        );
+        start_func_index = 0;
+    }
+    else {
+        // Use the function description.
+        uint32_t num_funcs = read_u32_swap(context.rom, function_desc_address + 0x08);
+        start_func_index = read_u32_swap(context.rom, function_desc_address + 0x0C);
+
+        for (size_t func_index = 0; func_index < num_funcs; func_index++) {
+            uint32_t cur_func_address = read_u32_swap(context.rom, function_desc_address + 0x10 + 0x00 + 0x08 * func_index);
+            uint32_t cur_func_length = read_u32_swap(context.rom, function_desc_address + 0x10 + 0x04 + 0x08 * func_index);
+            uint32_t cur_func_offset = cur_func_address - text_address + text_offset;
+
+            // Get the function's instruction words.
+            std::vector<uint32_t> text_words{};
+            text_words.resize(cur_func_length / sizeof(uint32_t));
+            for (size_t i = 0; i < text_words.size(); i++) {
+                text_words[i] = read_u32(context.rom, cur_func_offset + i * sizeof(uint32_t));
+            }
+
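+            // (Added note: each entry parsed above is a pair of big-endian words starting at
+            // offset 0x10 of the function-description struct: the function's vram address
+            // followed by its length in bytes.)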
+            // Add the function to the context.
+            context.functions_by_vram[cur_func_address].emplace_back(context.functions.size());
+            context.section_functions.emplace_back(context.functions.size());
+            context.sections[0].function_addrs.emplace_back(cur_func_address);
+            context.functions.emplace_back(
+                cur_func_address,
+                cur_func_offset,
+                std::move(text_words),
+                "test_func_" + std::to_string(func_index),
+                0
+            );
+        }
+    }
+
+    // Check if a relocation description exists.
+    if (reloc_desc_address != 0) {
+        uint32_t num_relocs = read_u32_swap(context.rom, reloc_desc_address + 0x08);
+        for (uint32_t reloc_index = 0; reloc_index < num_relocs; reloc_index++) {
+            uint32_t cur_desc_address = reloc_desc_address + 0x0C + reloc_index * 4 * sizeof(uint32_t);
+            uint32_t reloc_type = read_u32_swap(context.rom, cur_desc_address + 0x00);
+            uint32_t reloc_section = read_u32_swap(context.rom, cur_desc_address + 0x04);
+            uint32_t reloc_address = read_u32_swap(context.rom, cur_desc_address + 0x08);
+            uint32_t reloc_target_offset = read_u32_swap(context.rom, cur_desc_address + 0x0C);
+
+            context.sections[0].relocs.emplace_back(N64Recomp::Reloc{
+                .address = reloc_address,
+                .target_section_offset = reloc_target_offset,
+                .symbol_index = 0,
+                .target_section = static_cast<uint16_t>(reloc_section),
+                .type = static_cast<N64Recomp::RelocType>(reloc_type),
+                .reference_symbol = false
+            });
+        }
+    }
+
+    std::vector<std::vector<uint32_t>> dummy_static_funcs{};
+    std::vector<int32_t> section_addresses{};
+    section_addresses.emplace_back(text_address);
+    section_addresses.emplace_back(data_address);
+
+    auto before_codegen = std::chrono::system_clock::now();
+
+    N64Recomp::LiveGeneratorInputs generator_inputs {
+        .switch_error = test_switch_error,
+        .get_function = test_get_function,
+        .reference_section_addresses = nullptr,
+        .local_section_addresses = section_addresses.data()
+    };
+
+    // Create the sljit compiler and the generator.
+    N64Recomp::LiveGenerator generator{ context.functions.size(), generator_inputs };
+
+    for (size_t func_index = 0; func_index < context.functions.size(); func_index++) {
+        std::ostringstream dummy_ostream{};
+
+        //sljit_emit_op0(compiler, SLJIT_BREAKPOINT);
+
+        if (!N64Recomp::recompile_function_live(generator, context, func_index, dummy_ostream, dummy_static_funcs, true)) {
+            return { TestError::FailedToRecompile };
+        }
+    }
+
+    // Generate the code.
+    N64Recomp::LiveGeneratorOutput output = generator.finish();
+
+    auto after_codegen = std::chrono::system_clock::now();
+
+    auto before_execution = std::chrono::system_clock::now();
+
+    int old_rounding = fegetround();
+
+    // Run the generated code.
+    ctx.r29 = 0xFFFFFFFF80000000 + rdram.size() - 0x10; // Set the stack pointer.
+    output.functions[start_func_index](rdram.data(), &ctx);
+
+    fesetround(old_rounding);
+
+    auto after_execution = std::chrono::system_clock::now();
+
+    // Check the result of running the code.
+    bool good = byteswap_compare(&rdram[data_address - 0x80000000], &context.rom[good_data_offset], data_length);
+
+    // Dump the data if the results don't match.
+    if (!good) {
+        std::ofstream data_dump_file{ data_dump_path, std::ios::binary };
+        std::vector<uint8_t> data_swapped;
+        data_swapped.resize(data_length);
+        byteswap_copy(data_swapped.data(), &rdram[data_address - 0x80000000], data_length);
+        data_dump_file.write(reinterpret_cast<const char*>(data_swapped.data()), data_length);
+        return { TestError::DataDifference };
+    }
+
+    // Return the test's stats.
+    TestStats ret{};
+    ret.error = TestError::Success;
+    ret.codegen_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(after_codegen - before_codegen).count();
+    ret.execution_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(after_execution - before_execution).count();
+    ret.code_size = output.code_size;
+
+    return ret;
+}
+
+int main(int argc, const char** argv) {
+    if (argc < 3) {
+        printf("Usage: %s [test directory] [test 1] ...\n", argv[0]);
+        return EXIT_SUCCESS;
+    }
+
+    N64Recomp::live_recompiler_init();
+
+    rdram.resize(0x8000000);
+
+    // Skip the first argument (program name) and second argument (test directory).
+    int count = argc - 1 - 1;
+    int passed_count = 0;
+
+    std::vector<size_t> failed_tests{};
+
+    for (size_t test_index = 0; test_index < count; test_index++) {
+        const char* cur_test_name = argv[2 + test_index];
+        printf("Running test: %s\n", cur_test_name);
+        TestStats stats = run_test(argv[1], cur_test_name);
+
+        switch (stats.error) {
+            case TestError::Success:
+                printf(" Success\n");
+                printf(" Generated %" PRIu64 " bytes in %" PRIu64 " microseconds and ran in %" PRIu64 " microseconds\n",
+                    stats.code_size, stats.codegen_microseconds, stats.execution_microseconds);
+                passed_count++;
+                break;
+            case TestError::FailedToOpenInput:
+                printf(" Failed to open input data file\n");
+                break;
+            case TestError::FailedToRecompile:
+                printf(" Failed to recompile\n");
+                break;
+            case TestError::UnknownStructType:
+                printf(" Unknown additional data struct type in test data\n");
+                break;
+            case TestError::DataDifference:
+                printf(" Output data did not match, dumped to file\n");
+                break;
+        }
+
+        if (stats.error != TestError::Success) {
+            failed_tests.emplace_back(test_index);
+        }
+
+        printf("\n");
+    }
+
+    printf("Passed %d/%d tests\n", passed_count, count);
+    if (!failed_tests.empty()) {
+        printf(" Failed: ");
+        for (size_t i = 0; i < failed_tests.size(); i++) {
+            size_t test_index = failed_tests[i];
+
+            printf("%s", argv[2 + test_index]);
+            if (i != failed_tests.size() - 1) {
+                printf(", ");
+            }
+        }
+        printf("\n");
+    }
+    return 0;
+}
diff --git a/OfflineModRecomp/main.cpp b/OfflineModRecomp/main.cpp
index aa25dc8..29e5232 100644
--- a/OfflineModRecomp/main.cpp
+++ b/OfflineModRecomp/main.cpp
@@ -3,7 +3,7 @@
 #include <filesystem>
 #include <fstream>
 
-#include "n64recomp.h"
+#include "recompiler/context.h"
 #include "rabbitizer.hpp"
 
 static std::vector<uint8_t> read_file(const std::filesystem::path& path, bool& found) {
@@ -221,8 +221,7 @@ int main(int argc, const char** argv) {
 
     // Perform a second pass for recompiling all the functions.
     for (size_t func_index = 0; func_index < mod_context.functions.size(); func_index++) {
-        auto& func = mod_context.functions[func_index];
-        if (!N64Recomp::recompile_function(mod_context, func, output_file, static_funcs_by_section, true)) {
+        if (!N64Recomp::recompile_function(mod_context, func_index, output_file, static_funcs_by_section, true)) {
             output_file.close();
             std::error_code ec;
             std::filesystem::remove(output_file_path, ec);
diff --git a/RecompModTool/main.cpp b/RecompModTool/main.cpp
index 78649ef..9fbb7d1 100644
--- a/RecompModTool/main.cpp
+++ b/RecompModTool/main.cpp
@@ -7,7 +7,7 @@
 #include
 #include "fmt/format.h"
 #include "fmt/ostream.h"
-#include "n64recomp.h"
+#include "recompiler/context.h"
 #include
 
 #ifdef _WIN32
diff --git a/include/generator.h b/include/generator.h
deleted file mode 100644
index 5afcc57..0000000
--- a/include/generator.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef __GENERATOR_H__
-#define __GENERATOR_H__
-
-#include "n64recomp.h"
-#include "operations.h"
-
-namespace N64Recomp {
-    struct InstructionContext {
-        int rd;
-        int rs;
-        int rt;
-        int sa;
-
-        int fd;
-        int fs;
-        int ft;
-
-        int cop1_cs;
-
-        uint16_t imm16;
-
-        bool reloc_tag_as_reference;
-        RelocType reloc_type;
-        uint32_t reloc_section_index;
-        uint32_t reloc_target_section_offset;
-    };
-
-    class Generator {
-    public:
-        virtual void process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const = 0;
-        virtual void process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const = 0;
-        virtual void process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const = 0;
-        virtual void emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const = 0;
-        virtual void emit_branch_close(std::ostream& output_file) const = 0;
-        virtual void emit_check_fr(std::ostream& output_file, int fpr) const = 0;
-        virtual void emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const = 0;
-    };
-
-    class CGenerator final : Generator {
-    public:
-        CGenerator() = default;
-        void process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const final;
-        void process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const final;
-        void process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const final;
-        void emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const final;
-        void emit_branch_close(std::ostream& output_file) const final;
-        void emit_check_fr(std::ostream& output_file, int fpr) const final;
-        void emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const final;
-    private:
-        void get_operand_string(Operand operand, UnaryOpType operation, const InstructionContext& context, std::string& operand_string) const;
-        void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const;
-        void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const;
-    };
-}
-
-#endif
diff --git a/include/recomp.h b/include/recomp.h
new file mode 100644
index 0000000..d291eec
--- /dev/null
+++ b/include/recomp.h
@@ -0,0 +1,397 @@
+#ifndef __RECOMP_H__
+#define __RECOMP_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <assert.h>
+#include <fenv.h>
+
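+// (Illustrative sketch, not part of this header: the recompiler emits one C function per MIPS
+// function using the recomp_func_t signature declared later in this file, marked with the
+// RECOMP_FUNC macro defined below. Names and body are hypothetical:
+//
+//     RECOMP_FUNC void func_80001234(uint8_t* rdram, recomp_context* ctx) {
+//         SET_FENV_ACCESS()
+//         gpr a0 = ctx->r4;                // read an argument register
+//         MEM_W(0, a0) = (int32_t)ctx->r5; // store a word into emulated RDRAM
+//         ctx->r2 = ADD32(a0, 4);          // write the return-value register
+//     }
+//
+// The macros referenced here are all defined below.)
+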
+// Compiler definition to disable inter-procedural optimization, allowing multiple functions to be in a single file without breaking interposition.
+#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+    // MSVC's __declspec(noinline) seems to disable inter-procedural optimization entirely, so it's all that's needed.
+    #define RECOMP_FUNC __declspec(noinline)
+
+    // Use MSVC's fenv_access pragma.
+    #define SET_FENV_ACCESS() _Pragma("fenv_access(on)")
+#elif defined(__clang__)
+    // Clang has no dedicated IPO attribute, so we use a combination of other attributes to give the desired behavior.
+    // The inline keyword allows multiple definitions during linking, and extern forces clang to emit an externally visible definition.
+    // Weak forces Clang to not perform any IPO as the symbol can be interposed, which prevents actual inlining due to the inline keyword.
+    // Add noinline for good measure, which doesn't conflict with the inline keyword as they have different meanings.
+    #define RECOMP_FUNC extern inline __attribute__((weak,noinline))
+
+    // Use the standard STDC FENV_ACCESS pragma.
+    #define SET_FENV_ACCESS() _Pragma("STDC FENV_ACCESS ON")
+#elif defined(__GNUC__) && !defined(__INTEL_COMPILER)
+    // Use GCC's attribute for disabling inter-procedural optimizations. Also enable the rounding-math compiler flag to disable
+    // constant folding so that arithmetic respects the floating point environment. This is needed because gcc doesn't implement
+    // any FENV_ACCESS pragma.
+    #define RECOMP_FUNC __attribute__((noipa, optimize("rounding-math")))
+
+    // There's no FENV_ACCESS pragma in gcc, so this can be empty.
+    #define SET_FENV_ACCESS()
+#else
+    #error "No RECOMP_FUNC definition for this compiler"
+#endif
+
+// Implementation of 64-bit multiply and divide instructions
+#if defined(__SIZEOF_INT128__)
+
+static inline void DMULT(int64_t a, int64_t b, int64_t* lo64, int64_t* hi64) {
+    __int128 full128 = ((__int128)a) * ((__int128)b);
+
+    *hi64 = (int64_t)(full128 >> 64);
+    *lo64 = (int64_t)(full128 >> 0);
+}
+
+static inline void DMULTU(uint64_t a, uint64_t b, uint64_t* lo64, uint64_t* hi64) {
+    unsigned __int128 full128 = ((unsigned __int128)a) * ((unsigned __int128)b);
+
+    *hi64 = (uint64_t)(full128 >> 64);
+    *lo64 = (uint64_t)(full128 >> 0);
+}
+
+#elif defined(_MSC_VER)
+
+#include <intrin.h>
+#pragma intrinsic(_mul128)
+#pragma intrinsic(_umul128)
+
+static inline void DMULT(int64_t a, int64_t b, int64_t* lo64, int64_t* hi64) {
+    *lo64 = _mul128(a, b, hi64);
+}
+
+static inline void DMULTU(uint64_t a, uint64_t b, uint64_t* lo64, uint64_t* hi64) {
+    *lo64 = _umul128(a, b, hi64);
+}
+
+#else
+#error "128-bit integer type not found"
+#endif
+
+static inline void DDIV(int64_t a, int64_t b, int64_t* quot, int64_t* rem) {
+    int overflow = ((uint64_t)a == 0x8000000000000000ull) && (b == -1ll);
+    *quot = overflow ? a : (a / b);
+    *rem = overflow ?
0 : (a % b); +} + +static inline void DDIVU(uint64_t a, uint64_t b, uint64_t* quot, uint64_t* rem) { + *quot = a / b; + *rem = a % b; +} + +typedef uint64_t gpr; + +#define SIGNED(val) \ + ((int64_t)(val)) + +#define ADD32(a, b) \ + ((gpr)(int32_t)((a) + (b))) + +#define SUB32(a, b) \ + ((gpr)(int32_t)((a) - (b))) + +#define MEM_W(offset, reg) \ + (*(int32_t*)(rdram + ((((reg) + (offset))) - 0xFFFFFFFF80000000))) + +#define MEM_H(offset, reg) \ + (*(int16_t*)(rdram + ((((reg) + (offset)) ^ 2) - 0xFFFFFFFF80000000))) + +#define MEM_B(offset, reg) \ + (*(int8_t*)(rdram + ((((reg) + (offset)) ^ 3) - 0xFFFFFFFF80000000))) + +#define MEM_HU(offset, reg) \ + (*(uint16_t*)(rdram + ((((reg) + (offset)) ^ 2) - 0xFFFFFFFF80000000))) + +#define MEM_BU(offset, reg) \ + (*(uint8_t*)(rdram + ((((reg) + (offset)) ^ 3) - 0xFFFFFFFF80000000))) + +#define SD(val, offset, reg) { \ + *(uint32_t*)(rdram + ((((reg) + (offset) + 4)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 0); \ + *(uint32_t*)(rdram + ((((reg) + (offset) + 0)) - 0xFFFFFFFF80000000)) = (uint32_t)((gpr)(val) >> 32); \ +} + +static inline uint64_t load_doubleword(uint8_t* rdram, gpr reg, gpr offset) { + uint64_t ret = 0; + uint64_t lo = (uint64_t)(uint32_t)MEM_W(reg, offset + 4); + uint64_t hi = (uint64_t)(uint32_t)MEM_W(reg, offset + 0); + ret = (lo << 0) | (hi << 32); + return ret; +} + +#define LD(offset, reg) \ + load_doubleword(rdram, offset, reg) + +static inline gpr do_lwl(uint8_t* rdram, gpr initial_value, gpr offset, gpr reg) { + // Calculate the overall address + gpr address = (offset + reg); + + // Load the aligned word + gpr word_address = address & ~0x3; + uint32_t loaded_value = MEM_W(0, word_address); + + // Mask the existing value and shift the loaded value appropriately + gpr misalignment = address & 0x3; + gpr masked_value = initial_value & (gpr)(uint32_t)~(0xFFFFFFFFu << (misalignment * 8)); + loaded_value <<= (misalignment * 8); + + // Cast to int32_t to sign extend first + return (gpr)(int32_t)(masked_value | loaded_value); +} + +static inline gpr do_lwr(uint8_t* rdram, gpr initial_value, gpr offset, gpr reg) { + // Calculate the overall address + gpr address = (offset + reg); + + // Load the aligned word + gpr word_address = address & ~0x3; + uint32_t loaded_value = MEM_W(0, word_address); + + // Mask the existing value and shift the loaded value appropriately + gpr misalignment = address & 0x3; + gpr masked_value = initial_value & (gpr)(uint32_t)~(0xFFFFFFFFu >> (24 - misalignment * 8)); + loaded_value >>= (24 - misalignment * 8); + + // Cast to int32_t to sign extend first + return (gpr)(int32_t)(masked_value | loaded_value); +} + +static inline void do_swl(uint8_t* rdram, gpr offset, gpr reg, gpr val) { + // Calculate the overall address + gpr address = (offset + reg); + + // Get the initial value of the aligned word + gpr word_address = address & ~0x3; + uint32_t initial_value = MEM_W(0, word_address); + + // Mask the initial value and shift the input value appropriately + gpr misalignment = address & 0x3; + uint32_t masked_initial_value = initial_value & ~(0xFFFFFFFFu >> (misalignment * 8)); + uint32_t shifted_input_value = ((uint32_t)val) >> (misalignment * 8); + MEM_W(0, word_address) = masked_initial_value | shifted_input_value; +} + +static inline void do_swr(uint8_t* rdram, gpr offset, gpr reg, gpr val) { + // Calculate the overall address + gpr address = (offset + reg); + + // Get the initial value of the aligned word + gpr word_address = address & ~0x3; + uint32_t initial_value = MEM_W(0, word_address); + 
+    // Mask the initial value and shift the input value appropriately
+    gpr misalignment = address & 0x3;
+    uint32_t masked_initial_value = initial_value & ~(0xFFFFFFFFu << (24 - misalignment * 8));
+    uint32_t shifted_input_value = ((uint32_t)val) << (24 - misalignment * 8);
+    MEM_W(0, word_address) = masked_initial_value | shifted_input_value;
+}
+
+static inline uint32_t get_cop1_cs() {
+    uint32_t rounding_mode = 0;
+    switch (fegetround()) {
+        // round to nearest value
+        case FE_TONEAREST:
+        default:
+            rounding_mode = 0;
+            break;
+        // round to zero (truncate)
+        case FE_TOWARDZERO:
+            rounding_mode = 1;
+            break;
+        // round to positive infinity (ceil)
+        case FE_UPWARD:
+            rounding_mode = 2;
+            break;
+        // round to negative infinity (floor)
+        case FE_DOWNWARD:
+            rounding_mode = 3;
+            break;
+    }
+    return rounding_mode;
+}
+
+static inline void set_cop1_cs(uint32_t val) {
+    uint32_t rounding_mode = val & 0x3;
+    int round = FE_TONEAREST;
+    switch (rounding_mode) {
+        case 0: // round to nearest value
+            round = FE_TONEAREST;
+            break;
+        case 1: // round to zero (truncate)
+            round = FE_TOWARDZERO;
+            break;
+        case 2: // round to positive infinity (ceil)
+            round = FE_UPWARD;
+            break;
+        case 3: // round to negative infinity (floor)
+            round = FE_DOWNWARD;
+            break;
+    }
+    fesetround(round);
+}
+
+#define S32(val) \
+    ((int32_t)(val))
+
+#define U32(val) \
+    ((uint32_t)(val))
+
+#define S64(val) \
+    ((int64_t)(val))
+
+#define U64(val) \
+    ((uint64_t)(val))
+
+#define MUL_S(val1, val2) \
+    ((val1) * (val2))
+
+#define MUL_D(val1, val2) \
+    ((val1) * (val2))
+
+#define DIV_S(val1, val2) \
+    ((val1) / (val2))
+
+#define DIV_D(val1, val2) \
+    ((val1) / (val2))
+
+#define CVT_S_W(val) \
+    ((float)((int32_t)(val)))
+
+#define CVT_D_W(val) \
+    ((double)((int32_t)(val)))
+
+#define CVT_D_L(val) \
+    ((double)((int64_t)(val)))
+
+#define CVT_S_L(val) \
+    ((float)((int64_t)(val)))
+
+#define CVT_D_S(val) \
+    ((double)(val))
+
+#define CVT_S_D(val) \
+    ((float)(val))
+
+#define TRUNC_W_S(val) \
+    ((int32_t)(val))
+
+#define TRUNC_W_D(val) \
+    ((int32_t)(val))
+
+#define TRUNC_L_S(val) \
+    ((int64_t)(val))
+
+#define TRUNC_L_D(val) \
+    ((int64_t)(val))
+
+#define DEFAULT_ROUNDING_MODE 0
+
+static inline int32_t do_cvt_w_s(float val) {
+    // Rounding mode aware float to 32-bit int conversion.
+    return (int32_t)lrintf(val);
+}
+
+#define CVT_W_S(val) \
+    do_cvt_w_s(val)
+
+static inline int64_t do_cvt_l_s(float val) {
+    // Rounding mode aware float to 64-bit int conversion.
+    return (int64_t)llrintf(val);
+}
+
+#define CVT_L_S(val) \
+    do_cvt_l_s(val)
+
+static inline int32_t do_cvt_w_d(double val) {
+    // Rounding mode aware double to 32-bit int conversion.
+    return (int32_t)lrint(val);
+}
+
+#define CVT_W_D(val) \
+    do_cvt_w_d(val)
+
+static inline int64_t do_cvt_l_d(double val) {
+    // Rounding mode aware double to 64-bit int conversion.
+    return (int64_t)llrint(val);
+}
+
+#define CVT_L_D(val) \
+    do_cvt_l_d(val)
+
+#define NAN_CHECK(val) \
+    assert(val == val)
+
+//#define NAN_CHECK(val)
+
+typedef union {
+    double d;
+    struct {
+        float fl;
+        float fh;
+    };
+    struct {
+        uint32_t u32l;
+        uint32_t u32h;
+    };
+    uint64_t u64;
+} fpr;
+
+typedef struct {
+    gpr r0, r1, r2, r3, r4, r5, r6, r7,
+        r8, r9, r10, r11, r12, r13, r14, r15,
+        r16, r17, r18, r19, r20, r21, r22, r23,
+        r24, r25, r26, r27, r28, r29, r30, r31;
+    fpr f0, f1, f2, f3, f4, f5, f6, f7,
+        f8, f9, f10, f11, f12, f13, f14, f15,
+        f16, f17, f18, f19, f20, f21, f22, f23,
+        f24, f25, f26, f27, f28, f29, f30, f31;
+    uint64_t hi, lo;
+    uint32_t* f_odd;
+    uint32_t status_reg;
+    uint8_t mips3_float_mode;
+} recomp_context;
+
+// Checks if the target is an even float register or that mips3 float mode is enabled
+#define CHECK_FR(ctx, idx) \
+    assert(((idx) & 1) == 0 || (ctx)->mips3_float_mode)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void cop0_status_write(recomp_context* ctx, gpr value);
+gpr cop0_status_read(recomp_context* ctx);
+void switch_error(const char* func, uint32_t vram, uint32_t jtbl);
+void do_break(uint32_t vram);
+
+typedef void (recomp_func_t)(uint8_t* rdram, recomp_context* ctx);
+
+recomp_func_t* get_function(int32_t vram);
+
+#define LOOKUP_FUNC(val) \
+    get_function((int32_t)(val))
+
+extern int32_t* section_addresses;
+
+#define LO16(x) \
+    ((x) & 0xFFFF)
+
+#define HI16(x) \
+    (((x) >> 16) + (((x) >> 15) & 1))
+
+#define RELOC_HI16(section_index, offset) \
+    HI16(section_addresses[section_index] + (offset))
+
+#define RELOC_LO16(section_index, offset) \
+    LO16(section_addresses[section_index] + (offset))
+
+void recomp_syscall_handler(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram);
+
+void pause_self(uint8_t *rdram);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/n64recomp.h b/include/recompiler/context.h
similarity index 93%
rename from include/n64recomp.h
rename to include/recompiler/context.h
index c214ac7..df5224d 100644
--- a/include/n64recomp.h
+++ b/include/recompiler/context.h
@@ -36,6 +36,20 @@ namespace N64Recomp {
             : vram(vram), rom(rom), words(std::move(words)), name(std::move(name)), section_index(section_index), ignored(ignored), reimplemented(reimplemented), stubbed(stubbed) {}
         Function() = default;
     };
+
+    struct JumpTable {
+        uint32_t vram;
+        uint32_t addend_reg;
+        uint32_t rom;
+        uint32_t lw_vram;
+        uint32_t addu_vram;
+        uint32_t jr_vram;
+        uint16_t section_index;
+        std::vector<uint32_t> entries;
+
+        JumpTable(uint32_t vram, uint32_t addend_reg, uint32_t rom, uint32_t lw_vram, uint32_t addu_vram, uint32_t jr_vram, uint16_t section_index, std::vector<uint32_t>&& entries)
+            : vram(vram), addend_reg(addend_reg), rom(rom), lw_vram(lw_vram), addu_vram(addu_vram), jr_vram(jr_vram), section_index(section_index), entries(std::move(entries)) {}
+    };
 
     enum class RelocType : uint8_t {
         R_MIPS_NONE = 0,
@@ -175,6 +189,8 @@
         std::vector<ReferenceSymbol> reference_symbols;
         // Mapping of symbol name to reference symbol index.
         std::unordered_map<std::string, SymbolReference> reference_symbols_by_name;
+        // Whether all reference sections should be treated as relocatable (used in live recompilation).
+        bool all_reference_sections_relocatable = false;
     public:
         std::vector<Section> sections;
         std::vector<Function> functions;
@@ -187,6 +203,8 @@ namespace N64Recomp {
         // The target ROM being recompiled, TODO move this outside of the context to avoid making a copy for mod contexts.
         // Used for reading relocations and for the output binary feature.
         std::vector<uint8_t> rom;
+        // Whether reference symbols should be validated when emitting function calls during recompilation.
+        bool skip_validating_reference_symbols = true;
 
         //// Only used by the CLI, TODO move this to a struct in the internal headers.
         // A mapping of function name to index in the functions vector
@@ -359,6 +377,9 @@
         }
 
         bool is_reference_section_relocatable(uint16_t section_index) const {
+            if (all_reference_sections_relocatable) {
+                return true;
+            }
             if (section_index == SectionAbsolute) {
                 return false;
             }
@@ -518,9 +539,15 @@
         void copy_reference_sections_from(const Context& rhs) {
             reference_sections = rhs.reference_sections;
         }
+
+        void set_all_reference_sections_relocatable() {
+            all_reference_sections_relocatable = true;
+        }
     };
 
-    bool recompile_function(const Context& context, const Function& func, std::ofstream& output_file, std::span<std::vector<uint32_t>> static_funcs, bool tag_reference_relocs);
+    class Generator;
+    bool recompile_function(const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs, bool tag_reference_relocs);
+    bool recompile_function_custom(Generator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs);
 
     enum class ModSymbolsError {
         Good,
diff --git a/include/recompiler/generator.h b/include/recompiler/generator.h
new file mode 100644
index 0000000..0ffde0b
--- /dev/null
+++ b/include/recompiler/generator.h
@@ -0,0 +1,109 @@
+#ifndef __GENERATOR_H__
+#define __GENERATOR_H__
+
+#include "recompiler/context.h"
+#include "operations.h"
+
+namespace N64Recomp {
+    struct InstructionContext {
+        int rd;
+        int rs;
+        int rt;
+        int sa;
+
+        int fd;
+        int fs;
+        int ft;
+
+        int cop1_cs;
+
+        uint16_t imm16;
+
+        bool reloc_tag_as_reference;
+        RelocType reloc_type;
+        uint32_t reloc_section_index;
+        uint32_t reloc_target_section_offset;
+    };
+
+    class Generator {
+    public:
+        virtual void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const = 0;
+        virtual void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const = 0;
+        virtual void process_store_op(const StoreOp& op, const InstructionContext& ctx) const = 0;
+        virtual void emit_function_start(const std::string& function_name, size_t func_index) const = 0;
+        virtual void emit_function_end() const = 0;
+        virtual void emit_function_call_lookup(uint32_t addr) const = 0;
+        virtual void emit_function_call_by_register(int reg) const = 0;
+        // section_index and target_section_offset can each be deduced from symbol_index if the full context is available,
+        // but for live recompilation the reference symbol list is unavailable so it's still provided.
+ virtual void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const = 0; + virtual void emit_function_call(const Context& context, size_t function_index) const = 0; + virtual void emit_named_function_call(const std::string& function_name) const = 0; + virtual void emit_goto(const std::string& target) const = 0; + virtual void emit_label(const std::string& label_name) const = 0; + virtual void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const = 0; + virtual void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const = 0; + virtual void emit_branch_close() const = 0; + virtual void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const = 0; + virtual void emit_case(int case_index, const std::string& target_label) const = 0; + virtual void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const = 0; + virtual void emit_switch_close() const = 0; + virtual void emit_return() const = 0; + virtual void emit_check_fr(int fpr) const = 0; + virtual void emit_check_nan(int fpr, bool is_double) const = 0; + virtual void emit_cop0_status_read(int reg) const = 0; + virtual void emit_cop0_status_write(int reg) const = 0; + virtual void emit_cop1_cs_read(int reg) const = 0; + virtual void emit_cop1_cs_write(int reg) const = 0; + virtual void emit_muldiv(InstrId instr_id, int reg1, int reg2) const = 0; + virtual void emit_syscall(uint32_t instr_vram) const = 0; + virtual void emit_do_break(uint32_t instr_vram) const = 0; + virtual void emit_pause_self() const = 0; + virtual void emit_trigger_event(uint32_t event_index) const = 0; + virtual void emit_comment(const std::string& comment) const = 0; + }; + + class CGenerator final : Generator { + public: + CGenerator(std::ostream& output_file) : output_file(output_file) {}; + void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const final; + void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const final; + void process_store_op(const StoreOp& op, const InstructionContext& ctx) const final; + void emit_function_start(const std::string& function_name, size_t func_index) const final; + void emit_function_end() const final; + void emit_function_call_lookup(uint32_t addr) const final; + void emit_function_call_by_register(int reg) const final; + void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const final; + void emit_function_call(const Context& context, size_t function_index) const final; + void emit_named_function_call(const std::string& function_name) const final; + void emit_goto(const std::string& target) const final; + void emit_label(const std::string& label_name) const final; + void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const final; + void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const final; + void emit_branch_close() const final; + void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const final; + void emit_case(int case_index, const std::string& target_label) const final; + void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const final; + void emit_switch_close() const final; + void emit_return() const final; + void emit_check_fr(int fpr) const final; + void emit_check_nan(int fpr, bool is_double) const final; + void 
emit_cop0_status_read(int reg) const final;
+        void emit_cop0_status_write(int reg) const final;
+        void emit_cop1_cs_read(int reg) const final;
+        void emit_cop1_cs_write(int reg) const final;
+        void emit_muldiv(InstrId instr_id, int reg1, int reg2) const final;
+        void emit_syscall(uint32_t instr_vram) const final;
+        void emit_do_break(uint32_t instr_vram) const final;
+        void emit_pause_self() const final;
+        void emit_trigger_event(uint32_t event_index) const final;
+        void emit_comment(const std::string& comment) const final;
+    private:
+        void get_operand_string(Operand operand, UnaryOpType operation, const InstructionContext& context, std::string& operand_string) const;
+        void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const;
+        void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const;
+        std::ostream& output_file;
+    };
+}
+
+#endif
diff --git a/include/recompiler/live_recompiler.h b/include/recompiler/live_recompiler.h
new file mode 100644
index 0000000..1b92d95
--- /dev/null
+++ b/include/recompiler/live_recompiler.h
@@ -0,0 +1,141 @@
+#ifndef __LIVE_RECOMPILER_H__
+#define __LIVE_RECOMPILER_H__
+
+#include <unordered_map>
+#include "recompiler/generator.h"
+#include "recomp.h"
+
+struct sljit_compiler;
+
+namespace N64Recomp {
+    struct LiveGeneratorContext;
+    struct ReferenceJumpDetails {
+        uint16_t section;
+        uint32_t section_offset;
+    };
+    struct LiveGeneratorOutput {
+        LiveGeneratorOutput() = default;
+        LiveGeneratorOutput(const LiveGeneratorOutput& rhs) = delete;
+        LiveGeneratorOutput(LiveGeneratorOutput&& rhs) { *this = std::move(rhs); }
+        LiveGeneratorOutput& operator=(const LiveGeneratorOutput& rhs) = delete;
+        LiveGeneratorOutput& operator=(LiveGeneratorOutput&& rhs) {
+            good = rhs.good;
+            string_literals = std::move(rhs.string_literals);
+            jump_tables = std::move(rhs.jump_tables);
+            code = rhs.code;
+            code_size = rhs.code_size;
+            functions = std::move(rhs.functions);
+            reference_symbol_jumps = std::move(rhs.reference_symbol_jumps);
+            import_jumps_by_index = std::move(rhs.import_jumps_by_index);
+            executable_offset = rhs.executable_offset;
+
+            rhs.good = false;
+            rhs.code = nullptr;
+            rhs.code_size = 0;
+            rhs.reference_symbol_jumps.clear();
+            rhs.executable_offset = 0;
+
+            return *this;
+        }
+        ~LiveGeneratorOutput();
+        size_t num_reference_symbol_jumps() const;
+        void set_reference_symbol_jump(size_t jump_index, recomp_func_t* func);
+        ReferenceJumpDetails get_reference_symbol_jump_details(size_t jump_index);
+        void populate_import_symbol_jumps(size_t import_index, recomp_func_t* func);
+        bool good = false;
+        // Storage for string literals referenced by recompiled code. These are allocated as unique_ptr arrays
+        // to prevent them from moving, as the referenced address is baked into the recompiled code.
+        std::vector<std::unique_ptr<char[]>> string_literals;
+        // Storage for jump tables referenced by recompiled code (vector of arrays of pointers). These are also
+        // allocated as unique_ptr arrays for the same reason as strings.
+        std::vector<std::unique_ptr<void*[]>> jump_tables;
+        // Recompiled code.
+        void* code;
+        // Size of the recompiled code.
+        size_t code_size;
+        // Pointers to each individual function within the recompiled code.
+        std::vector<recomp_func_t*> functions;
+    private:
+        // List of jump details and the corresponding jump instruction address. These jumps get populated after recompilation is complete
+        // during dependency resolution.
+        std::vector<std::pair<ReferenceJumpDetails, void*>> reference_symbol_jumps;
+        // Mapping of import symbol index to any jumps to that import symbol.
+        std::unordered_multimap<size_t, void*> import_jumps_by_index;
+        // sljit executable offset.
+        int64_t executable_offset;
+
+        friend class LiveGenerator;
+    };
+    struct LiveGeneratorInputs {
+        uint32_t base_event_index;
+        void (*cop0_status_write)(recomp_context* ctx, gpr value);
+        gpr (*cop0_status_read)(recomp_context* ctx);
+        void (*switch_error)(const char* func, uint32_t vram, uint32_t jtbl);
+        void (*do_break)(uint32_t vram);
+        recomp_func_t* (*get_function)(int32_t vram);
+        void (*syscall_handler)(uint8_t* rdram, recomp_context* ctx, int32_t instruction_vram);
+        void (*pause_self)(uint8_t* rdram);
+        void (*trigger_event)(uint8_t* rdram, recomp_context* ctx, uint32_t event_index);
+        int32_t *reference_section_addresses;
+        int32_t *local_section_addresses;
+    };
+    class LiveGenerator final : public Generator {
+    public:
+        LiveGenerator(size_t num_funcs, const LiveGeneratorInputs& inputs);
+        ~LiveGenerator();
+        // Prevent moving or copying.
+        LiveGenerator(const LiveGenerator& rhs) = delete;
+        LiveGenerator(LiveGenerator&& rhs) = delete;
+        LiveGenerator& operator=(const LiveGenerator& rhs) = delete;
+        LiveGenerator& operator=(LiveGenerator&& rhs) = delete;
+
+        LiveGeneratorOutput finish();
+        void process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const final;
+        void process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const final;
+        void process_store_op(const StoreOp& op, const InstructionContext& ctx) const final;
+        void emit_function_start(const std::string& function_name, size_t func_index) const final;
+        void emit_function_end() const final;
+        void emit_function_call_lookup(uint32_t addr) const final;
+        void emit_function_call_by_register(int reg) const final;
+        void emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const final;
+        void emit_function_call(const Context& context, size_t function_index) const final;
+        void emit_named_function_call(const std::string& function_name) const final;
+        void emit_goto(const std::string& target) const final;
+        void emit_label(const std::string& label_name) const final;
+        void emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const final;
+        void emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const final;
+        void emit_branch_close() const final;
+        void emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const final;
+        void emit_case(int case_index, const std::string& target_label) const final;
+        void emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const final;
+        void emit_switch_close() const final;
+        void emit_return() const final;
+        void emit_check_fr(int fpr) const final;
+        void emit_check_nan(int fpr, bool is_double) const final;
+        void emit_cop0_status_read(int reg) const final;
+        void emit_cop0_status_write(int reg) const final;
+        void emit_cop1_cs_read(int reg) const final;
+        void emit_cop1_cs_write(int reg) const final;
+        void emit_muldiv(InstrId instr_id, int reg1, int reg2) const final;
+        void emit_syscall(uint32_t instr_vram) const final;
+        void emit_do_break(uint32_t instr_vram) const final;
+        void emit_pause_self() const final;
+        void emit_trigger_event(uint32_t event_index) const final;
+        void emit_comment(const std::string& comment) const final;
+    private:
+        void get_operand_string(Operand operand, UnaryOpType operation, const
InstructionContext& context, std::string& operand_string) const;
+        void get_binary_expr_string(BinaryOpType type, const BinaryOperands& operands, const InstructionContext& ctx, const std::string& output, std::string& expr_string) const;
+        void get_notation(BinaryOpType op_type, std::string& func_string, std::string& infix_string) const;
+        // Loads the relocated address specified by the instruction context into the target register.
+        void load_relocated_address(const InstructionContext& ctx, int reg) const;
+        sljit_compiler* compiler;
+        LiveGeneratorInputs inputs;
+        mutable std::unique_ptr<LiveGeneratorContext> context;
+        mutable bool errored;
+    };
+
+    void live_recompiler_init();
+    bool recompile_function_live(LiveGenerator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs);
+}
+
+#endif
\ No newline at end of file
diff --git a/include/operations.h b/include/recompiler/operations.h
similarity index 92%
rename from include/operations.h
rename to include/recompiler/operations.h
index 5cb407e..65f2ed7 100644
--- a/include/operations.h
+++ b/include/recompiler/operations.h
@@ -28,13 +28,12 @@ namespace N64Recomp {
         ToU32,
         ToS64,
         ToU64,
-        NegateS32,
-        NegateS64,
         Lui,
         Mask5, // Mask to 5 bits
         Mask6, // Mask to 6 bits
         ToInt32, // Functionally equivalent to ToS32, only exists for parity with old codegen
-        Negate,
+        NegateFloat,
+        NegateDouble,
         AbsFloat,
         AbsDouble,
         SqrtFloat,
@@ -51,12 +50,20 @@
         ConvertLFromS,
         TruncateWFromS,
         TruncateWFromD,
+        TruncateLFromS,
+        TruncateLFromD,
         RoundWFromS,
         RoundWFromD,
+        RoundLFromS,
+        RoundLFromD,
         CeilWFromS,
         CeilWFromD,
+        CeilLFromS,
+        CeilLFromD,
         FloorWFromS,
-        FloorWFromD
+        FloorWFromD,
+        FloorLFromS,
+        FloorLFromD
     };
 
     enum class BinaryOpType {
@@ -92,6 +99,12 @@
         LessEq,
         Greater,
         GreaterEq,
+        EqualFloat,
+        LessFloat,
+        LessEqFloat,
+        EqualDouble,
+        LessDouble,
+        LessEqDouble,
         // Loads
         LD,
         LW,
diff --git a/lib/sljit b/lib/sljit
new file mode 160000
index 0000000..f632608
--- /dev/null
+++ b/lib/sljit
@@ -0,0 +1 @@
+Subproject commit f6326087b3404efb07c6d3deed97b3c3b8098c0c
diff --git a/src/analysis.cpp b/src/analysis.cpp
index 5dfd955..92a421e 100644
--- a/src/analysis.cpp
+++ b/src/analysis.cpp
@@ -4,7 +4,7 @@
 #include "rabbitizer.hpp"
 #include "fmt/format.h"
 
-#include "n64recomp.h"
+#include "recompiler/context.h"
 #include "analysis.h"
 
 extern "C" const char* RabbitizerRegister_getNameGpr(uint8_t regValue);
@@ -194,21 +194,11 @@ bool analyze_instruction(const rabbitizer::InstructionCpu& instr, const N64Recom
                     reg_states[rs].loaded_lw_vram,
                     reg_states[rs].loaded_addu_vram,
                     instr.getVram(),
+                    0, // section index gets filled in later
                     std::vector<uint32_t>{}
                 );
-            } else if (reg_states[rs].valid_lui && reg_states[rs].valid_addiu && !reg_states[rs].valid_addend && !reg_states[rs].valid_loaded) {
-                uint32_t address = reg_states[rs].prev_addiu_vram + reg_states[rs].prev_lui;
-                stats.absolute_jumps.emplace_back(
-                    address,
-                    instr.getVram()
-                );
-            }
-            // Allow tail calls (TODO account for trailing nops due to bad function splits)
-            else if (instr.getVram() != func.vram + (func.words.size() - 2) * sizeof(func.words[0])) {
-                // Inconclusive analysis
-                fmt::print(stderr, "Failed to to find jump table for `jr {}` at 0x{:08X} in {}\n", RabbitizerRegister_getNameGpr(rs), instr.getVram(), func.name);
-                return false;
             }
+            // TODO stricter validation on tail calls, since not all indirect jumps can be treated as one.
break; default: if (instr.modifiesRd()) { @@ -256,6 +246,7 @@ bool N64Recomp::analyze_function(const N64Recomp::Context& context, const N64Rec // TODO this assumes that the jump table is in the same section as the function itself cur_jtbl.rom = cur_jtbl.vram + func.rom - func.vram; + cur_jtbl.section_index = func.section_index; while (vram < end_address) { // Retrieve the current entry of the jump table diff --git a/src/analysis.h b/src/analysis.h index eafd1e7..9e0562e 100644 --- a/src/analysis.h +++ b/src/analysis.h @@ -4,22 +4,9 @@ #include #include -#include "n64recomp.h" +#include "recompiler/context.h" namespace N64Recomp { - struct JumpTable { - uint32_t vram; - uint32_t addend_reg; - uint32_t rom; - uint32_t lw_vram; - uint32_t addu_vram; - uint32_t jr_vram; - std::vector entries; - - JumpTable(uint32_t vram, uint32_t addend_reg, uint32_t rom, uint32_t lw_vram, uint32_t addu_vram, uint32_t jr_vram, std::vector&& entries) - : vram(vram), addend_reg(addend_reg), rom(rom), lw_vram(lw_vram), addu_vram(addu_vram), jr_vram(jr_vram), entries(std::move(entries)) {} - }; - struct AbsoluteJump { uint32_t jump_target; uint32_t instruction_vram; @@ -29,7 +16,6 @@ namespace N64Recomp { struct FunctionStats { std::vector jump_tables; - std::vector absolute_jumps; }; bool analyze_function(const Context& context, const Function& function, const std::vector& instructions, FunctionStats& stats); diff --git a/src/cgenerator.cpp b/src/cgenerator.cpp index 7751568..596ad60 100644 --- a/src/cgenerator.cpp +++ b/src/cgenerator.cpp @@ -4,11 +4,11 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "generator.h" +#include "recompiler/generator.h" struct BinaryOpFields { std::string func_string; std::string infix_string; }; -std::vector c_op_fields = []() { +static std::vector c_op_fields = []() { std::vector ret{}; ret.resize(static_cast(N64Recomp::BinaryOpType::COUNT)); std::vector ops_setup{}; @@ -45,9 +45,15 @@ std::vector c_op_fields = []() { setup_op(N64Recomp::BinaryOpType::Sra32, "S32", ">>"); // Arithmetic aspect will be taken care of by unary op for first operand. setup_op(N64Recomp::BinaryOpType::Sra64, "", ">>"); // Arithmetic aspect will be taken care of by unary op for first operand. 
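// Worked example of how this table is consumed (operand values hypothetical): for the
// float comparisons added just below, e.g. LessFloat with infix_string "<", a
// Cop1cs-destined compare of $f12 and $f14 goes through the "{} {} {} ? 1 : 0" pattern
// in get_binary_expr_string, producing approximately:
c1cs = ctx->f12.fl < ctx->f14.fl ? 1 : 0;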
setup_op(N64Recomp::BinaryOpType::Equal, "", "=="); + setup_op(N64Recomp::BinaryOpType::EqualFloat,"", "=="); + setup_op(N64Recomp::BinaryOpType::EqualDouble,"", "=="); setup_op(N64Recomp::BinaryOpType::NotEqual, "", "!="); setup_op(N64Recomp::BinaryOpType::Less, "", "<"); + setup_op(N64Recomp::BinaryOpType::LessFloat, "", "<"); + setup_op(N64Recomp::BinaryOpType::LessDouble,"", "<"); setup_op(N64Recomp::BinaryOpType::LessEq, "", "<="); + setup_op(N64Recomp::BinaryOpType::LessEqFloat,"", "<="); + setup_op(N64Recomp::BinaryOpType::LessEqDouble,"", "<="); setup_op(N64Recomp::BinaryOpType::Greater, "", ">"); setup_op(N64Recomp::BinaryOpType::GreaterEq, "", ">="); setup_op(N64Recomp::BinaryOpType::LD, "LD", ""); @@ -72,22 +78,22 @@ std::vector c_op_fields = []() { return ret; }(); -std::string gpr_to_string(int gpr_index) { +static std::string gpr_to_string(int gpr_index) { if (gpr_index == 0) { return "0"; } return fmt::format("ctx->r{}", gpr_index); } -std::string fpr_to_string(int fpr_index) { +static std::string fpr_to_string(int fpr_index) { return fmt::format("ctx->f{}.fl", fpr_index); } -std::string fpr_double_to_string(int fpr_index) { +static std::string fpr_double_to_string(int fpr_index) { return fmt::format("ctx->f{}.d", fpr_index); } -std::string fpr_u32l_to_string(int fpr_index) { +static std::string fpr_u32l_to_string(int fpr_index) { if (fpr_index & 1) { return fmt::format("ctx->f_odd[({} - 1) * 2]", fpr_index); } @@ -96,11 +102,11 @@ std::string fpr_u32l_to_string(int fpr_index) { } } -std::string fpr_u64_to_string(int fpr_index) { +static std::string fpr_u64_to_string(int fpr_index) { return fmt::format("ctx->f{}.u64", fpr_index); } -std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { +static std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { switch (context.reloc_type) { case N64Recomp::RelocType::R_MIPS_HI16: return fmt::format("{}RELOC_HI16({}, {:#X})", @@ -113,7 +119,7 @@ std::string unsigned_reloc(const N64Recomp::InstructionContext& context) { } } -std::string signed_reloc(const N64Recomp::InstructionContext& context) { +static std::string signed_reloc(const N64Recomp::InstructionContext& context) { return "(int16_t)" + unsigned_reloc(context); } @@ -223,12 +229,6 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::ToU64: // Nothing to do here, they're already U64 break; - case UnaryOpType::NegateS32: - assert(false); - break; - case UnaryOpType::NegateS64: - assert(false); - break; case UnaryOpType::Lui: operand_string = "S32(" + operand_string + " << 16)"; break; @@ -241,7 +241,10 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::ToInt32: operand_string = "(int32_t)" + operand_string; break; - case UnaryOpType::Negate: + case UnaryOpType::NegateFloat: + operand_string = "-" + operand_string; + break; + case UnaryOpType::NegateDouble: operand_string = "-" + operand_string; break; case UnaryOpType::AbsFloat: @@ -292,24 +295,48 @@ void N64Recomp::CGenerator::get_operand_string(Operand operand, UnaryOpType oper case UnaryOpType::TruncateWFromD: operand_string = "TRUNC_W_D(" + operand_string + ")"; break; + case UnaryOpType::TruncateLFromS: + operand_string = "TRUNC_L_S(" + operand_string + ")"; + break; + case UnaryOpType::TruncateLFromD: + operand_string = "TRUNC_L_D(" + operand_string + ")"; + break; case UnaryOpType::RoundWFromS: operand_string = "lroundf(" + operand_string + ")"; break; case UnaryOpType::RoundWFromD: 
operand_string = "lround(" + operand_string + ")"; break; + case UnaryOpType::RoundLFromS: + operand_string = "llroundf(" + operand_string + ")"; + break; + case UnaryOpType::RoundLFromD: + operand_string = "llround(" + operand_string + ")"; + break; case UnaryOpType::CeilWFromS: operand_string = "S32(ceilf(" + operand_string + "))"; break; case UnaryOpType::CeilWFromD: operand_string = "S32(ceil(" + operand_string + "))"; break; + case UnaryOpType::CeilLFromS: + operand_string = "S64(ceilf(" + operand_string + "))"; + break; + case UnaryOpType::CeilLFromD: + operand_string = "S64(ceil(" + operand_string + "))"; + break; case UnaryOpType::FloorWFromS: operand_string = "S32(floorf(" + operand_string + "))"; break; case UnaryOpType::FloorWFromD: operand_string = "S32(floor(" + operand_string + "))"; break; + case UnaryOpType::FloorLFromS: + operand_string = "S64(floorf(" + operand_string + "))"; + break; + case UnaryOpType::FloorLFromD: + operand_string = "S64(floor(" + operand_string + "))"; + break; } } @@ -333,10 +360,10 @@ void N64Recomp::CGenerator::get_binary_expr_string(BinaryOpType type, const Bina expr_string = fmt::format("{} {} {} ? 1 : 0", input_a, infix_string, input_b); } else if (type == BinaryOpType::Equal && operands.operands[1] == Operand::Zero && operands.operand_operations[1] == UnaryOpType::None) { - expr_string = input_a; + expr_string = "!" + input_a; } else if (type == BinaryOpType::NotEqual && operands.operands[1] == Operand::Zero && operands.operand_operations[1] == UnaryOpType::None) { - expr_string = "!" + input_a; + expr_string = input_a; } // End unnecessary cases. @@ -365,7 +392,57 @@ void N64Recomp::CGenerator::get_binary_expr_string(BinaryOpType type, const Bina } } -void N64Recomp::CGenerator::emit_branch_condition(std::ostream& output_file, const ConditionalBranchOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::emit_function_start(const std::string& function_name, size_t func_index) const { + fmt::print(output_file, + "RECOMP_FUNC void {}(uint8_t* rdram, recomp_context* ctx) {{\n" + // these variables shouldn't need to be preserved across function boundaries, so make them local for more efficient output + " uint64_t hi = 0, lo = 0, result = 0;\n" + " int c1cs = 0;\n", // cop1 conditional signal + function_name); +} + +void N64Recomp::CGenerator::emit_function_end() const { + fmt::print(output_file, ";}}\n"); +} + +void N64Recomp::CGenerator::emit_function_call_lookup(uint32_t addr) const { + fmt::print(output_file, "LOOKUP_FUNC(0x{:08X})(rdram, ctx);\n", addr); +} + +void N64Recomp::CGenerator::emit_function_call_by_register(int reg) const { + fmt::print(output_file, "LOOKUP_FUNC({})(rdram, ctx);\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_function_call_reference_symbol(const Context& context, uint16_t section_index, size_t symbol_index, uint32_t target_section_offset) const { + (void)target_section_offset; + const N64Recomp::ReferenceSymbol& sym = context.get_reference_symbol(section_index, symbol_index); + fmt::print(output_file, "{}(rdram, ctx);\n", sym.name); +} + +void N64Recomp::CGenerator::emit_function_call(const Context& context, size_t function_index) const { + fmt::print(output_file, "{}(rdram, ctx);\n", context.functions[function_index].name); +} + +void N64Recomp::CGenerator::emit_named_function_call(const std::string& function_name) const { + fmt::print(output_file, "{}(rdram, ctx);\n", function_name); +} + +void N64Recomp::CGenerator::emit_goto(const std::string& target) const { + 
fmt::print(output_file, + " goto {};\n", target); +} + +void N64Recomp::CGenerator::emit_label(const std::string& label_name) const { + fmt::print(output_file, + "{}:\n", label_name); +} + +void N64Recomp::CGenerator::emit_jtbl_addend_declaration(const JumpTable& jtbl, int reg) const { + std::string jump_variable = fmt::format("jr_addend_{:08X}", jtbl.jr_vram); + fmt::print(output_file, "gpr {} = {};\n", jump_variable, gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_branch_condition(const ConditionalBranchOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string expr_string{}; @@ -373,19 +450,114 @@ void N64Recomp::CGenerator::emit_branch_condition(std::ostream& output_file, con fmt::print(output_file, "if ({}) {{\n", expr_string); } -void N64Recomp::CGenerator::emit_branch_close(std::ostream& output_file) const { - fmt::print(output_file, " }}\n"); +void N64Recomp::CGenerator::emit_branch_close() const { + fmt::print(output_file, "}}\n"); } -void N64Recomp::CGenerator::emit_check_fr(std::ostream& output_file, int fpr) const { +void N64Recomp::CGenerator::emit_switch_close() const { + fmt::print(output_file, "}}\n"); +} + +void N64Recomp::CGenerator::emit_switch(const Context& recompiler_context, const JumpTable& jtbl, int reg) const { + (void)recompiler_context; + (void)reg; + // TODO generate code to subtract the jump table address from the register's value instead. + // Once that's done, the addend temp can be deleted to simplify the generator interface. + std::string jump_variable = fmt::format("jr_addend_{:08X}", jtbl.jr_vram); + + fmt::print(output_file, "switch ({} >> 2) {{\n", jump_variable); +} + +void N64Recomp::CGenerator::emit_case(int case_index, const std::string& target_label) const { + fmt::print(output_file, "case {}: goto {}; break;\n", case_index, target_label); +} + +void N64Recomp::CGenerator::emit_switch_error(uint32_t instr_vram, uint32_t jtbl_vram) const { + fmt::print(output_file, "default: switch_error(__func__, 0x{:08X}, 0x{:08X});\n", instr_vram, jtbl_vram); +} + +void N64Recomp::CGenerator::emit_return() const { + fmt::print(output_file, "return;\n"); +} + +void N64Recomp::CGenerator::emit_check_fr(int fpr) const { fmt::print(output_file, "CHECK_FR(ctx, {});\n ", fpr); } -void N64Recomp::CGenerator::emit_check_nan(std::ostream& output_file, int fpr, bool is_double) const { +void N64Recomp::CGenerator::emit_check_nan(int fpr, bool is_double) const { fmt::print(output_file, "NAN_CHECK(ctx->f{}.{}); ", fpr, is_double ? 
"d" : "fl"); } -void N64Recomp::CGenerator::process_binary_op(std::ostream& output_file, const BinaryOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::emit_cop0_status_read(int reg) const { + fmt::print(output_file, "{} = cop0_status_read(ctx);\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop0_status_write(int reg) const { + fmt::print(output_file, "cop0_status_write(ctx, {});", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop1_cs_read(int reg) const { + fmt::print(output_file, "{} = get_cop1_cs();\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_cop1_cs_write(int reg) const { + fmt::print(output_file, "set_cop1_cs({});\n", gpr_to_string(reg)); +} + +void N64Recomp::CGenerator::emit_muldiv(InstrId instr_id, int reg1, int reg2) const { + switch (instr_id) { + case InstrId::cpu_mult: + fmt::print(output_file, "result = S64(S32({})) * S64(S32({})); lo = S32(result >> 0); hi = S32(result >> 32);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_dmult: + fmt::print(output_file, "DMULT(S64({}), S64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_multu: + fmt::print(output_file, "result = U64(U32({})) * U64(U32({})); lo = S32(result >> 0); hi = S32(result >> 32);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_dmultu: + fmt::print(output_file, "DMULTU(U64({}), U64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_div: + // Cast to 64-bits before division to prevent artihmetic exception for s32(0x80000000) / -1 + fmt::print(output_file, "lo = S32(S64(S32({0})) / S64(S32({1}))); hi = S32(S64(S32({0})) % S64(S32({1})));\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_ddiv: + fmt::print(output_file, "DDIV(S64({}), S64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_divu: + fmt::print(output_file, "lo = S32(U32({0}) / U32({1})); hi = S32(U32({0}) % U32({1}));\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + case InstrId::cpu_ddivu: + fmt::print(output_file, "DDIVU(U64({}), U64({}), &lo, &hi);\n", gpr_to_string(reg1), gpr_to_string(reg2)); + break; + default: + assert(false); + break; + } +} + +void N64Recomp::CGenerator::emit_syscall(uint32_t instr_vram) const { + fmt::print(output_file, "recomp_syscall_handler(rdram, ctx, 0x{:08X});\n", instr_vram); +} + +void N64Recomp::CGenerator::emit_do_break(uint32_t instr_vram) const { + fmt::print(output_file, "do_break({});\n", instr_vram); +} + +void N64Recomp::CGenerator::emit_pause_self() const { + fmt::print(output_file, "pause_self(rdram);\n"); +} + +void N64Recomp::CGenerator::emit_trigger_event(uint32_t event_index) const { + fmt::print(output_file, "recomp_trigger_event(rdram, ctx, base_event_index + {});\n", event_index); +} + +void N64Recomp::CGenerator::emit_comment(const std::string& comment) const { + fmt::print(output_file, "// {}\n", comment); +} + +void N64Recomp::CGenerator::process_binary_op(const BinaryOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. 
thread_local std::string output{}; @@ -395,7 +567,7 @@ void N64Recomp::CGenerator::process_binary_op(std::ostream& output_file, const B fmt::print(output_file, "{} = {};\n", output, expression); } -void N64Recomp::CGenerator::process_unary_op(std::ostream& output_file, const UnaryOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::process_unary_op(const UnaryOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string output{}; @@ -406,7 +578,7 @@ void N64Recomp::CGenerator::process_unary_op(std::ostream& output_file, const Un fmt::print(output_file, "{} = {};\n", output, input); } -void N64Recomp::CGenerator::process_store_op(std::ostream& output_file, const StoreOp& op, const InstructionContext& ctx) const { +void N64Recomp::CGenerator::process_store_op(const StoreOp& op, const InstructionContext& ctx) const { // Thread local variables to prevent allocations when possible. // TODO these thread locals probably don't actually help right now, so figure out a better way to prevent allocations. thread_local std::string base_str{}; diff --git a/src/config.cpp b/src/config.cpp index d3b236f..f191ba5 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -3,7 +3,7 @@ #include #include "fmt/format.h" #include "config.h" -#include "n64recomp.h" +#include "recompiler/context.h" std::filesystem::path concat_if_not_empty(const std::filesystem::path& parent, const std::filesystem::path& child) { if (!child.empty()) { @@ -375,7 +375,7 @@ N64Recomp::Config::Config(const char* path) { recomp_include = recomp_include_opt.value(); } else { - recomp_include = "#include \"librecomp/recomp.h\""; + recomp_include = "#include \"recomp.h\""; } std::optional funcs_per_file_opt = input_data["functions_per_output_file"].value(); diff --git a/src/elf.cpp b/src/elf.cpp index a18fdbd..d83908c 100644 --- a/src/elf.cpp +++ b/src/elf.cpp @@ -3,7 +3,7 @@ #include "fmt/format.h" // #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "elfio/elfio.hpp" bool read_symbols(N64Recomp::Context& context, const ELFIO::elfio& elf_file, ELFIO::section* symtab_section, const N64Recomp::ElfParsingConfig& elf_config, bool dumping_context, std::unordered_map>& data_syms) { diff --git a/src/main.cpp b/src/main.cpp index a2ccdc1..8a8fe91 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -9,7 +9,7 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "config.h" #include @@ -111,7 +111,7 @@ bool compare_files(const std::filesystem::path& file1_path, const std::filesyste return std::equal(begin1, std::istreambuf_iterator(), begin2); //Second argument is end-of-range iterator } -bool recompile_single_function(const N64Recomp::Context& context, const N64Recomp::Function& func, const std::string& recomp_include, const std::filesystem::path& output_path, std::span> static_funcs_out) { +bool recompile_single_function(const N64Recomp::Context& context, size_t func_index, const std::string& recomp_include, const std::filesystem::path& output_path, std::span> static_funcs_out) { // Open the temporary output file std::filesystem::path temp_path = output_path; temp_path.replace_extension(".tmp"); @@ -127,7 +127,7 @@ bool recompile_single_function(const N64Recomp::Context& context, const N64Recom "\n", recomp_include); - if 
(!N64Recomp::recompile_function(context, func, output_file, static_funcs_out, false)) { + if (!N64Recomp::recompile_function(context, func_index, output_file, static_funcs_out, false)) { return false; } @@ -725,7 +725,7 @@ int main(int argc, char** argv) { // Recompile the function. if (config.single_file_output || config.functions_per_output_file > 1) { - result = N64Recomp::recompile_function(context, func, current_output_file, static_funcs_by_section, false); + result = N64Recomp::recompile_function(context, i, current_output_file, static_funcs_by_section, false); if (!config.single_file_output) { cur_file_function_count++; if (cur_file_function_count >= config.functions_per_output_file) { @@ -734,7 +734,7 @@ int main(int argc, char** argv) { } } else { - result = recompile_single_function(context, func, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); + result = recompile_single_function(context, i, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); } if (result == false) { fmt::print(stderr, "Error recompiling {}\n", func.name); @@ -797,22 +797,25 @@ int main(int argc, char** argv) { std::vector insn_words((cur_func_end - static_func_addr) / sizeof(uint32_t)); insn_words.assign(func_rom_start, func_rom_start + insn_words.size()); - N64Recomp::Function func { + // Create the new function and add it to the context. + size_t new_func_index = context.functions.size(); + context.functions.emplace_back( static_func_addr, rom_addr, std::move(insn_words), fmt::format("static_{}_{:08X}", section_index, static_func_addr), static_cast(section_index), false - }; + ); + const N64Recomp::Function& new_func = context.functions[new_func_index]; fmt::print(func_header_file, - "void {}(uint8_t* rdram, recomp_context* ctx);\n", func.name); + "void {}(uint8_t* rdram, recomp_context* ctx);\n", new_func.name); bool result; - size_t prev_num_statics = static_funcs_by_section[func.section_index].size(); + size_t prev_num_statics = static_funcs_by_section[new_func.section_index].size(); if (config.single_file_output || config.functions_per_output_file > 1) { - result = N64Recomp::recompile_function(context, func, current_output_file, static_funcs_by_section, false); + result = N64Recomp::recompile_function(context, new_func_index, current_output_file, static_funcs_by_section, false); if (!config.single_file_output) { cur_file_function_count++; if (cur_file_function_count >= config.functions_per_output_file) { @@ -821,14 +824,14 @@ int main(int argc, char** argv) { } } else { - result = recompile_single_function(context, func, config.recomp_include, config.output_func_path / (func.name + ".c"), static_funcs_by_section); + result = recompile_single_function(context, new_func_index, config.recomp_include, config.output_func_path / (new_func.name + ".c"), static_funcs_by_section); } // Add any new static functions that were found while recompiling this one. 
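// Background for the index-based signature above (a sketch of the rationale, not from
// the patch itself): generators can now identify a callee by its position in
// context.functions - see emit_function_call(context, function_index) in the generator
// interface - so newly synthesized statics are appended and recompiled by index:
size_t new_index = context.functions.size();        // index the new function will occupy
context.functions.emplace_back(/* vram, rom, words, name, section index, ... */);
N64Recomp::recompile_function(context, new_index, output_file, static_funcs_by_section, false);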
- size_t cur_num_statics = static_funcs_by_section[func.section_index].size(); + size_t cur_num_statics = static_funcs_by_section[new_func.section_index].size(); if (cur_num_statics != prev_num_statics) { for (size_t new_static_index = prev_num_statics; new_static_index < cur_num_statics; new_static_index++) { - uint32_t new_static_vram = static_funcs_by_section[func.section_index][new_static_index]; + uint32_t new_static_vram = static_funcs_by_section[new_func.section_index][new_static_index]; if (!statics_set.contains(new_static_vram)) { statics_set.emplace(new_static_vram); @@ -838,7 +841,7 @@ int main(int argc, char** argv) { } if (result == false) { - fmt::print(stderr, "Error recompiling {}\n", func.name); + fmt::print(stderr, "Error recompiling {}\n", new_func.name); std::exit(EXIT_FAILURE); } } diff --git a/src/mod_symbols.cpp b/src/mod_symbols.cpp index 24675fe..fcfdead 100644 --- a/src/mod_symbols.cpp +++ b/src/mod_symbols.cpp @@ -1,6 +1,6 @@ #include -#include "n64recomp.h" +#include "recompiler/context.h" struct FileHeader { char magic[8]; // N64RSYMS diff --git a/src/operations.cpp b/src/operations.cpp index d73b278..70201d3 100644 --- a/src/operations.cpp +++ b/src/operations.cpp @@ -1,4 +1,4 @@ -#include "operations.h" +#include "recompiler/operations.h" namespace N64Recomp { const std::unordered_map unary_ops { @@ -12,8 +12,8 @@ namespace N64Recomp { // Float operations { InstrId::cpu_mov_s, { UnaryOpType::None, Operand::Fd, Operand::Fs, true } }, { InstrId::cpu_mov_d, { UnaryOpType::None, Operand::FdDouble, Operand::FsDouble, true } }, - { InstrId::cpu_neg_s, { UnaryOpType::Negate, Operand::Fd, Operand::Fs, true, true } }, - { InstrId::cpu_neg_d, { UnaryOpType::Negate, Operand::FdDouble, Operand::FsDouble, true, true } }, + { InstrId::cpu_neg_s, { UnaryOpType::NegateFloat, Operand::Fd, Operand::Fs, true, true } }, + { InstrId::cpu_neg_d, { UnaryOpType::NegateDouble, Operand::FdDouble, Operand::FsDouble, true, true } }, { InstrId::cpu_abs_s, { UnaryOpType::AbsFloat, Operand::Fd, Operand::Fs, true, true } }, { InstrId::cpu_abs_d, { UnaryOpType::AbsDouble, Operand::FdDouble, Operand::FsDouble, true, true } }, { InstrId::cpu_sqrt_s, { UnaryOpType::SqrtFloat, Operand::Fd, Operand::Fs, true, true } }, @@ -65,24 +65,22 @@ namespace N64Recomp { { InstrId::cpu_ori, { BinaryOpType::Or64, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rs, Operand::ImmU16 }}} }, { InstrId::cpu_xori, { BinaryOpType::Xor64, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rs, Operand::ImmU16 }}} }, // Shifts - /* BUG Should mask after (change op to Sll32 and input op to ToU32) */ - { InstrId::cpu_sllv, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, + { InstrId::cpu_sllv, { BinaryOpType::Sll32, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsllv, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_srlv, { BinaryOpType::Srl32, Operand::Rd, {{ UnaryOpType::ToU32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsrlv, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, - /* BUG Should mask after (change op to Sra32 and input op to ToS64) */ - { InstrId::cpu_srav, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, + // 
Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. + { InstrId::cpu_srav, { BinaryOpType::Sra32, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::Mask5 }, { Operand::Rt, Operand::Rs }}} }, { InstrId::cpu_dsrav, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::Mask6 }, { Operand::Rt, Operand::Rs }}} }, // Shifts (immediate) - /* BUG Should mask after (change op to Sll32 and input op to ToU32) */ - { InstrId::cpu_sll, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, + { InstrId::cpu_sll, { BinaryOpType::Sll32, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsll, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsll32, { BinaryOpType::Sll64, Operand::Rd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, { InstrId::cpu_srl, { BinaryOpType::Srl32, Operand::Rd, {{ UnaryOpType::ToU32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsrl, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsrl32, { BinaryOpType::Srl64, Operand::Rd, {{ UnaryOpType::ToU64, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, - /* BUG should cast after (change op to Sra32 and input op to ToS64) */ - { InstrId::cpu_sra, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS32, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, + // Hardware bug: The input is not masked to 32 bits before right shifting, so bits from the upper half of the register will bleed into the lower half. 
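// Worked example of the modeled behavior (illustrative): with rt = 0x0000000100000000
// and `sra rd, rt, 1`, the unmasked 64-bit value shifts to 0x0000000080000000 and the
// low 32 bits are then sign-extended, so rd becomes 0xFFFFFFFF80000000 rather than 0 -
// hence the ToS64 input feeding a Sra32 op instead of a 32-bit mask before the shift.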
+ { InstrId::cpu_sra, { BinaryOpType::Sra32, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsra, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa }}} }, { InstrId::cpu_dsra32, { BinaryOpType::Sra64, Operand::Rd, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rt, Operand::Sa32 }}} }, // Comparisons @@ -101,47 +99,47 @@ namespace N64Recomp { { InstrId::cpu_div_s, { BinaryOpType::DivFloat, Operand::Fd, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true, true } }, { InstrId::cpu_div_d, { BinaryOpType::DivDouble, Operand::FdDouble, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true, true } }, // Float comparisons TODO remaining operations and investigate ordered/unordered and default values - { InstrId::cpu_c_lt_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_nge_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_olt_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ult_s, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_lt_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_nge_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_olt_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ult_d, { BinaryOpType::Less, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_lt_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_nge_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_olt_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ult_s, { BinaryOpType::LessFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_lt_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_nge_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_olt_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ult_d, { BinaryOpType::LessDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_le_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true 
} }, - { InstrId::cpu_c_ngt_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ole_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ule_s, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_le_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ngt_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ole_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ule_d, { BinaryOpType::LessEq, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_le_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ngt_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ole_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ule_s, { BinaryOpType::LessEqFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_le_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ngt_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ole_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ule_d, { BinaryOpType::LessEqDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_eq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ueq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_ngl_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_seq_s, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, - { InstrId::cpu_c_eq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ueq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, - { InstrId::cpu_c_ngl_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_eq_s, { BinaryOpType::EqualFloat, 
Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ueq_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_ngl_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_seq_s, { BinaryOpType::EqualFloat, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Fs, Operand::Ft }}, true } }, + { InstrId::cpu_c_eq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ueq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_ngl_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, /* TODO rename to c_seq_d when fixed in rabbitizer */ - { InstrId::cpu_c_deq_d, { BinaryOpType::Equal, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, + { InstrId::cpu_c_deq_d, { BinaryOpType::EqualDouble, Operand::Cop1cs, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::FsDouble, Operand::FtDouble }}, true } }, // Loads - { InstrId::cpu_ld, { BinaryOpType::LD, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lw, { BinaryOpType::LW, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwu, { BinaryOpType::LWU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lh, { BinaryOpType::LH, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lhu, { BinaryOpType::LHU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lb, { BinaryOpType::LB, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lbu, { BinaryOpType::LBU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldl, { BinaryOpType::LDL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldr, { BinaryOpType::LDR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwl, { BinaryOpType::LWL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwr, { BinaryOpType::LWR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_lwc1, { BinaryOpType::LW, Operand::FtU32L, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}} }, - { InstrId::cpu_ldc1, { BinaryOpType::LD, Operand::FtU64, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::ImmS16, Operand::Base }}, true } }, + { InstrId::cpu_ld, { BinaryOpType::LD, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lw, { BinaryOpType::LW, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { 
InstrId::cpu_lwu, { BinaryOpType::LWU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lh, { BinaryOpType::LH, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lhu, { BinaryOpType::LHU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lb, { BinaryOpType::LB, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lbu, { BinaryOpType::LBU, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldl, { BinaryOpType::LDL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldr, { BinaryOpType::LDR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwl, { BinaryOpType::LWL, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwr, { BinaryOpType::LWR, Operand::Rt, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_lwc1, { BinaryOpType::LW, Operand::FtU32L, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}} }, + { InstrId::cpu_ldc1, { BinaryOpType::LD, Operand::FtU64, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Base, Operand::ImmS16 }}, true } }, }; const std::unordered_map conditional_branch_ops { @@ -159,10 +157,12 @@ namespace N64Recomp { { InstrId::cpu_bltzl, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, false, true }}, { InstrId::cpu_bgezal, { BinaryOpType::GreaterEq, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, false }}, { InstrId::cpu_bgezall, { BinaryOpType::GreaterEq, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, true }}, - { InstrId::cpu_bc1f, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, - { InstrId::cpu_bc1fl, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, - { InstrId::cpu_bc1t, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, - { InstrId::cpu_bc1tl, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, + { InstrId::cpu_bltzal, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, false }}, + { InstrId::cpu_bltzall, { BinaryOpType::Less, {{ UnaryOpType::ToS64, UnaryOpType::None }, { Operand::Rs, Operand::Zero }}, true, true }}, + { InstrId::cpu_bc1f, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, + { InstrId::cpu_bc1fl, { BinaryOpType::Equal, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, + { InstrId::cpu_bc1t, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, false }}, + { InstrId::cpu_bc1tl, { BinaryOpType::NotEqual, {{ UnaryOpType::None, UnaryOpType::None }, { Operand::Cop1cs, Operand::Zero }}, false, true }}, }; const std::unordered_map store_ops { diff --git 
a/src/recompilation.cpp b/src/recompilation.cpp index 1faef25..cf12c49 100644 --- a/src/recompilation.cpp +++ b/src/recompilation.cpp @@ -8,10 +8,10 @@ #include "fmt/format.h" #include "fmt/ostream.h" -#include "n64recomp.h" +#include "recompiler/context.h" #include "analysis.h" -#include "operations.h" -#include "generator.h" +#include "recompiler/operations.h" +#include "recompiler/generator.h" enum class JalResolutionResult { NoMatch, @@ -28,7 +28,6 @@ JalResolutionResult resolve_jal(const N64Recomp::Context& context, size_t cur_se uint32_t section_vram_start = cur_section.ram_addr; uint32_t section_vram_end = cur_section.ram_addr + cur_section.size; bool in_current_section = target_func_vram >= section_vram_start && target_func_vram < section_vram_end; - bool needs_static = false; bool exact_match_found = false; // Use a thread local to prevent reallocation across runs and to allow multi-threading in the future. @@ -109,8 +108,8 @@ std::string_view ctx_gpr_prefix(int reg) { return ""; } -// Major TODO, this function grew very organically and needs to be cleaned up. Ideally, it'll get split up into some sort of lookup table grouped by similar instruction types. -bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Function& func, const N64Recomp::FunctionStats& stats, const std::unordered_set& skipped_insns, size_t instr_index, const std::vector& instructions, std::ofstream& output_file, bool indent, bool emit_link_branch, int link_branch_index, size_t reloc_index, bool& needs_link_branch, bool& is_branch_likely, bool tag_reference_relocs, std::span> static_funcs_out) { +template +bool process_instruction(GeneratorType& generator, const N64Recomp::Context& context, const N64Recomp::Function& func, const N64Recomp::FunctionStats& stats, const std::unordered_set& jtbl_lw_instructions, size_t instr_index, const std::vector& instructions, std::ostream& output_file, bool indent, bool emit_link_branch, int link_branch_index, size_t reloc_index, bool& needs_link_branch, bool& is_branch_likely, bool tag_reference_relocs, std::span> static_funcs_out) { using namespace N64Recomp; const auto& section = context.sections[func.section_index]; @@ -118,6 +117,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun needs_link_branch = false; is_branch_likely = false; uint32_t instr_vram = instr.getVram(); + InstrId instr_id = instr.getUniqueId(); auto print_indent = [&]() { fmt::print(output_file, " "); @@ -132,16 +132,20 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } // Output a comment with the original instruction - if (instr.isBranch() || instr.getUniqueId() == InstrId::cpu_j) { - fmt::print(output_file, " // 0x{:08X}: {}\n", instr_vram, instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric()))); - } else if (instr.getUniqueId() == InstrId::cpu_jal) { - fmt::print(output_file, " // 0x{:08X}: {}\n", instr_vram, instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric()))); + print_indent(); + if (instr.isBranch() || instr_id == InstrId::cpu_j) { + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0, fmt::format("L_{:08X}", (uint32_t)instr.getBranchVramGeneric())))); + } else if (instr_id == InstrId::cpu_jal) { + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0, fmt::format("0x{:08X}", (uint32_t)instr.getBranchVramGeneric())))); } else { - fmt::print(output_file, " // 0x{:08X}: {}\n", 
instr_vram, instr.disassemble(0)); + generator.emit_comment(fmt::format("0x{:08X}: {}", instr_vram, instr.disassemble(0))); } - if (skipped_insns.contains(instr_vram)) { - return true; + // Replace loads of jump table entries with addiu. This leaves the jump table entry's address in the output register + // instead of the entry's value, which can then be used to determine the offset from the start of the jump table. + if (jtbl_lw_instructions.contains(instr_vram)) { + assert(instr_id == InstrId::cpu_lw); + instr_id = InstrId::cpu_addiu; } N64Recomp::RelocType reloc_type = N64Recomp::RelocType::R_MIPS_NONE; @@ -178,9 +182,9 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // Don't try to relocate special section symbols. if (context.is_regular_reference_section(reloc.target_section) || reloc_section == N64Recomp::SectionAbsolute) { bool ref_section_relocatable = context.is_reference_section_relocatable(reloc.target_section); - uint32_t ref_section_vram = context.get_reference_section_vram(reloc.target_section); // Resolve HI16 and LO16 reference symbol relocs to non-relocatable sections by patching the instruction immediate. if (!ref_section_relocatable && (reloc_type == N64Recomp::RelocType::R_MIPS_HI16 || reloc_type == N64Recomp::RelocType::R_MIPS_LO16)) { + uint32_t ref_section_vram = context.get_reference_section_vram(reloc.target_section); uint32_t full_immediate = reloc.target_section_offset + ref_section_vram; if (reloc_type == N64Recomp::RelocType::R_MIPS_HI16) { @@ -206,13 +210,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun } } - auto print_line = [&](fmt::format_string fmt_str, Ts ...args) { - print_indent(); - fmt::vprint(output_file, fmt_str, fmt::make_format_args(args...)); - fmt::print(output_file, ";\n"); - }; - - auto print_unconditional_branch = [&](fmt::format_string fmt_str, Ts ...args) { + auto process_delay_slot = [&](bool use_indent) { if (instr_index < instructions.size() - 1) { bool dummy_needs_link_branch; bool dummy_is_branch_likely; size_t next_reloc_index = reloc_index; uint32_t next_vram = instr_vram + 4; if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { next_reloc_index++; } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, false, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { + if (!process_instruction(generator, context, func, stats, jtbl_lw_instructions, instr_index + 1, instructions, output_file, use_indent, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { return false; } } - print_indent(); - fmt::vprint(output_file, fmt_str, fmt::make_format_args(args...)); - if (needs_link_branch) { - fmt::print(output_file, ";\n goto after_{};\n", link_branch_index); - } else { - fmt::print(output_file, ";\n"); - } return true; }; - auto print_func_call = [reloc_target_section_offset, reloc_section, reloc_reference_symbol, reloc_type, &context, &section, &func, &static_funcs_out, &needs_link_branch, &print_unconditional_branch] - (uint32_t target_func_vram, bool link_branch = true, bool indent = false) + auto print_link_branch = [&]() { + if (needs_link_branch) { + print_indent(); + generator.emit_goto(fmt::format("after_{}", link_branch_index)); + } + }; + + auto print_return_with_delay_slot = 
[&]() { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_return(); + print_link_branch(); + return true; + }; + + auto print_goto_with_delay_slot = [&](const std::string& target) { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_goto(target); + print_link_branch(); + return true; + }; + + auto print_func_call_by_register = [&](int reg) { + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_function_call_by_register(reg); + print_link_branch(); + return true; + }; + + auto print_func_call_by_address = [&generator, reloc_target_section_offset, reloc_section, reloc_reference_symbol, reloc_type, &context, &func, &static_funcs_out, &needs_link_branch, &print_indent, &process_delay_slot, &print_link_branch] + (uint32_t target_func_vram, bool tail_call = false, bool indent = false) { + bool call_by_lookup = false; + bool call_by_name = false; // Event symbol, emit a call to the runtime to trigger this event. if (reloc_section == N64Recomp::SectionEvent) { - needs_link_branch = link_branch; + needs_link_branch = !tail_call; if (indent) { - if (!print_unconditional_branch(" recomp_trigger_event(rdram, ctx, base_event_index + {})", reloc_reference_symbol)) { - return false; - } - } else { - if (!print_unconditional_branch("recomp_trigger_event(rdram, ctx, base_event_index + {})", reloc_reference_symbol)) { - return false; - } + print_indent(); } + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + generator.emit_trigger_event((uint32_t)reloc_reference_symbol); + print_link_branch(); } // Normal symbol or reference symbol, else { std::string jal_target_name{}; + size_t matched_func_index = (size_t)-1; if (reloc_reference_symbol != (size_t)-1) { - const auto& ref_symbol = context.get_reference_symbol(reloc_section, reloc_reference_symbol); - if (reloc_type != N64Recomp::RelocType::R_MIPS_26) { fmt::print(stderr, "Unsupported reloc type {} on jal instruction in {}\n", (int)reloc_type, func.name); return false; } - if (ref_symbol.section_offset != reloc_target_section_offset) { - fmt::print(stderr, "Function {} uses a MIPS_R_26 addend, which is not supported yet\n", func.name); - return false; + if (!context.skip_validating_reference_symbols) { + const auto& ref_symbol = context.get_reference_symbol(reloc_section, reloc_reference_symbol); + if (ref_symbol.section_offset != reloc_target_section_offset) { + fmt::print(stderr, "Function {} uses a MIPS_R_26 addend, which is not supported yet\n", func.name); + return false; + } } - - jal_target_name = ref_symbol.name; } else { - size_t matched_func_index = 0; JalResolutionResult jal_result = resolve_jal(context, func.section_index, target_func_vram, matched_func_index); switch (jal_result) { @@ -284,65 +313,78 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // Create a static function add it to the static function list for this section. 
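// Hypothetical example of this path: a jal targeting 0x80123456 from section 3 with no
// matching symbol resolves to the synthesized name below, e.g. "static_3_80123456", and
// the target vram is pushed into static_funcs_out so the stub is recompiled afterwards
// (see the static-function handling in main.cpp earlier in this patch).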
jal_target_name = fmt::format("static_{}_{:08X}", func.section_index, target_func_vram); static_funcs_out[func.section_index].push_back(target_func_vram); + call_by_name = true; break; case JalResolutionResult::Ambiguous: fmt::print(stderr, "[Info] Ambiguous jal target 0x{:08X} in function {}, falling back to function lookup\n", target_func_vram, func.name); // Relocation isn't necessary for jumps inside a relocatable section, as this code path will never run if the target vram // is in the current function's section (see the branch for `in_current_section` above). // If a game ever needs to jump between multiple relocatable sections, relocation will be necessary here. - jal_target_name = fmt::format("LOOKUP_FUNC(0x{:08X})", target_func_vram); + call_by_lookup = true; break; case JalResolutionResult::Error: fmt::print(stderr, "Internal error when resolving jal to address 0x{:08X} in function {}. Please report this issue.\n", target_func_vram, func.name); return false; } } - needs_link_branch = link_branch; + needs_link_branch = !tail_call; if (indent) { - if (!print_unconditional_branch(" {}(rdram, ctx)", jal_target_name)) { - return false; - } - } else { - if (!print_unconditional_branch("{}(rdram, ctx)", jal_target_name)) { - return false; - } + print_indent(); } + if (!process_delay_slot(false)) { + return false; + } + print_indent(); + if (reloc_reference_symbol != (size_t)-1) { + generator.emit_function_call_reference_symbol(context, reloc_section, reloc_reference_symbol, reloc_target_section_offset); + } + else if (call_by_lookup) { + generator.emit_function_call_lookup(target_func_vram); + } + else if (call_by_name) { + generator.emit_named_function_call(jal_target_name); + } + else { + generator.emit_function_call(context, matched_func_index); + } + print_link_branch(); } return true; }; auto print_branch = [&](uint32_t branch_target) { + // If the branch target is outside the current function, check if it can be treated as a tail call. if (branch_target < func.vram || branch_target >= func_vram_end) { + // If the branch target is the start of some known function, this can be handled as a tail call. // FIXME: how to deal with static functions? if (context.functions_by_vram.find(branch_target) != context.functions_by_vram.end()) { fmt::print("Tail call in {} to 0x{:08X}\n", func.name, branch_target); - if (!print_func_call(branch_target, false, true)) { + if (!print_func_call_by_address(branch_target, true, true)) { return false; } - print_line(" return"); - fmt::print(output_file, " }}\n"); + print_indent(); + generator.emit_return(); + // TODO check if this branch close should exist. 
+ // print_indent(); + // generator.emit_branch_close(); return true; } fmt::print(stderr, "[Warn] Function {} is branching outside of the function (to 0x{:08X})\n", func.name, branch_target); } - if (instr_index < instructions.size() - 1) { - bool dummy_needs_link_branch; - bool dummy_is_branch_likely; - size_t next_reloc_index = reloc_index; - uint32_t next_vram = instr_vram + 4; - if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) { - next_reloc_index++; - } - if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, true, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) { - return false; - } + if (!process_delay_slot(true)) { + return false; } - fmt::print(output_file, " goto L_{:08X};\n", branch_target); + print_indent(); + print_indent(); + generator.emit_goto(fmt::format("L_{:08X}", branch_target)); + // TODO check if this link branch ever exists. if (needs_link_branch) { - fmt::print(output_file, " goto after_{};\n", link_branch_index); + print_indent(); + print_indent(); + generator.emit_goto(fmt::format("after_{}", link_branch_index)); } return true; }; @@ -353,7 +395,6 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun int rd = (int)instr.GetO32_rd(); int rs = (int)instr.GetO32_rs(); - int base = rs; int rt = (int)instr.GetO32_rt(); int sa = (int)instr.Get_sa(); @@ -365,7 +406,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun bool handled = true; - switch (instr.getUniqueId()) { + switch (instr_id) { case InstrId::cpu_nop: fmt::print(output_file, "\n"); break; @@ -375,7 +416,8 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun Cop0Reg reg = instr.Get_cop0d(); switch (reg) { case Cop0Reg::COP0_Status: - print_line("{}{} = cop0_status_read(ctx)", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop0_status_read(rt); break; default: fmt::print(stderr, "Unhandled cop0 register in mfc0: {}\n", (int)reg); @@ -388,7 +430,8 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun Cop0Reg reg = instr.Get_cop0d(); switch (reg) { case Cop0Reg::COP0_Status: - print_line("cop0_status_write(ctx, {}{})", ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_cop0_status_write(rt); break; default: fmt::print(stderr, "Unhandled cop0 register in mtc0: {}\n", (int)reg); @@ -408,38 +451,25 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // If so, create a temp to preserve the addend register's value if (find_result != stats.jump_tables.end()) { const N64Recomp::JumpTable& cur_jtbl = *find_result; - print_line("gpr jr_addend_{:08X} = {}{}", cur_jtbl.jr_vram, ctx_gpr_prefix(cur_jtbl.addend_reg), cur_jtbl.addend_reg); + print_indent(); + generator.emit_jtbl_addend_declaration(cur_jtbl, cur_jtbl.addend_reg); } } break; case InstrId::cpu_mult: - print_line("result = S64(S32({}{})) * S64(S32({}{})); lo = S32(result >> 0); hi = S32(result >> 32)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_dmult: - print_line("DMULT(S64({}{}), S64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_multu: - print_line("result = U64(U32({}{})) * U64(U32({}{})); lo = S32(result >> 0); hi = S32(result >> 32)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_dmultu: - 
print_line("DMULTU(U64({}{}), U64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_div: - // Cast to 64-bits before division to prevent artihmetic exception for s32(0x80000000) / -1 - print_line("lo = S32(S64(S32({}{})) / S64(S32({}{}))); hi = S32(S64(S32({}{})) % S64(S32({}{})))", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_ddiv: - print_line("DDIV(S64({}{}), S64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_divu: - print_line("lo = S32(U32({}{}) / U32({}{})); hi = S32(U32({}{}) % U32({}{}))", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt, ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); - break; case InstrId::cpu_ddivu: - print_line("DDIVU(U64({}{}), U64({}{}), &lo, &hi)", ctx_gpr_prefix(rs), rs, ctx_gpr_prefix(rt), rt); + print_indent(); + generator.emit_muldiv(instr_id, rs, rt); break; // Branches case InstrId::cpu_jal: - if (!print_func_call(instr.getBranchVramGeneric())) { + if (!print_func_call_by_address(instr.getBranchVramGeneric())) { return false; } break; @@ -450,18 +480,19 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun return false; } needs_link_branch = true; - print_unconditional_branch("LOOKUP_FUNC({}{})(rdram, ctx)", ctx_gpr_prefix(rs), rs); + print_func_call_by_register(rs); break; case InstrId::cpu_j: case InstrId::cpu_b: { uint32_t branch_target = instr.getBranchVramGeneric(); if (branch_target == instr_vram) { - print_line("pause_self(rdram)"); + print_indent(); + generator.emit_pause_self(); } // Check if the branch is within this function else if (branch_target >= func.vram && branch_target < func_vram_end) { - print_unconditional_branch("goto L_{:08X}", branch_target); + print_goto_with_delay_slot(fmt::format("L_{:08X}", branch_target)); } // This may be a tail call in the middle of the control flow due to a previous check // For example: @@ -476,11 +507,12 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun // ``` // FIXME: how to deal with static functions? 
+            print_func_call_by_register(rs);
            break;
        case InstrId::cpu_j:
        case InstrId::cpu_b: {
            uint32_t branch_target = instr.getBranchVramGeneric();
            if (branch_target == instr_vram) {
-                print_line("pause_self(rdram)");
+                print_indent();
+                generator.emit_pause_self();
            }
            // Check if the branch is within this function
            else if (branch_target >= func.vram && branch_target < func_vram_end) {
-                print_unconditional_branch("goto L_{:08X}", branch_target);
+                print_goto_with_delay_slot(fmt::format("L_{:08X}", branch_target));
            }
            // This may be a tail call in the middle of the control flow due to a previous check
            // For example:
@@ -476,11 +507,12 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
            // ```
            // FIXME: how to deal with static functions?
            else if (context.functions_by_vram.find(branch_target) != context.functions_by_vram.end()) {
-                fmt::print("Tail call in {} to 0x{:08X}\n", func.name, branch_target);
-                if (!print_func_call(branch_target, false)) {
+                fmt::print("[Info] Tail call in {} to 0x{:08X}\n", func.name, branch_target);
+                if (!print_func_call_by_address(branch_target, true)) {
                    return false;
                }
-                print_line("return");
+                print_indent();
+                generator.emit_return();
            }
            else {
                fmt::print(stderr, "Unhandled branch in {} at 0x{:08X} to 0x{:08X}\n", func.name, instr_vram, branch_target);
@@ -490,7 +522,7 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
            break;
        case InstrId::cpu_jr:
            if (rs == (int)rabbitizer::Registers::Cpu::GprO32::GPR_O32_ra) {
-                print_unconditional_branch("return");
+                print_return_with_delay_slot();
            } else {
                auto jtbl_find_result = std::find_if(stats.jump_tables.begin(), stats.jump_tables.end(),
                    [instr_vram](const N64Recomp::JumpTable& jtbl) {
@@ -499,58 +531,41 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
                if (jtbl_find_result != stats.jump_tables.end()) {
                    const N64Recomp::JumpTable& cur_jtbl = *jtbl_find_result;
-                    bool dummy_needs_link_branch, dummy_is_branch_likely;
-                    size_t next_reloc_index = reloc_index;
-                    uint32_t next_vram = instr_vram + 4;
-                    if (reloc_index + 1 < section.relocs.size() && next_vram > section.relocs[reloc_index].address) {
-                        next_reloc_index++;
-                    }
-                    if (!process_instruction(context, func, stats, skipped_insns, instr_index + 1, instructions, output_file, false, false, link_branch_index, next_reloc_index, dummy_needs_link_branch, dummy_is_branch_likely, tag_reference_relocs, static_funcs_out)) {
+                    if (!process_delay_slot(false)) {
                        return false;
                    }

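+                    // Lower the jr through its jump table into a switch over the table
+                    // index (the addend register captured at the table's lw, shifted by 2).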
                    print_indent();
-                    fmt::print(output_file, "switch (jr_addend_{:08X} >> 2) {{\n", cur_jtbl.jr_vram);
+                    generator.emit_switch(context, cur_jtbl, rs);
                    for (size_t entry_index = 0; entry_index < cur_jtbl.entries.size(); entry_index++) {
                        print_indent();
-                        print_line("case {}: goto L_{:08X}; break", entry_index, cur_jtbl.entries[entry_index]);
+                        print_indent();
+                        generator.emit_case(entry_index, fmt::format("L_{:08X}", cur_jtbl.entries[entry_index]));
                    }
                    print_indent();
-                    print_line("default: switch_error(__func__, 0x{:08X}, 0x{:08X})", instr_vram, cur_jtbl.vram);
                    print_indent();
-                    fmt::print(output_file, "}}\n");
+                    generator.emit_switch_error(instr_vram, cur_jtbl.vram);
+                    print_indent();
+                    generator.emit_switch_close();
                    break;
                }

-                auto jump_find_result = std::find_if(stats.absolute_jumps.begin(), stats.absolute_jumps.end(),
-                    [instr_vram](const N64Recomp::AbsoluteJump& jump) {
-                        return jump.instruction_vram == instr_vram;
-                    });
-
-                if (jump_find_result != stats.absolute_jumps.end()) {
-                    print_unconditional_branch("LOOKUP_FUNC({})(rdram, ctx)", (uint64_t)(int32_t)jump_find_result->jump_target);
-                    // jr doesn't link so it acts like a tail call, meaning we should return directly after the jump returns
-                    print_line("return");
-                    break;
-                }
-
-                bool is_tail_call = instr_vram == func_vram_end - 2 * sizeof(func.words[0]);
-                if (is_tail_call) {
-                    fmt::print("Indirect tail call in {}\n", func.name);
-                    print_unconditional_branch("LOOKUP_FUNC({}{})(rdram, ctx)", ctx_gpr_prefix(rs), rs);
-                    print_line("return");
-                    break;
-                }
-
-                fmt::print(stderr, "No jump table found for jr at 0x{:08X} and not tail call\n", instr_vram);
+                fmt::print("[Info] Indirect tail call in {}\n", func.name);
+                print_func_call_by_register(rs);
+                print_indent();
+                generator.emit_return();
+                break;
            }
            break;
        case InstrId::cpu_syscall:
-            print_line("recomp_syscall_handler(rdram, ctx, 0x{:08X})", instr_vram);
+            print_indent();
+            generator.emit_syscall(instr_vram);
            // syscalls don't link, so treat it like a tail call
-            print_line("return");
+            print_indent();
+            generator.emit_return();
            break;
        case InstrId::cpu_break:
-            print_line("do_break({})", instr_vram);
+            print_indent();
+            generator.emit_do_break(instr_vram);
            break;

        // Cop1 rounding mode
@@ -559,21 +574,22 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
                fmt::print(stderr, "Invalid FP control register for ctc1: {}\n", cop1_cs);
                return false;
            }
-            print_line("rounding_mode = ({}{}) & 0x3", ctx_gpr_prefix(rt), rt);
+            print_indent();
+            generator.emit_cop1_cs_write(rt);
            break;
        case InstrId::cpu_cfc1:
            if (cop1_cs != 31) {
                fmt::print(stderr, "Invalid FP control register for cfc1: {}\n", cop1_cs);
                return false;
            }
-            print_line("{}{} = rounding_mode", ctx_gpr_prefix(rt), rt);
+            print_indent();
+            generator.emit_cop1_cs_read(rt);
            break;
        default:
            handled = false;
            break;
    }

-    CGenerator generator{};
    InstructionContext instruction_context{};
    instruction_context.rd = rd;
    instruction_context.rs = rs;
@@ -589,28 +605,28 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
    instruction_context.reloc_section_index = reloc_section;
    instruction_context.reloc_target_section_offset = reloc_target_section_offset;

-    auto do_check_fr = [](std::ostream& output_file, const CGenerator& generator, const InstructionContext& ctx, Operand operand) {
+    auto do_check_fr = [](const GeneratorType& generator, const InstructionContext& ctx, Operand operand) {
        switch (operand) {
            case Operand::Fd:
            case Operand::FdDouble:
            case Operand::FdU32L:
            case Operand::FdU32H:
            case Operand::FdU64:
-                generator.emit_check_fr(output_file, ctx.fd);
+                generator.emit_check_fr(ctx.fd);
                break;
            case Operand::Fs:
            case Operand::FsDouble:
            case Operand::FsU32L:
            case Operand::FsU32H:
            case Operand::FsU64:
-                generator.emit_check_fr(output_file, ctx.fs);
+                generator.emit_check_fr(ctx.fs);
                break;
            case Operand::Ft:
            case Operand::FtDouble:
            case Operand::FtU32L:
            case Operand::FtU32H:
            case Operand::FtU64:
-                generator.emit_check_fr(output_file, ctx.ft);
+                generator.emit_check_fr(ctx.ft);
                break;
            default:
                // No MIPS3 float check needed for non-float operands.
@@ -618,25 +634,25 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
        }
    };

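+    // Emits a NaN check on a float operand before it is consumed, with separate
+    // single- and double-precision forms.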
-    auto do_check_nan = [](std::ostream& output_file, const CGenerator& generator, const InstructionContext& ctx, Operand operand) {
+    auto do_check_nan = [](const GeneratorType& generator, const InstructionContext& ctx, Operand operand) {
        switch (operand) {
            case Operand::Fd:
-                generator.emit_check_nan(output_file, ctx.fd, false);
+                generator.emit_check_nan(ctx.fd, false);
                break;
            case Operand::Fs:
-                generator.emit_check_nan(output_file, ctx.fs, false);
+                generator.emit_check_nan(ctx.fs, false);
                break;
            case Operand::Ft:
-                generator.emit_check_nan(output_file, ctx.ft, false);
+                generator.emit_check_nan(ctx.ft, false);
                break;
            case Operand::FdDouble:
-                generator.emit_check_nan(output_file, ctx.fd, true);
+                generator.emit_check_nan(ctx.fd, true);
                break;
            case Operand::FsDouble:
-                generator.emit_check_nan(output_file, ctx.fs, true);
+                generator.emit_check_nan(ctx.fs, true);
                break;
            case Operand::FtDouble:
-                generator.emit_check_nan(output_file, ctx.ft, true);
+                generator.emit_check_nan(ctx.ft, true);
                break;
            default:
                // No NaN checks needed for non-float operands.
@@ -644,54 +660,58 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
        }
    };

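+    // Table-driven lowering: most ALU and FPU instructions are described by entries in
+    // the binary_ops/unary_ops maps, so they share these generic emission paths instead
+    // of bespoke switch cases.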
-    auto find_binary_it = binary_ops.find(instr.getUniqueId());
+    auto find_binary_it = binary_ops.find(instr_id);
    if (find_binary_it != binary_ops.end()) {
        print_indent();
        const BinaryOp& op = find_binary_it->second;

        if (op.check_fr) {
-            do_check_fr(output_file, generator, instruction_context, op.output);
-            do_check_fr(output_file, generator, instruction_context, op.operands.operands[0]);
-            do_check_fr(output_file, generator, instruction_context, op.operands.operands[1]);
+            do_check_fr(generator, instruction_context, op.output);
+            do_check_fr(generator, instruction_context, op.operands.operands[0]);
+            do_check_fr(generator, instruction_context, op.operands.operands[1]);
        }

        if (op.check_nan) {
-            do_check_nan(output_file, generator, instruction_context, op.operands.operands[0]);
-            do_check_nan(output_file, generator, instruction_context, op.operands.operands[1]);
-            fmt::print(output_file, "\n    ");
+            do_check_nan(generator, instruction_context, op.operands.operands[0]);
+            do_check_nan(generator, instruction_context, op.operands.operands[1]);
+            fmt::print(output_file, "\n");
+            print_indent();
        }

-        generator.process_binary_op(output_file, op, instruction_context);
+        generator.process_binary_op(op, instruction_context);
        handled = true;
    }

-    auto find_unary_it = unary_ops.find(instr.getUniqueId());
+    auto find_unary_it = unary_ops.find(instr_id);
    if (find_unary_it != unary_ops.end()) {
        print_indent();
        const UnaryOp& op = find_unary_it->second;

        if (op.check_fr) {
-            do_check_fr(output_file, generator, instruction_context, op.output);
-            do_check_fr(output_file, generator, instruction_context, op.input);
+            do_check_fr(generator, instruction_context, op.output);
+            do_check_fr(generator, instruction_context, op.input);
        }

        if (op.check_nan) {
-            do_check_nan(output_file, generator, instruction_context, op.input);
-            fmt::print(output_file, "\n    ");
+            do_check_nan(generator, instruction_context, op.input);
+            fmt::print(output_file, "\n");
+            print_indent();
        }

-        generator.process_unary_op(output_file, op, instruction_context);
+        generator.process_unary_op(op, instruction_context);
        handled = true;
    }

-    auto find_conditional_branch_it = conditional_branch_ops.find(instr.getUniqueId());
+    auto find_conditional_branch_it = conditional_branch_ops.find(instr_id);
    if (find_conditional_branch_it != conditional_branch_ops.end()) {
        print_indent();
-        generator.emit_branch_condition(output_file, find_conditional_branch_it->second, instruction_context);
+        // TODO combining the branch condition and branch target into one generator call would allow better optimization in the runtime's JIT generator.
+        // This would require splitting into a conditional jump method and conditional function call method.
+        generator.emit_branch_condition(find_conditional_branch_it->second, instruction_context);
        print_indent();

        if (find_conditional_branch_it->second.link) {
-            if (!print_func_call(instr.getBranchVramGeneric())) {
+            if (!print_func_call_by_address(instr.getBranchVramGeneric())) {
                return false;
            }
        }
@@ -701,22 +721,23 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun
            }
        }

-        generator.emit_branch_close(output_file);
+        print_indent();
+        generator.emit_branch_close();

        is_branch_likely = find_conditional_branch_it->second.likely;
        handled = true;
    }

-    auto find_store_it = store_ops.find(instr.getUniqueId());
+    auto find_store_it = store_ops.find(instr_id);
    if (find_store_it != store_ops.end()) {
        print_indent();
        const StoreOp& op = find_store_it->second;

        if (op.type == StoreOpType::SDC1) {
-            do_check_fr(output_file, generator, instruction_context, op.value_input);
+            do_check_fr(generator, instruction_context, op.value_input);
        }

-        generator.process_store_op(output_file, op, instruction_context);
+        generator.process_store_op(op, instruction_context);
        handled = true;
    }

@@ -727,23 +748,20 @@ bool process_instruction(const N64Recomp::Context& context, const N64Recomp::Fun

    // TODO is this used?
    if (emit_link_branch) {
-        fmt::print(output_file, "    after_{}:\n", link_branch_index);
+        print_indent();
+        generator.emit_label(fmt::format("after_{}", link_branch_index));
    }

    return true;
}

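+// The recompilation body is templated over the generator type so the same
+// instruction-processing logic can drive both the C source generator and custom
+// generators (e.g. the runtime's JIT generator).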
-bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64Recomp::Function& func, std::ofstream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+template <typename GeneratorType>
+bool recompile_function_impl(GeneratorType& generator, const N64Recomp::Context& context, size_t func_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    const N64Recomp::Function& func = context.functions[func_index];
    //fmt::print("Recompiling {}\n", func.name);
    std::vector<rabbitizer::InstructionCpu> instructions;

-    fmt::print(output_file,
-        "RECOMP_FUNC void {}(uint8_t* rdram, recomp_context* ctx) {{\n"
-        // these variables shouldn't need to be preserved across function boundaries, so make them local for more efficient output
-        "    uint64_t hi = 0, lo = 0, result = 0;\n"
-        "    unsigned int rounding_mode = DEFAULT_ROUNDING_MODE;\n"
-        "    int c1cs = 0;\n", // cop1 conditional signal
-        func.name);
+    generator.emit_function_start(func.name, func_index);

    if (context.trace_mode) {
        fmt::print(output_file,
@@ -784,11 +802,11 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        return false;
    }

-    std::unordered_set<uint32_t> skipped_insns{};
+    std::unordered_set<uint32_t> jtbl_lw_instructions{};

    // Add jump table labels into function
    for (const auto& jtbl : stats.jump_tables) {
-        skipped_insns.insert(jtbl.lw_vram);
+        jtbl_lw_instructions.insert(jtbl.lw_vram);
        for (uint32_t jtbl_entry : jtbl.entries) {
            branch_labels.insert(jtbl_entry);
        }
@@ -808,11 +826,11 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        bool is_branch_likely = false;
        // If we're in the delay slot of a likely instruction, emit a goto to skip the instruction before any labels
        if (in_likely_delay_slot) {
-            fmt::print(output_file, "    goto skip_{};\n", num_likely_branches);
+            generator.emit_goto(fmt::format("skip_{}", num_likely_branches));
        }
        // If there are any other branch labels to insert and we're at the next one, insert it
        if (cur_label != branch_labels.end() && vram >= *cur_label) {
-            fmt::print(output_file, "L_{:08X}:\n", *cur_label);
+            generator.emit_label(fmt::format("L_{:08X}", *cur_label));
            ++cur_label;
        }
@@ -822,7 +840,7 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        }

        // Process the current instruction and check for errors
-        if (process_instruction(context, func, stats, skipped_insns, instr_index, instructions, output_file, false, needs_link_branch, num_link_branches, reloc_index, needs_link_branch, is_branch_likely, tag_reference_relocs, static_funcs_out) == false) {
+        if (process_instruction(generator, context, func, stats, jtbl_lw_instructions, instr_index, instructions, output_file, false, needs_link_branch, num_link_branches, reloc_index, needs_link_branch, is_branch_likely, tag_reference_relocs, static_funcs_out) == false) {
            fmt::print(stderr, "Error in recompiling {}, clearing output file\n", func.name);
            output_file.clear();
            return false;
@@ -833,7 +851,8 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
        }
        // Now that the instruction has been processed, emit a skip label for the likely branch if needed
        if (in_likely_delay_slot) {
-            fmt::print(output_file, "    skip_{}:\n", num_likely_branches);
+            fmt::print(output_file, "    ");
+            generator.emit_label(fmt::format("skip_{}", num_likely_branches));
            num_likely_branches++;
        }
        // Mark the next instruction as being in a likely delay slot if the
@@ -844,7 +863,17 @@ bool N64Recomp::recompile_function(const N64Recomp::Context& context, const N64R
    }

    // Terminate the function
-    fmt::print(output_file, ";}}\n");
+    generator.emit_function_end();

    return true;
}
+
+// Wrap the templated function with CGenerator as the template parameter.
+bool N64Recomp::recompile_function(const N64Recomp::Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    CGenerator generator{output_file};
+    return recompile_function_impl(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs);
+}
+
+bool N64Recomp::recompile_function_custom(Generator& generator, const Context& context, size_t function_index, std::ostream& output_file, std::span<std::vector<uint32_t>> static_funcs_out, bool tag_reference_relocs) {
+    return recompile_function_impl(generator, context, function_index, output_file, static_funcs_out, tag_reference_relocs);
+}
diff --git a/src/symbol_lists.cpp b/src/symbol_lists.cpp
index 4b4eff9..cbe5ff5 100644
--- a/src/symbol_lists.cpp
+++ b/src/symbol_lists.cpp
@@ -1,4 +1,4 @@
-#include "n64recomp.h"
+#include "recompiler/context.h"

 const std::unordered_set<std::string> N64Recomp::reimplemented_funcs {
     // OS initialize functions
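
A minimal sketch of how the two entry points above might be driven, for reference only.
MyGenerator is a hypothetical stand-in for any implementation of the generator interface,
and the surrounding setup (a populated N64Recomp::Context) is assumed to exist in the
caller; none of this is part of the patch itself:

    #include <fstream>
    #include "recompiler/context.h"

    // Recompiles every function in an already-populated context.
    void recompile_all(const N64Recomp::Context& context, MyGenerator& generator) {
        std::ofstream out{"funcs.c"};
        // One static-function list per section, indexed by func.section_index above.
        std::vector<std::vector<uint32_t>> static_funcs(context.sections.size());
        for (size_t func_index = 0; func_index < context.functions.size(); func_index++) {
            // C source output (the pre-existing path, now index-based):
            N64Recomp::recompile_function(context, func_index, out, static_funcs, true);
            // Or drive the same logic with a custom generator (new in this patch):
            N64Recomp::recompile_function_custom(generator, context, func_index, out, static_funcs, true);
        }
    }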