Commit 350ad90cee

David Rubin <daviru007@icloud.com>
2024-03-28 23:59:28
riscv: totally rewrite how we do loads and stores
this commit is a little too large to document fully, however the main gist of it is this - finish the `genInlineMemcpy` implementation - rename `setValue` to `genCopy` as I agree with jacob that it's a better name - add in `genVarDbgInfo` for a better gdb experience - follow the x86_64's method for genCall, as the procedure is very similar for us - add `airSliceLen` as it's trivial - change up the `airAddWithOverflow` implementation a bit - make sure to not spill if the elem_ty is 0 size - correctly follow the RISC-V calling convention and spill the used callee-saved registers in the prologue and restore them in the epilogue - add `address`, `deref`, and `offset` helper functions for MCValue. I must say I love these, they make the code very readable and super verbose :) - fix a `register_manager.zig` issue where when using the last register in the set, the value would overflow at comptime. This was happening because we were adding to `max_id` before subtracting from it.
1 parent cbf62bd
lib/std/builtin.zig
@@ -775,14 +775,14 @@ pub fn default_panic(msg: []const u8, error_return_trace: ?*StackTrace, ret_addr
     }
 
     if (builtin.zig_backend == .stage2_riscv64) {
-        asm volatile ("ecall"
-            :
-            : [number] "{a7}" (64),
-              [arg1] "{a0}" (1),
-              [arg2] "{a1}" (@intFromPtr(msg.ptr)),
-              [arg3] "{a2}" (msg.len),
-            : "rcx", "r11", "memory"
-        );
+        // asm volatile ("ecall"
+        //     :
+        //     : [number] "{a7}" (64),
+        //       [arg1] "{a0}" (1),
+        //       [arg2] "{a1}" (@intFromPtr(msg.ptr)),
+        //       [arg3] "{a2}" (msg.len),
+        //     : "rcx", "r11", "memory"
+        // );
         std.posix.exit(127);
     }
 
src/arch/riscv64/abi.zig
@@ -92,15 +92,18 @@ pub fn classifyType(ty: Type, mod: *Module) Class {
 }
 
 pub const callee_preserved_regs = [_]Register{
-    // NOTE: we use s0 as a psuedo stack pointer, so it's not included.
-    .s1, .s2, .s3, .s4, .s5, .s6, .s7, .s8, .s9, .s10, .s11,
+    .s0, .s1, .s2, .s3, .s4, .s5, .s6, .s7, .s8, .s9, .s10, .s11,
 };
 
 pub const function_arg_regs = [_]Register{
     .a0, .a1, .a2, .a3, .a4, .a5, .a6, .a7,
 };
 
-const allocatable_registers = callee_preserved_regs ++ function_arg_regs;
+pub const temporary_regs = [_]Register{
+    .t0, .t1, .t2, .t3, .t4, .t5, .t6,
+};
+
+const allocatable_registers = callee_preserved_regs ++ function_arg_regs ++ temporary_regs;
 pub const RegisterManager = RegisterManagerFn(@import("CodeGen.zig"), Register, &allocatable_registers);
 
 // Register classes
@@ -123,4 +126,13 @@ pub const RegisterClass = struct {
         }, true);
         break :blk set;
     };
+
+    pub const tp: RegisterBitSet = blk: {
+        var set = RegisterBitSet.initEmpty();
+        set.setRangeValue(.{
+            .start = callee_preserved_regs.len + function_arg_regs.len,
+            .end = callee_preserved_regs.len + function_arg_regs.len + temporary_regs.len,
+        }, true);
+        break :blk set;
+    };
 };
src/arch/riscv64/bits.zig
@@ -404,14 +404,14 @@ pub const Register = enum(u6) {
     t3, t4, t5, t6, // caller saved
     // zig fmt: on
 
-    /// Returns the unique 4-bit ID of this register which is used in
+    /// Returns the unique 5-bit ID of this register which is used in
     /// the machine code
     pub fn id(self: Register) u5 {
         return @as(u5, @truncate(@intFromEnum(self)));
     }
 
     pub fn dwarfLocOp(reg: Register) u8 {
-        return @as(u8, reg.id()) + DW.OP.reg0;
+        return @as(u8, reg.id());
     }
 };
 
src/arch/riscv64/CodeGen.zig
@@ -38,9 +38,16 @@ const callee_preserved_regs = abi.callee_preserved_regs;
 const gp = abi.RegisterClass.gp;
 /// Function Args
 const fa = abi.RegisterClass.fa;
+/// Temporary Use
+const tp = abi.RegisterClass.tp;
 
 const InnerError = CodeGenError || error{OutOfRegisters};
 
+const RegisterView = enum(u1) {
+    caller,
+    callee,
+};
+
 gpa: Allocator,
 air: Air,
 liveness: Liveness,
@@ -82,8 +89,8 @@ branch_stack: *std.ArrayList(Branch),
 
 // Key is the block instruction
 blocks: std.AutoHashMapUnmanaged(Air.Inst.Index, BlockData) = .{},
-
 register_manager: RegisterManager = .{},
+
 /// Maps offset to what is stored there.
 stack: std.AutoHashMapUnmanaged(u32, StackAllocation) = .{},
 
@@ -99,6 +106,7 @@ air_bookkeeping: @TypeOf(air_bookkeeping_init) = air_bookkeeping_init,
 const air_bookkeeping_init = if (std.debug.runtime_safety) @as(usize, 0) else {};
 
 const SymbolOffset = struct { sym: u32, off: i32 = 0 };
+const RegisterOffset = struct { reg: Register, off: i32 = 0 };
 
 const MCValue = union(enum) {
     /// No runtime bits. `void` types, empty structs, u0, enums with 1 tag, etc.
@@ -119,6 +127,8 @@ const MCValue = union(enum) {
     load_symbol: SymbolOffset,
     /// The value is in a target-specific register.
     register: Register,
+    /// The value is split across two registers
+    register_pair: [2]Register,
     /// The value is in memory at a hard-coded address.
     /// If the type is a pointer, it means the pointer address is at this memory location.
     memory: u64,
@@ -127,10 +137,15 @@ const MCValue = union(enum) {
     stack_offset: u32,
     /// The value is a pointer to one of the stack variables (payload is stack offset).
     ptr_stack_offset: u32,
+    air_ref: Air.Inst.Ref,
+    /// The value is in memory at a constant offset from the address in a register.
+    indirect: RegisterOffset,
+    /// The value is a constant offset from the value in a register.
+    register_offset: RegisterOffset,
 
     fn isMemory(mcv: MCValue) bool {
         return switch (mcv) {
-            .memory, .stack_offset => true,
+            .memory, .indirect, .load_frame => true,
             else => false,
         };
     }
@@ -151,15 +166,85 @@ const MCValue = union(enum) {
             .immediate,
             .memory,
             .ptr_stack_offset,
+            .indirect,
             .undef,
             .load_symbol,
+            .air_ref,
             => false,
 
             .register,
+            .register_pair,
+            .register_offset,
             .stack_offset,
             => true,
         };
     }
+
+    fn address(mcv: MCValue) MCValue {
+        return switch (mcv) {
+            .none,
+            .unreach,
+            .dead,
+            .immediate,
+            .ptr_stack_offset,
+            .register_offset,
+            .undef,
+            .air_ref,
+            => unreachable, // not in memory
+
+            .memory => |addr| .{ .immediate = addr },
+            .stack_offset => |off| .{ .ptr_stack_offset = off },
+            .indirect => |reg_off| switch (reg_off.off) {
+                0 => .{ .register = reg_off.reg },
+                else => .{ .register_offset = reg_off },
+            },
+        };
+    }
+
+    fn deref(mcv: MCValue) MCValue {
+        return switch (mcv) {
+            .none,
+            .unreach,
+            .dead,
+            .memory,
+            .indirect,
+            .undef,
+            .air_ref,
+            .stack_offset,
+            .register_pair,
+            .load_symbol,
+            => unreachable, // not a pointer
+
+            .immediate => |addr| .{ .memory = addr },
+            .ptr_stack_offset => |off| .{ .stack_offset = off },
+            .register => |reg| .{ .indirect = .{ .reg = reg } },
+            .register_offset => |reg_off| .{ .indirect = reg_off },
+        };
+    }
+
+    fn offset(mcv: MCValue, off: i32) MCValue {
+        return switch (mcv) {
+            .none,
+            .unreach,
+            .dead,
+            .undef,
+            .air_ref,
+            => unreachable, // not valid
+            .register_pair,
+            .memory,
+            .indirect,
+            .stack_offset,
+            .load_symbol,
+            => switch (off) {
+                0 => mcv,
+                else => unreachable, // not offsettable
+            },
+            .immediate => |imm| .{ .immediate = @bitCast(@as(i64, @bitCast(imm)) +% off) },
+            .register => |reg| .{ .register_offset = .{ .reg = reg, .off = off } },
+            .register_offset => |reg_off| .{ .register_offset = .{ .reg = reg_off.reg, .off = reg_off.off + off } },
+            .ptr_stack_offset => |stack_off| .{ .ptr_stack_offset = @intCast(@as(i64, @intCast(stack_off)) +% off) },
+        };
+    }
 };
 
 const Branch = struct {
@@ -211,6 +296,11 @@ const BigTomb = struct {
 
 const Self = @This();
 
+const CallView = enum(u1) {
+    callee,
+    caller,
+};
+
 pub fn generate(
     lf: *link.File,
     src_loc: Module.SrcLoc,
@@ -261,7 +351,7 @@ pub fn generate(
     defer function.blocks.deinit(gpa);
     defer function.exitlude_jump_relocs.deinit(gpa);
 
-    var call_info = function.resolveCallingConventionValues(fn_type) catch |err| switch (err) {
+    var call_info = function.resolveCallingConventionValues(fn_type, .callee) catch |err| switch (err) {
         error.CodegenFail => return Result{ .fail = function.err_msg.? },
         error.OutOfRegisters => return Result{
             .fail = try ErrorMsg.create(gpa, src_loc, "CodeGen ran out of registers. This is a bug in the Zig compiler.", .{}),
@@ -284,6 +374,14 @@ pub fn generate(
         else => |e| return e,
     };
 
+    // Create list of registers to save in the prologue.
+    var save_reg_list = Mir.RegisterList{};
+    for (callee_preserved_regs) |reg| {
+        if (function.register_manager.isRegAllocated(reg)) {
+            save_reg_list.push(&callee_preserved_regs, reg);
+        }
+    }
+
     var mir = Mir{
         .instructions = function.mir_instructions.toOwnedSlice(),
         .extra = try function.mir_extra.toOwnedSlice(gpa),
@@ -300,8 +398,10 @@ pub fn generate(
         .prev_di_pc = 0,
         .prev_di_line = func.lbrace_line,
         .prev_di_column = func.lbrace_column,
-        .stack_size = @max(32, function.max_end_stack),
         .code_offset_mapping = .{},
+        // need to at least decrease the sp by -8
+        .stack_size = @max(8, mem.alignForward(u32, function.max_end_stack, 16)),
+        .save_reg_list = save_reg_list,
     };
     defer emit.deinit();
 
@@ -629,6 +729,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
     }
 }
 
+fn feed(self: *Self, bt: *Liveness.BigTomb, operand: Air.Inst.Ref) !void {
+    if (bt.feed()) if (operand.toIndex()) |inst| self.processDeath(inst);
+}
+
 /// Asserts there is already capacity to insert into top branch inst_table.
 fn processDeath(self: *Self, inst: Air.Inst.Index) void {
     // When editing this function, note that the logic must synchronize with `reuseOperand`.
@@ -639,7 +743,7 @@ fn processDeath(self: *Self, inst: Air.Inst.Index) void {
         .register => |reg| {
             self.register_manager.freeReg(reg);
         },
-        else => {}, // TODO process stack allocation death
+        else => {}, // TODO process stack allocation death by freeing it to be reused later
     }
 }
 
@@ -650,17 +754,11 @@ fn finishAirBookkeeping(self: *Self) void {
     }
 }
 
-fn finishAir(self: *Self, inst: Air.Inst.Index, result: MCValue, operands: [Liveness.bpi - 1]Air.Inst.Ref) void {
-    var tomb_bits = self.liveness.getTombBits(inst);
-    for (operands) |op| {
-        const dies = @as(u1, @truncate(tomb_bits)) != 0;
-        tomb_bits >>= 1;
-        if (!dies) continue;
-        const op_index = op.toIndex() orelse continue;
-        self.processDeath(op_index);
-    }
-    const is_used = @as(u1, @truncate(tomb_bits)) == 0;
-    if (is_used) {
+fn finishAirResult(self: *Self, inst: Air.Inst.Index, result: MCValue) void {
+    if (self.liveness.isUnused(inst)) switch (result) {
+        .none, .dead, .unreach => {},
+        else => unreachable, // Why didn't the result die?
+    } else {
         log.debug("%{d} => {}", .{ inst, result });
         const branch = &self.branch_stack.items[self.branch_stack.items.len - 1];
         branch.inst_table.putAssumeCapacityNoClobber(inst, result);
@@ -682,6 +780,22 @@ fn finishAir(self: *Self, inst: Air.Inst.Index, result: MCValue, operands: [Live
     self.finishAirBookkeeping();
 }
 
+fn finishAir(
+    self: *Self,
+    inst: Air.Inst.Index,
+    result: MCValue,
+    operands: [Liveness.bpi - 1]Air.Inst.Ref,
+) !void {
+    var tomb_bits = self.liveness.getTombBits(inst);
+    for (operands) |op| {
+        const dies = @as(u1, @truncate(tomb_bits)) != 0;
+        tomb_bits >>= 1;
+        if (!dies) continue;
+        self.processDeath(op.toIndexAllowNone() orelse continue);
+    }
+    self.finishAirResult(inst, result);
+}
+
 fn ensureProcessDeathCapacity(self: *Self, additional_count: usize) !void {
     const table = &self.branch_stack.items[self.branch_stack.items.len - 1].inst_table;
     try table.ensureUnusedCapacity(self.gpa, additional_count);
@@ -716,6 +830,7 @@ fn allocMemPtr(self: *Self, inst: Air.Inst.Index) !u32 {
 fn allocRegOrMem(self: *Self, inst: Air.Inst.Index, reg_ok: bool) !MCValue {
     const mod = self.bin_file.comp.module.?;
     const elem_ty = self.typeOfIndex(inst);
+
     const abi_size = math.cast(u32, elem_ty.abiSize(mod)) orelse {
         return self.fail("type '{}' too big to fit into stack frame", .{elem_ty.fmt(mod)});
     };
@@ -728,12 +843,12 @@ fn allocRegOrMem(self: *Self, inst: Air.Inst.Index, reg_ok: bool) !MCValue {
         const ptr_bytes: u64 = @divExact(ptr_bits, 8);
         if (abi_size <= ptr_bytes) {
             if (self.register_manager.tryAllocReg(inst, gp)) |reg| {
-                return MCValue{ .register = reg };
+                return .{ .register = reg };
             }
         }
     }
     const stack_offset = try self.allocMem(inst, abi_size, abi_align);
-    return MCValue{ .stack_offset = stack_offset };
+    return .{ .stack_offset = stack_offset };
 }
 
 /// Allocates a register from the general purpose set and returns the Register and the Lock.
@@ -746,6 +861,12 @@ fn allocReg(self: *Self) !struct { Register, RegisterLock } {
 }
 
 pub fn spillInstruction(self: *Self, reg: Register, inst: Air.Inst.Index) !void {
+    const mod = self.bin_file.comp.module.?;
+    const elem_ty = self.typeOfIndex(inst);
+
+    // there isn't anything to spill
+    if (!elem_ty.hasRuntimeBitsIgnoreComptime(mod)) return;
+
     const stack_mcv = try self.allocRegOrMem(inst, false);
     log.debug("spilling {d} to stack mcv {any}", .{ inst, stack_mcv });
     const reg_mcv = self.getResolvedInstValue(inst);
@@ -759,7 +880,7 @@ pub fn spillInstruction(self: *Self, reg: Register, inst: Air.Inst.Index) !void
 /// allocated. A second call to `copyToTmpRegister` may return the same register.
 /// This can have a side effect of spilling instructions to the stack to free up a register.
 fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register {
-    const reg = try self.register_manager.allocReg(null, gp);
+    const reg = try self.register_manager.allocReg(null, tp);
     try self.genSetReg(ty, reg, mcv);
     return reg;
 }
@@ -830,7 +951,7 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
             math.divCeil(u32, src_storage_bits, 64) catch unreachable and
             self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) src_mcv else dst: {
             const dst_mcv = try self.allocRegOrMem(inst, true);
-            try self.setValue(min_ty, dst_mcv, src_mcv);
+            try self.genCopy(min_ty, dst_mcv, src_mcv);
             break :dst dst_mcv;
         };
 
@@ -1261,43 +1382,48 @@ fn airAddWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
 
         if (int_info.bits >= 8 and math.isPowerOfTwo(int_info.bits)) {
             if (int_info.signedness == .unsigned) {
-                const overflow_offset = tuple_ty.structFieldOffset(1, mod) + offset;
-
-                const max_val = std.math.pow(u16, 2, int_info.bits) - 1;
-
-                const overflow_reg, const overflow_lock = try self.allocReg();
-                defer self.register_manager.unlockReg(overflow_lock);
+                switch (int_info.bits) {
+                    1...8 => {
+                        const max_val = std.math.pow(u16, 2, int_info.bits) - 1;
 
-                const add_reg, const add_lock = blk: {
-                    if (add_result_mcv == .register) break :blk .{ add_result_mcv.register, null };
-
-                    const add_reg, const add_lock = try self.allocReg();
-                    try self.genSetReg(lhs_ty, add_reg, add_result_mcv);
-                    break :blk .{ add_reg, add_lock };
-                };
-                defer if (add_lock) |lock| self.register_manager.unlockReg(lock);
-
-                _ = try self.addInst(.{
-                    .tag = .andi,
-                    .data = .{ .i_type = .{
-                        .rd = overflow_reg,
-                        .rs1 = add_reg,
-                        .imm12 = @intCast(max_val),
-                    } },
-                });
+                        const overflow_reg, const overflow_lock = try self.allocReg();
+                        defer self.register_manager.unlockReg(overflow_lock);
 
-                const overflow_mcv = try self.binOp(
-                    .cmp_neq,
-                    null,
-                    .{ .register = overflow_reg },
-                    .{ .register = add_reg },
-                    lhs_ty,
-                    lhs_ty,
-                );
+                        const add_reg, const add_lock = blk: {
+                            if (add_result_mcv == .register) break :blk .{ add_result_mcv.register, null };
 
-                try self.genSetStack(Type.u1, @intCast(overflow_offset), overflow_mcv);
+                            const add_reg, const add_lock = try self.allocReg();
+                            try self.genSetReg(lhs_ty, add_reg, add_result_mcv);
+                            break :blk .{ add_reg, add_lock };
+                        };
+                        defer if (add_lock) |lock| self.register_manager.unlockReg(lock);
+
+                        _ = try self.addInst(.{
+                            .tag = .andi,
+                            .data = .{ .i_type = .{
+                                .rd = overflow_reg,
+                                .rs1 = add_reg,
+                                .imm12 = @intCast(max_val),
+                            } },
+                        });
+
+                        const overflow_mcv = try self.binOp(
+                            .cmp_neq,
+                            null,
+                            .{ .register = overflow_reg },
+                            .{ .register = add_reg },
+                            lhs_ty,
+                            lhs_ty,
+                        );
+
+                        const overflow_offset = tuple_ty.structFieldOffset(1, mod) + offset;
+                        try self.genSetStack(Type.u1, @intCast(overflow_offset), overflow_mcv);
+
+                        break :result result_mcv;
+                    },
 
-                break :result result_mcv;
+                    else => return self.fail("TODO: addWithOverflow check for size {d}", .{int_info.bits}),
+                }
             } else {
                 return self.fail("TODO: airAddWithOverFlow calculate carry for signed addition", .{});
             }
@@ -1367,6 +1493,7 @@ fn airShl(self: *Self, inst: Air.Inst.Index) !void {
         const rhs = try self.resolveInst(bin_op.rhs);
         const lhs_ty = self.typeOf(bin_op.lhs);
         const rhs_ty = self.typeOf(bin_op.rhs);
+
         break :result try self.binOp(.shl, inst, lhs, rhs, lhs_ty, rhs_ty);
     };
     return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none });
@@ -1506,7 +1633,19 @@ fn slicePtr(self: *Self, mcv: MCValue) !MCValue {
 
 fn airSliceLen(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airSliceLen for {}", .{self.target.cpu.arch});
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else result: {
+        const ptr_bits = 64;
+        const ptr_bytes = @divExact(ptr_bits, 8);
+        const mcv = try self.resolveInst(ty_op.operand);
+        switch (mcv) {
+            .dead, .unreach, .none => unreachable,
+            .register => unreachable, // a slice doesn't fit in one register
+            .stack_offset => |off| {
+                break :result MCValue{ .stack_offset = off + ptr_bytes };
+            },
+            else => return self.fail("TODO airSliceLen for {}", .{mcv}),
+        }
+    };
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
@@ -1598,10 +1737,60 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airCtz for {}", .{self.target.cpu.arch});
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else result: {
+        const operand = try self.resolveInst(ty_op.operand);
+        const operand_ty = self.typeOf(ty_op.operand);
+
+        const dest_reg = try self.register_manager.allocReg(inst, gp);
+
+        const source_reg, const source_lock = blk: {
+            if (operand == .register) break :blk .{ operand.register, null };
+
+            const source_reg, const source_lock = try self.allocReg();
+            try self.genSetReg(operand_ty, source_reg, operand);
+            break :blk .{ source_reg, source_lock };
+        };
+        defer if (source_lock) |lock| self.register_manager.unlockReg(lock);
+
+        // TODO: the B extension for RISCV should have the ctz instruction, and we should use it.
+
+        try self.ctz(source_reg, dest_reg, operand_ty);
+
+        break :result .{ .register = dest_reg };
+    };
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
+fn ctz(self: *Self, src: Register, dst: Register, ty: Type) !void {
+    const mod = self.bin_file.comp.module.?;
+    const length = (ty.abiSize(mod) * 8) - 1;
+
+    const count_reg, const count_lock = try self.allocReg();
+    defer self.register_manager.unlockReg(count_lock);
+
+    const len_reg, const len_lock = try self.allocReg();
+    defer self.register_manager.unlockReg(len_lock);
+
+    try self.genSetReg(Type.usize, count_reg, .{ .immediate = 0 });
+    try self.genSetReg(Type.usize, len_reg, .{ .immediate = length });
+
+    _ = try self.addInst(.{
+        .tag = .beq,
+        .data = .{
+            .b_type = .{
+                .rs1 = count_reg,
+                .rs2 = len_reg,
+                .inst = @intCast(self.mir_instructions.len + 0),
+            },
+        },
+    });
+
+    _ = src;
+    _ = dst;
+
+    return self.fail("TODO: finish ctz", .{});
+}
+
 fn airPopcount(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airPopcount for {}", .{self.target.cpu.arch});
@@ -1750,12 +1939,12 @@ fn airLoad(self: *Self, inst: Air.Inst.Index) !void {
     const elem_ty = self.typeOfIndex(inst);
     const result: MCValue = result: {
         if (!elem_ty.hasRuntimeBits(mod))
-            break :result MCValue.none;
+            break :result .none;
 
         const ptr = try self.resolveInst(ty_op.operand);
         const is_volatile = self.typeOf(ty_op.operand).isVolatilePtr(mod);
         if (self.liveness.isUnused(inst) and !is_volatile)
-            break :result MCValue.dead;
+            break :result .dead;
 
         const dst_mcv: MCValue = blk: {
             if (self.reuseOperand(inst, ty_op.operand, 0, ptr)) {
@@ -1771,27 +1960,38 @@ fn airLoad(self: *Self, inst: Air.Inst.Index) !void {
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
-fn load(self: *Self, dst_mcv: MCValue, src_ptr: MCValue, ptr_ty: Type) InnerError!void {
+fn load(self: *Self, dst_mcv: MCValue, ptr_mcv: MCValue, ptr_ty: Type) InnerError!void {
     const mod = self.bin_file.comp.module.?;
-    const elem_ty = ptr_ty.childType(mod);
+    const dst_ty = ptr_ty.childType(mod);
 
-    switch (src_ptr) {
-        .none => unreachable,
-        .undef => unreachable,
-        .unreach => unreachable,
-        .dead => unreachable,
-        .immediate => |imm| try self.setValue(elem_ty, dst_mcv, .{ .memory = imm }),
-        .ptr_stack_offset => |off| try self.setValue(elem_ty, dst_mcv, .{ .stack_offset = off }),
+    log.debug("loading {}:{} into {}", .{ ptr_mcv, ptr_ty.fmt(mod), dst_mcv });
 
-        .stack_offset,
+    switch (ptr_mcv) {
+        .none,
+        .undef,
+        .unreach,
+        .dead,
+        .register_pair,
+        => unreachable, // not a valid pointer
+
+        .immediate,
         .register,
+        .register_offset,
+        .ptr_stack_offset,
+        => try self.genCopy(dst_ty, dst_mcv, ptr_mcv.deref()),
+
         .memory,
-        => try self.setValue(elem_ty, dst_mcv, src_ptr),
+        .indirect,
+        .load_symbol,
+        .stack_offset,
+        => {
+            const addr_reg = try self.copyToTmpRegister(ptr_ty, ptr_mcv);
+            const addr_lock = self.register_manager.lockRegAssumeUnused(addr_reg);
+            defer self.register_manager.unlockReg(addr_lock);
 
-        .load_symbol => {
-            const reg = try self.copyToTmpRegister(ptr_ty, src_ptr);
-            try self.load(dst_mcv, .{ .register = reg }, ptr_ty);
+            try self.genCopy(dst_ty, dst_mcv, .{ .indirect = .{ .reg = addr_reg } });
         },
+        .air_ref => |ptr_ref| try self.load(dst_mcv, try self.resolveInst(ptr_ref), ptr_ty),
     }
 }
 
@@ -1817,7 +2017,12 @@ fn store(self: *Self, pointer: MCValue, value: MCValue, ptr_ty: Type, value_ty:
     const mod = self.bin_file.comp.module.?;
     const value_abi_size = value_ty.abiSize(mod);
 
-    log.debug("storing {s}", .{@tagName(pointer)});
+    log.debug("storing {}:{} in {}:{}", .{ value, value_ty.fmt(mod), pointer, ptr_ty.fmt(mod) });
+
+    if (value_ty.isSlice(mod)) {
+        // cheat a bit by loading in two parts
+
+    }
 
     switch (pointer) {
         .none => unreachable,
@@ -1976,7 +2181,6 @@ fn genArgDbgInfo(self: Self, inst: Air.Inst.Index, mcv: MCValue) !void {
 }
 
 fn airArg(self: *Self, inst: Air.Inst.Index) !void {
-    const mod = self.bin_file.comp.module.?;
     var arg_index = self.arg_index;
 
     // we skip over args that have no bits
@@ -1986,21 +2190,10 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
     const result: MCValue = if (self.liveness.isUnused(inst)) .unreach else result: {
         const src_mcv = self.args[arg_index];
 
-        // we want to move every arg onto the stack.
-        // while it might no tbe the best solution right now, it simplifies
-        // the spilling of args with multiple arg levels.
         const dst_mcv = switch (src_mcv) {
             .register => |src_reg| dst: {
-                // TODO: get the true type of the arg, and fit the spill to size.
-                const arg_size = Type.usize.abiSize(mod);
-                const arg_align = Type.usize.abiAlignment(mod);
-                const offset = try self.allocMem(inst, @intCast(arg_size), arg_align);
-                try self.genSetStack(Type.usize, offset, .{ .register = src_reg });
-
-                // can go on to be reused in next function call
-                self.register_manager.freeReg(src_reg);
-
-                break :dst .{ .stack_offset = offset };
+                try self.register_manager.getReg(src_reg, null);
+                break :dst src_mcv;
             },
             else => return self.fail("TODO: airArg {s}", .{@tagName(src_mcv)}),
         };
@@ -2044,87 +2237,122 @@ fn airFence(self: *Self) !void {
 }
 
 fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier) !void {
-    const mod = self.bin_file.comp.module.?;
     if (modifier == .always_tail) return self.fail("TODO implement tail calls for riscv64", .{});
     const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
-    const fn_ty = self.typeOf(pl_op.operand);
     const callee = pl_op.operand;
     const extra = self.air.extraData(Air.Call, pl_op.payload);
-    const args: []const Air.Inst.Ref = @ptrCast(self.air.extra[extra.end..][0..extra.data.args_len]);
+    const arg_refs: []const Air.Inst.Ref = @ptrCast(self.air.extra[extra.end..][0..extra.data.args_len]);
+
+    const expected_num_args = 8;
+    const ExpectedContents = extern struct {
+        vals: [expected_num_args][@sizeOf(MCValue)]u8 align(@alignOf(MCValue)),
+    };
+    var stack align(@max(@alignOf(ExpectedContents), @alignOf(std.heap.StackFallbackAllocator(0)))) =
+        std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
+    const allocator = stack.get();
+
+    const arg_tys = try allocator.alloc(Type, arg_refs.len);
+    defer allocator.free(arg_tys);
+    for (arg_tys, arg_refs) |*arg_ty, arg_ref| arg_ty.* = self.typeOf(arg_ref);
+
+    const arg_vals = try allocator.alloc(MCValue, arg_refs.len);
+    defer allocator.free(arg_vals);
+    for (arg_vals, arg_refs) |*arg_val, arg_ref| arg_val.* = .{ .air_ref = arg_ref };
+
+    const call_ret = try self.genCall(.{ .air = callee }, arg_tys, arg_vals);
+
+    var bt = self.liveness.iterateBigTomb(inst);
+    try self.feed(&bt, pl_op.operand);
+    for (arg_refs) |arg_ref| try self.feed(&bt, arg_ref);
+
+    const result = if (self.liveness.isUnused(inst)) .unreach else call_ret;
+    return self.finishAirResult(inst, result);
+}
+
+fn genCall(
+    self: *Self,
+    info: union(enum) {
+        air: Air.Inst.Ref,
+        lib: struct {
+            return_type: InternPool.Index,
+            param_types: []const InternPool.Index,
+            lib: ?[]const u8 = null,
+            callee: []const u8,
+        },
+    },
+    arg_tys: []const Type,
+    args: []const MCValue,
+) !MCValue {
+    const mod = self.bin_file.comp.module.?;
+
+    const fn_ty = switch (info) {
+        .air => |callee| fn_info: {
+            const callee_ty = self.typeOf(callee);
+            break :fn_info switch (callee_ty.zigTypeTag(mod)) {
+                .Fn => callee_ty,
+                .Pointer => callee_ty.childType(mod),
+                else => unreachable,
+            };
+        },
+        .lib => |lib| try mod.funcType(.{
+            .param_types = lib.param_types,
+            .return_type = lib.return_type,
+            .cc = .C,
+        }),
+    };
 
-    var info = try self.resolveCallingConventionValues(fn_ty);
-    defer info.deinit(self);
+    var call_info = try self.resolveCallingConventionValues(fn_ty, .caller);
+    defer call_info.deinit(self);
+
+    for (call_info.args, 0..) |mc_arg, arg_i| try self.genCopy(arg_tys[arg_i], mc_arg, args[arg_i]);
 
     // Due to incremental compilation, how function calls are generated depends
     // on linking.
-    if (self.bin_file.cast(link.File.Elf)) |elf_file| {
-        for (info.args, 0..) |mc_arg, arg_i| {
-            const arg = args[arg_i];
-            const arg_ty = self.typeOf(arg);
-            const arg_mcv = try self.resolveInst(args[arg_i]);
-            try self.setValue(arg_ty, mc_arg, arg_mcv);
-        }
-
-        if (try self.air.value(callee, mod)) |func_value| {
-            switch (mod.intern_pool.indexToKey(func_value.ip_index)) {
+    switch (info) {
+        .air => |callee| if (try self.air.value(callee, mod)) |func_value| {
+            const func_key = mod.intern_pool.indexToKey(func_value.ip_index);
+            switch (switch (func_key) {
+                else => func_key,
+                .ptr => |ptr| switch (ptr.addr) {
+                    .decl => |decl| mod.intern_pool.indexToKey(mod.declPtr(decl).val.toIntern()),
+                    else => func_key,
+                },
+            }) {
                 .func => |func| {
-                    const sym_index = try elf_file.zigObjectPtr().?.getOrCreateMetadataForDecl(elf_file, func.owner_decl);
-                    const sym = elf_file.symbol(sym_index);
-                    _ = try sym.getOrCreateZigGotEntry(sym_index, elf_file);
-                    const got_addr = sym.zigGotAddress(elf_file);
-                    try self.genSetReg(Type.usize, .ra, .{ .memory = got_addr });
-                    _ = try self.addInst(.{
-                        .tag = .jalr,
-                        .data = .{ .i_type = .{
-                            .rd = .ra,
-                            .rs1 = .ra,
-                            .imm12 = 0,
-                        } },
-                    });
+                    if (self.bin_file.cast(link.File.Elf)) |elf_file| {
+                        const sym_index = try elf_file.zigObjectPtr().?.getOrCreateMetadataForDecl(elf_file, func.owner_decl);
+                        const sym = elf_file.symbol(sym_index);
+                        _ = try sym.getOrCreateZigGotEntry(sym_index, elf_file);
+                        const got_addr = sym.zigGotAddress(elf_file);
+                        try self.genSetReg(Type.usize, .ra, .{ .memory = got_addr });
+                        _ = try self.addInst(.{
+                            .tag = .jalr,
+                            .data = .{ .i_type = .{
+                                .rd = .ra,
+                                .rs1 = .ra,
+                                .imm12 = 0,
+                            } },
+                        });
+                    } else if (self.bin_file.cast(link.File.Coff)) |_| {
+                        return self.fail("TODO implement calling in COFF for {}", .{self.target.cpu.arch});
+                    } else if (self.bin_file.cast(link.File.MachO)) |_| {
+                        unreachable; // unsupported architecture for MachO
+                    } else if (self.bin_file.cast(link.File.Plan9)) |_| {
+                        return self.fail("TODO implement call on plan9 for {}", .{self.target.cpu.arch});
+                    } else unreachable;
                 },
                 .extern_func => {
-                    return self.fail("TODO implement calling extern functions", .{});
-                },
-                else => {
-                    return self.fail("TODO implement calling bitcasted functions", .{});
+                    return self.fail("TODO: extern func calls", .{});
                 },
+                else => return self.fail("TODO implement calling bitcasted functions", .{}),
             }
         } else {
-            return self.fail("TODO implement calling runtime-known function pointer", .{});
-        }
-    } else if (self.bin_file.cast(link.File.Coff)) |_| {
-        return self.fail("TODO implement calling in COFF for {}", .{self.target.cpu.arch});
-    } else if (self.bin_file.cast(link.File.MachO)) |_| {
-        unreachable; // unsupported architecture for MachO
-    } else if (self.bin_file.cast(link.File.Plan9)) |_| {
-        return self.fail("TODO implement call on plan9 for {}", .{self.target.cpu.arch});
-    } else unreachable;
-
-    const result: MCValue = result: {
-        switch (info.return_value) {
-            .register => |reg| {
-                if (RegisterManager.indexOfReg(&callee_preserved_regs, reg) == null) {
-                    // Save function return value in a callee saved register
-                    break :result try self.copyToNewRegister(inst, info.return_value);
-                }
-            },
-            else => {},
-        }
-        break :result info.return_value;
-    };
-
-    if (args.len <= Liveness.bpi - 2) {
-        var buf = [1]Air.Inst.Ref{.none} ** (Liveness.bpi - 1);
-        buf[0] = callee;
-        @memcpy(buf[1..][0..args.len], args);
-        return self.finishAir(inst, result, buf);
-    }
-    var bt = try self.iterateBigTomb(inst, 1 + args.len);
-    bt.feed(callee);
-    for (args) |arg| {
-        bt.feed(arg);
+            return self.fail("TODO: call function pointers", .{});
+        },
+        .lib => return self.fail("TODO: lib func calls", .{}),
     }
-    return bt.finishAir(result);
+
+    return call_info.return_value;
 }
 
 fn airRet(self: *Self, inst: Air.Inst.Index, safety: bool) !void {
@@ -2151,7 +2379,7 @@ fn ret(self: *Self, mcv: MCValue) !void {
     const mod = self.bin_file.comp.module.?;
 
     const ret_ty = self.fn_type.fnReturnType(mod);
-    try self.setValue(ret_ty, self.ret_mcv, mcv);
+    try self.genCopy(ret_ty, self.ret_mcv, mcv);
 
     _ = try self.addInst(.{
         .tag = .psuedo_epilogue,
@@ -2183,6 +2411,7 @@ fn airCmp(self: *Self, inst: Air.Inst.Index) !void {
     const ty = self.typeOf(bin_op.lhs);
     const mod = self.bin_file.comp.module.?;
     assert(ty.eql(self.typeOf(bin_op.rhs), mod));
+
     if (ty.zigTypeTag(mod) == .ErrorSet)
         return self.fail("TODO implement cmp for errors", .{});
 
@@ -2233,11 +2462,54 @@ fn airDbgInlineBlock(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airDbgVar(self: *Self, inst: Air.Inst.Index) !void {
     const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
-    const name = self.air.nullTerminatedString(pl_op.payload);
     const operand = pl_op.operand;
-    // TODO emit debug info for this variable
-    _ = name;
-    return self.finishAir(inst, .dead, .{ operand, .none, .none });
+    const ty = self.typeOf(operand);
+    const mcv = try self.resolveInst(operand);
+
+    const name = self.air.nullTerminatedString(pl_op.payload);
+
+    const tag = self.air.instructions.items(.tag)[@intFromEnum(inst)];
+    try self.genVarDbgInfo(tag, ty, mcv, name);
+
+    return self.finishAir(inst, .unreach, .{ operand, .none, .none });
+}
+
+fn genVarDbgInfo(
+    self: Self,
+    tag: Air.Inst.Tag,
+    ty: Type,
+    mcv: MCValue,
+    name: [:0]const u8,
+) !void {
+    const mod = self.bin_file.comp.module.?;
+    const is_ptr = switch (tag) {
+        .dbg_var_ptr => true,
+        .dbg_var_val => false,
+        else => unreachable,
+    };
+
+    switch (self.debug_output) {
+        .dwarf => |dw| {
+            const loc: link.File.Dwarf.DeclState.DbgInfoLoc = switch (mcv) {
+                .register => |reg| .{ .register = reg.dwarfLocOp() },
+                .memory => |address| .{ .memory = address },
+                .load_symbol => |sym_off| loc: {
+                    assert(sym_off.off == 0);
+                    break :loc .{ .linker_load = .{ .type = .direct, .sym_index = sym_off.sym } };
+                },
+                .immediate => |x| .{ .immediate = x },
+                .undef => .undef,
+                .none => .none,
+                else => blk: {
+                    log.debug("TODO generate debug info for {}", .{mcv});
+                    break :blk .nop;
+                },
+            };
+            try dw.genVarDbgInfo(name, ty, mod.funcOwnerDeclIndex(self.func_index), is_ptr, loc);
+        },
+        .plan9 => {},
+        .none => {},
+    }
 }
 
 fn airCondBr(self: *Self, inst: Air.Inst.Index) !void {
@@ -2348,7 +2620,7 @@ fn airCondBr(self: *Self, inst: Air.Inst.Index) !void {
         log.debug("consolidating else_entry {d} {}=>{}", .{ else_key, else_value, canon_mcv });
         // TODO make sure the destination stack offset / register does not already have something
         // going on there.
-        try self.setValue(self.typeOfIndex(else_key), canon_mcv, else_value);
+        try self.genCopy(self.typeOfIndex(else_key), canon_mcv, else_value);
         // TODO track the new register / stack allocation
     }
     try parent_branch.inst_table.ensureUnusedCapacity(self.gpa, saved_then_branch.inst_table.count());
@@ -2375,7 +2647,7 @@ fn airCondBr(self: *Self, inst: Air.Inst.Index) !void {
         log.debug("consolidating then_entry {d} {}=>{}", .{ then_key, parent_mcv, then_value });
         // TODO make sure the destination stack offset / register does not already have something
         // going on there.
-        try self.setValue(self.typeOfIndex(then_key), parent_mcv, then_value);
+        try self.genCopy(self.typeOfIndex(then_key), parent_mcv, then_value);
         // TODO track the new register / stack allocation
     }
 
@@ -2638,7 +2910,7 @@ fn br(self: *Self, block: Air.Inst.Index, operand: Air.Inst.Ref) !void {
         if (block_mcv == .none) {
             block_data.mcv = operand_mcv;
         } else {
-            try self.setValue(self.typeOfIndex(block), block_mcv, operand_mcv);
+            try self.genCopy(self.typeOfIndex(block), block_mcv, operand_mcv);
         }
     }
     return self.brVoid(block);
@@ -2783,29 +3055,45 @@ fn iterateBigTomb(self: *Self, inst: Air.Inst.Index, operand_count: usize) !BigT
 }
 
 /// Sets the value without any modifications to register allocation metadata or stack allocation metadata.
-fn setValue(self: *Self, ty: Type, dst_val: MCValue, src_val: MCValue) !void {
+fn genCopy(self: *Self, ty: Type, dst_mcv: MCValue, src_mcv: MCValue) !void {
     // There isn't anything to store
-    if (dst_val == .none) return;
+    if (dst_mcv == .none) return;
 
-    if (!dst_val.isMutable()) {
+    if (!dst_mcv.isMutable()) {
         // panic so we can see the trace
-        return std.debug.panic("tried to setValue immutable: {s}", .{@tagName(dst_val)});
+        return std.debug.panic("tried to genCopy immutable: {s}", .{@tagName(dst_mcv)});
     }
 
-    switch (dst_val) {
-        .register => |reg| return self.genSetReg(ty, reg, src_val),
-        .stack_offset => |off| return self.genSetStack(ty, off, src_val),
-        .memory => |addr| return self.genSetMem(ty, addr, src_val),
-        else => return self.fail("TODO: setValue {s}", .{@tagName(dst_val)}),
+    switch (dst_mcv) {
+        .register => |reg| return self.genSetReg(ty, reg, src_mcv),
+        .register_pair => |pair| return self.genSetRegPair(ty, pair, src_mcv),
+        .register_offset => |dst_reg_off| try self.genSetReg(ty, dst_reg_off.reg, switch (src_mcv) {
+            .none,
+            .unreach,
+            .dead,
+            .undef,
+            => unreachable,
+            .immediate,
+            .register,
+            .register_offset,
+            => src_mcv.offset(-dst_reg_off.off),
+            else => .{ .register_offset = .{
+                .reg = try self.copyToTmpRegister(ty, src_mcv),
+                .off = -dst_reg_off.off,
+            } },
+        }),
+        .stack_offset => |off| return self.genSetStack(ty, off, src_mcv),
+        .memory => |addr| return self.genSetMem(ty, addr, src_mcv),
+        else => return self.fail("TODO: genCopy {s} with {s}", .{ @tagName(dst_mcv), @tagName(src_mcv) }),
     }
 }
 
-/// Sets the value of `src_val` into stack memory at `stack_offset`.
-fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_val: MCValue) InnerError!void {
+/// Sets the value of `src_mcv` into stack memory at `stack_offset`.
+fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_mcv: MCValue) InnerError!void {
     const mod = self.bin_file.comp.module.?;
     const abi_size: u32 = @intCast(ty.abiSize(mod));
 
-    switch (src_val) {
+    switch (src_mcv) {
         .none => return,
         .dead => unreachable,
         .undef => {
@@ -2820,7 +3108,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_val: MCValue) Inner
             const reg, const reg_lock = try self.allocReg();
             defer self.register_manager.unlockReg(reg_lock);
 
-            try self.genSetReg(ty, reg, src_val);
+            try self.genSetReg(ty, reg, src_mcv);
 
             return self.genSetStack(ty, stack_offset, .{ .register = reg });
         },
@@ -2839,24 +3127,24 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_val: MCValue) Inner
                         .tag = tag,
                         .data = .{ .i_type = .{
                             .rd = reg,
-                            .rs1 = .s0,
+                            .rs1 = .sp,
                             .imm12 = math.cast(i12, stack_offset) orelse {
                                 return self.fail("TODO: genSetStack bigger stack values", .{});
                             },
                         } },
                     });
                 },
-                else => return self.fail("TODO: genSetStack for size={d}", .{abi_size}),
+                else => unreachable, // register can hold a max of 8 bytes
             }
         },
         .stack_offset, .load_symbol => {
-            switch (src_val) {
+            switch (src_mcv) {
                 .stack_offset => |off| if (off == stack_offset) return,
                 else => {},
             }
 
             if (abi_size <= 8) {
-                const reg = try self.copyToTmpRegister(ty, src_val);
+                const reg = try self.copyToTmpRegister(ty, src_mcv);
                 return self.genSetStack(ty, stack_offset, .{ .register = reg });
             }
 
@@ -2875,7 +3163,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_val: MCValue) Inner
             const count_reg = regs[3];
             const tmp_reg = regs[4];
 
-            switch (src_val) {
+            switch (src_mcv) {
                 .stack_offset => |offset| {
                     try self.genSetReg(ptr_ty, src_reg, .{ .ptr_stack_offset = offset });
                 },
@@ -2894,14 +3182,14 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_val: MCValue) Inner
                         .tag = .load_symbol,
                         .data = .{
                             .payload = try self.addExtra(Mir.LoadSymbolPayload{
-                                .register = @intFromEnum(src_reg),
+                                .register = src_reg.id(),
                                 .atom_index = atom_index,
                                 .sym_index = sym_off.sym,
                             }),
                         },
                     });
                 },
-                else => return self.fail("TODO: genSetStack unreachable {s}", .{@tagName(src_val)}),
+                else => return self.fail("TODO: genSetStack unreachable {s}", .{@tagName(src_mcv)}),
             }
 
             try self.genSetReg(ptr_ty, dst_reg, .{ .ptr_stack_offset = stack_offset });
@@ -2910,16 +3198,17 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, src_val: MCValue) Inner
             // memcpy(src, dst, len)
             try self.genInlineMemcpy(src_reg, dst_reg, len_reg, count_reg, tmp_reg);
         },
-        else => return self.fail("TODO: genSetStack {s}", .{@tagName(src_val)}),
+        .air_ref => |ref| try self.genSetStack(ty, stack_offset, try self.resolveInst(ref)),
+        else => return self.fail("TODO: genSetStack {s}", .{@tagName(src_mcv)}),
     }
 }
 
-fn genSetMem(self: *Self, ty: Type, addr: u64, src_val: MCValue) InnerError!void {
+fn genSetMem(self: *Self, ty: Type, addr: u64, src_mcv: MCValue) InnerError!void {
     const mod = self.bin_file.comp.module.?;
     const abi_size: u32 = @intCast(ty.abiSize(mod));
     _ = abi_size;
     _ = addr;
-    _ = src_val;
+    _ = src_mcv;
 
     return self.fail("TODO: genSetMem", .{});
 }
@@ -2932,51 +3221,101 @@ fn genInlineMemcpy(
     count: Register,
     tmp: Register,
 ) !void {
-    _ = src;
-    _ = dst;
+    try self.genSetReg(Type.usize, count, .{ .register = len });
+
+    // lb tmp, 0(src)
+    const first_inst = try self.addInst(.{
+        .tag = .lb,
+        .data = .{
+            .i_type = .{
+                .rd = tmp,
+                .rs1 = src,
+                .imm12 = 0,
+            },
+        },
+    });
 
-    // store 0 in the count
-    try self.genSetReg(Type.usize, count, .{ .immediate = 0 });
+    // sb tmp, 0(dst)
+    _ = try self.addInst(.{
+        .tag = .sb,
+        .data = .{
+            .i_type = .{
+                .rd = tmp,
+                .rs1 = dst,
+                .imm12 = 0,
+            },
+        },
+    });
 
-    // compare count to length
-    const compare_inst = try self.addInst(.{
-        .tag = .cmp_eq,
-        .data = .{ .r_type = .{
-            .rd = tmp,
-            .rs1 = count,
-            .rs2 = len,
-        } },
+    // dec count by 1
+    _ = try self.addInst(.{
+        .tag = .addi,
+        .data = .{
+            .i_type = .{
+                .rd = count,
+                .rs1 = count,
+                .imm12 = -1,
+            },
+        },
     });
 
-    // end if true
+    // branch if count is 0
     _ = try self.addInst(.{
-        .tag = .bne,
+        .tag = .beq,
         .data = .{
             .b_type = .{
-                .inst = @intCast(self.mir_instructions.len + 0), // points after the last inst
-                .rs1 = .zero,
-                .rs2 = tmp,
+                .inst = @intCast(self.mir_instructions.len + 4), // points after the last inst
+                .rs1 = count,
+                .rs2 = .zero,
+            },
+        },
+    });
+
+    // increment the pointers
+    _ = try self.addInst(.{
+        .tag = .addi,
+        .data = .{
+            .i_type = .{
+                .rd = src,
+                .rs1 = src,
+                .imm12 = 1,
+            },
+        },
+    });
+
+    _ = try self.addInst(.{
+        .tag = .addi,
+        .data = .{
+            .i_type = .{
+                .rd = dst,
+                .rs1 = dst,
+                .imm12 = 1,
             },
         },
     });
-    _ = compare_inst;
 
-    return self.fail("TODO: finish genInlineMemcpy", .{});
+    // jump back to start of loop
+    _ = try self.addInst(.{
+        .tag = .j,
+        .data = .{
+            .inst = first_inst,
+        },
+    });
 }
 
-/// Sets the value of `src_val` into `reg`. Assumes you have a lock on it.
-fn genSetReg(self: *Self, ty: Type, reg: Register, src_val: MCValue) InnerError!void {
+/// Sets the value of `src_mcv` into `reg`. Assumes you have a lock on it.
+fn genSetReg(self: *Self, ty: Type, reg: Register, src_mcv: MCValue) InnerError!void {
     const mod = self.bin_file.comp.module.?;
     const abi_size: u32 = @intCast(ty.abiSize(mod));
 
-    switch (src_val) {
+    switch (src_mcv) {
         .dead => unreachable,
         .ptr_stack_offset => |off| {
             _ = try self.addInst(.{
                 .tag = .addi,
                 .data = .{ .i_type = .{
                     .rd = reg,
-                    .rs1 = .s0,
+                    .rs1 = .sp,
                     .imm12 = math.cast(i12, off) orelse {
                         return self.fail("TODO: bigger stack sizes", .{});
                     },
@@ -3006,7 +3345,6 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, src_val: MCValue) InnerError!
                 const carry: i32 = if (lo12 < 0) 1 else 0;
                 const hi20: i20 = @truncate((x >> 12) +% carry);
 
-                // TODO: add test case for 32-bit immediate
                 _ = try self.addInst(.{
                     .tag = .lui,
                     .data = .{ .u_type = .{
@@ -3069,6 +3407,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, src_val: MCValue) InnerError!
                 } },
             });
         },
+        .register_pair => |pair| try self.genSetReg(ty, reg, .{ .register = pair[0] }),
         .memory => |addr| {
             try self.genSetReg(ty, reg, .{ .immediate = addr });
 
@@ -3080,8 +3419,6 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, src_val: MCValue) InnerError!
                     .imm12 = 0,
                 } },
             });
-
-            // LOAD imm=[i12 offset = 0], rs1
         },
         .stack_offset => |off| {
             const tag: Mir.Inst.Tag = switch (abi_size) {
@@ -3096,7 +3433,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, src_val: MCValue) InnerError!
                 .tag = tag,
                 .data = .{ .i_type = .{
                     .rd = reg,
-                    .rs1 = .s0,
+                    .rs1 = .sp,
                     .imm12 = math.cast(i12, off) orelse {
                         return self.fail("TODO: genSetReg support larger stack sizes", .{});
                     },
@@ -3120,13 +3457,55 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, src_val: MCValue) InnerError!
                 .tag = .load_symbol,
                 .data = .{
                     .payload = try self.addExtra(Mir.LoadSymbolPayload{
-                        .register = @intFromEnum(reg),
+                        .register = reg.id(),
                         .atom_index = atom_index,
                         .sym_index = sym_off.sym,
                     }),
                 },
             });
         },
+        .air_ref => |ref| try self.genSetReg(ty, reg, try self.resolveInst(ref)),
+        .indirect => |reg_off| {
+            const tag: Mir.Inst.Tag = switch (abi_size) {
+                1 => .lb,
+                2 => .lh,
+                4 => .lw,
+                8 => .ld,
+                else => return self.fail("TODO: genSetReg for size {d}", .{abi_size}),
+            };
+
+            _ = try self.addInst(.{
+                .tag = tag,
+                .data = .{
+                    .i_type = .{
+                        .rd = reg,
+                        .rs1 = reg_off.reg,
+                        .imm12 = @intCast(reg_off.off),
+                    },
+                },
+            });
+        },
+        else => return self.fail("TODO: genSetReg {s}", .{@tagName(src_mcv)}),
+    }
+}
+
+fn genSetRegPair(self: *Self, ty: Type, pair: [2]Register, src_mcv: MCValue) InnerError!void {
+    const mod = self.bin_file.comp.module.?;
+    const abi_size: u32 = @intCast(ty.abiSize(mod));
+
+    assert(abi_size > 8 and abi_size <= 16); // must only fit into two registers
+
+    switch (src_mcv) {
+        .air_ref => |ref| return self.genSetRegPair(ty, pair, try self.resolveInst(ref)),
+        .load_symbol => |sym_off| {
+            _ = sym_off;
+            // return self.fail("TODO: genSetRegPair load_symbol", .{});
+            // commented out just for testing.
+
+            // plan here is to load the address into a temporary register and
+            // copy into the pair.
+        },
+        else => return self.fail("TODO: genSetRegPair {s}", .{@tagName(src_mcv)}),
     }
 }
 
@@ -3138,7 +3517,7 @@ fn airIntFromPtr(self: *Self, inst: Air.Inst.Index) !void {
 
         const dst_mcv = try self.allocRegOrMem(inst, true);
         const dst_ty = self.typeOfIndex(inst);
-        try self.setValue(dst_ty, dst_mcv, src_mcv);
+        try self.genCopy(dst_ty, dst_mcv, src_mcv);
         break :result dst_mcv;
     };
     return self.finishAir(inst, result, .{ un_op, .none, .none });
@@ -3157,7 +3536,7 @@ fn airBitCast(self: *Self, inst: Air.Inst.Index) !void {
         defer if (operand_lock) |lock| self.register_manager.unlockReg(lock);
 
         const dest = try self.allocRegOrMem(inst, true);
-        try self.setValue(self.typeOfIndex(inst), dest, operand);
+        try self.genCopy(self.typeOfIndex(inst), dest, operand);
         break :result dest;
     };
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
@@ -3385,8 +3764,12 @@ const CallMCValues = struct {
 };
 
 /// Caller must call `CallMCValues.deinit`.
-fn resolveCallingConventionValues(self: *Self, fn_ty: Type) !CallMCValues {
+fn resolveCallingConventionValues(self: *Self, fn_ty: Type, role: CallView) !CallMCValues {
     const mod = self.bin_file.comp.module.?;
+    const ip = &mod.intern_pool;
+
+    _ = role;
+
     const fn_info = mod.typeToFunc(fn_ty).?;
     const cc = fn_info.cc;
     var result: CallMCValues = .{
@@ -3413,26 +3796,31 @@ fn resolveCallingConventionValues(self: *Self, fn_ty: Type) !CallMCValues {
                 return self.fail("TODO: support more than 8 function args", .{});
             }
 
-            const locks = try self.gpa.alloc(RegisterLock, result.args.len);
-            defer self.gpa.free(locks);
+            var fa_reg_i: u32 = 0;
 
-            for (0..result.args.len) |i| {
-                const arg_reg = try self.register_manager.allocReg(null, fa);
-                const lock = self.register_manager.lockRegAssumeUnused(arg_reg);
-                locks[i] = lock;
-                result.args[i] = .{ .register = arg_reg };
-            }
+            // spill the needed argument registers
+            for (fn_info.param_types.get(ip), result.args) |ty, *result_arg| {
+                const param_ty = Type.fromInterned(ty);
+                const param_size = param_ty.abiSize(mod);
 
-            // we can just free the locks now, as this should be the only place where the fa
-            // arg set is used.
-            for (locks) |lock| {
-                self.register_manager.unlockReg(lock);
+                switch (param_size) {
+                    1...8 => {
+                        const arg_reg: Register = abi.function_arg_regs[fa_reg_i];
+                        fa_reg_i += 1;
+                        try self.register_manager.getReg(arg_reg, null);
+                        result_arg.* = .{ .register = arg_reg };
+                    },
+                    9...16 => {
+                        const arg_regs: [2]Register = abi.function_arg_regs[fa_reg_i..][0..2].*;
+                        fa_reg_i += 2;
+                        for (arg_regs) |reg| try self.register_manager.getReg(reg, null);
+                        result_arg.* = .{ .register_pair = arg_regs };
+                    },
+                    else => return self.fail("TODO: support args of size {}", .{param_size}),
+                }
             }
 
-            // stack_offset = num s registers spilled + local var space
-            // TODO: spill used s registers here
-
-            result.stack_byte_count = 0;
+            result.stack_byte_count = self.max_end_stack;
             result.stack_align = .@"16";
         },
         else => return self.fail("TODO implement function parameters for {} on riscv64", .{cc}),
src/arch/riscv64/Emit.zig
@@ -1,19 +1,6 @@
 //! This file contains the functionality for lowering RISCV64 MIR into
 //! machine code
 
-const Emit = @This();
-const std = @import("std");
-const math = std.math;
-const Mir = @import("Mir.zig");
-const bits = @import("bits.zig");
-const link = @import("../../link.zig");
-const Module = @import("../../Module.zig");
-const ErrorMsg = Module.ErrorMsg;
-const assert = std.debug.assert;
-const Instruction = bits.Instruction;
-const Register = bits.Register;
-const DebugInfoOutput = @import("../../codegen.zig").DebugInfoOutput;
-
 mir: Mir,
 bin_file: *link.File,
 debug_output: DebugInfoOutput,
@@ -22,12 +9,17 @@ err_msg: ?*ErrorMsg = null,
 src_loc: Module.SrcLoc,
 code: *std.ArrayList(u8),
 
+/// List of registers to save in the prologue.
+save_reg_list: Mir.RegisterList,
+
 prev_di_line: u32,
 prev_di_column: u32,
 /// Relative to the beginning of `code`.
 prev_di_pc: usize,
+
 /// Function's stack size. Used for backpatching.
 stack_size: u32,
+
 /// For backward branches: stores the code offset of the target
 /// instruction
 ///
@@ -212,7 +204,7 @@ fn mirRType(emit: *Emit, inst: Mir.Inst.Index) !void {
             // rs1 != rs2
 
             try emit.writeInstruction(Instruction.xor(rd, rs1, rs2));
-            try emit.writeInstruction(Instruction.sltu(rd, .x0, rd)); // snez
+            try emit.writeInstruction(Instruction.sltu(rd, .zero, rd)); // snez
         },
         .cmp_lt => {
             // rd = 1 if rs1 < rs2
@@ -368,17 +360,20 @@ fn mirPsuedo(emit: *Emit, inst: Mir.Inst.Index) !void {
                 return emit.fail("TODO: mirPsuedo support larger stack sizes", .{});
             };
 
-            // Decrement sp by num s registers + local var space
+            // Decrement sp by (num s registers * 8) + local var space
             try emit.writeInstruction(Instruction.addi(.sp, .sp, -stack_size));
 
             // Spill ra
-            try emit.writeInstruction(Instruction.sd(.ra, stack_size - 8, .sp));
-
-            // Spill s0
-            try emit.writeInstruction(Instruction.sd(.s0, stack_size - 16, .sp));
-
-            // Setup s0
-            try emit.writeInstruction(Instruction.addi(.s0, .sp, stack_size));
+            try emit.writeInstruction(Instruction.sd(.ra, 0, .sp));
+
+            // Spill callee-saved registers.
+            var s_reg_iter = emit.save_reg_list.iterator(.{});
+            var i: i12 = 8;
+            while (s_reg_iter.next()) |reg_i| {
+                const reg = abi.callee_preserved_regs[reg_i];
+                try emit.writeInstruction(Instruction.sd(reg, i, .sp));
+                i += 8;
+            }
         },
         .psuedo_epilogue => {
             const stack_size: i12 = math.cast(i12, emit.stack_size) orelse {
@@ -386,10 +381,16 @@ fn mirPsuedo(emit: *Emit, inst: Mir.Inst.Index) !void {
             };
 
             // Restore ra
-            try emit.writeInstruction(Instruction.ld(.ra, stack_size - 8, .sp));
-
-            // Restore s0
-            try emit.writeInstruction(Instruction.ld(.s0, stack_size - 16, .sp));
+            try emit.writeInstruction(Instruction.ld(.ra, 0, .sp));
+
+            // Restore spilled callee-saved registers.
+            var s_reg_iter = emit.save_reg_list.iterator(.{});
+            var i: i12 = 8;
+            while (s_reg_iter.next()) |reg_i| {
+                const reg = abi.callee_preserved_regs[reg_i];
+                try emit.writeInstruction(Instruction.ld(reg, i, .sp));
+                i += 8;
+            }
 
             // Increment sp back to previous value
             try emit.writeInstruction(Instruction.addi(.sp, .sp, stack_size));
@@ -408,8 +409,11 @@ fn mirRR(emit: *Emit, inst: Mir.Inst.Index) !void {
     const tag = emit.mir.instructions.items(.tag)[inst];
     const rr = emit.mir.instructions.items(.data)[inst].rr;
 
+    const rd = rr.rd;
+    const rs = rr.rs;
+
     switch (tag) {
-        .mv => try emit.writeInstruction(Instruction.addi(rr.rd, rr.rs, 0)),
+        .mv => try emit.writeInstruction(Instruction.addi(rd, rs, 0)),
         else => unreachable,
     }
 }
@@ -435,7 +439,6 @@ fn mirNop(emit: *Emit, inst: Mir.Inst.Index) !void {
 }
 
 fn mirLoadSymbol(emit: *Emit, inst: Mir.Inst.Index) !void {
-    // const tag = emit.mir.instructions.items(.tag)[inst];
     const payload = emit.mir.instructions.items(.data)[inst].payload;
     const data = emit.mir.extraData(Mir.LoadSymbolPayload, payload).data;
     const reg = @as(Register, @enumFromInt(data.register));
@@ -523,20 +526,19 @@ fn instructionSize(emit: *Emit, inst: Mir.Inst.Index) usize {
         .dbg_prologue_end,
         => 0,
 
-        .psuedo_prologue,
-        => 16,
-
-        .psuedo_epilogue,
-        .abs,
-        => 12,
-
         .cmp_eq,
         .cmp_neq,
         .cmp_imm_eq,
         .cmp_gte,
         .load_symbol,
+        .abs,
         => 8,
 
+        .psuedo_epilogue, .psuedo_prologue => size: {
+            const count = emit.save_reg_list.count() * 4;
+            break :size count + 8;
+        },
+
         else => 4,
     };
 }
@@ -547,25 +549,17 @@ fn lowerMir(emit: *Emit) !void {
     const mir_tags = emit.mir.instructions.items(.tag);
     const mir_datas = emit.mir.instructions.items(.data);
 
+    const prologue_size: u32 = @intCast(emit.save_reg_list.size());
+    emit.stack_size += prologue_size;
+
     for (mir_tags, 0..) |tag, index| {
         const inst: u32 = @intCast(index);
 
         if (isStore(tag) or isLoad(tag)) {
             const data = mir_datas[inst].i_type;
-            // TODO: probably create a psuedo instruction for s0 loads/stores instead of this.
-            if (data.rs1 == .s0) {
+            if (data.rs1 == .sp) {
                 const offset = mir_datas[inst].i_type.imm12;
-
-                // sp + 32 (aka s0)
-                // ra -- previous ra spilled
-                // s0 -- previous s0 spilled
-                // --- this is -16(s0)
-
-                // TODO: this "+ 8" is completely arbiratary as the largest possible store
-                // we don't want to actually use it. instead we need to calculate the difference
-                // between the first and second stack store and use it instead.
-
-                mir_datas[inst].i_type.imm12 = offset + @as(i12, @intCast(prologue_size)) + 8;
             }
         }
 
@@ -584,3 +578,17 @@ fn lowerMir(emit: *Emit) !void {
         current_code_offset += emit.instructionSize(inst);
     }
 }
+
+const Emit = @This();
+const std = @import("std");
+const math = std.math;
+const Mir = @import("Mir.zig");
+const bits = @import("bits.zig");
+const abi = @import("abi.zig");
+const link = @import("../../link.zig");
+const Module = @import("../../Module.zig");
+const ErrorMsg = Module.ErrorMsg;
+const assert = std.debug.assert;
+const Instruction = bits.Instruction;
+const Register = bits.Register;
+const DebugInfoOutput = @import("../../codegen.zig").DebugInfoOutput;
src/arch/riscv64/Mir.zig
@@ -6,14 +6,6 @@
 //! The main purpose of MIR is to postpone the assignment of offsets until Isel,
 //! so that, for example, the smaller encodings of jump instructions can be used.
 
-const Mir = @This();
-const std = @import("std");
-const builtin = @import("builtin");
-const assert = std.debug.assert;
-
-const bits = @import("bits.zig");
-const Register = bits.Register;
-
 instructions: std.MultiArrayList(Inst).Slice,
 /// The meaning of this data is determined by `Inst.Tag` value.
 extra: []const u32,
@@ -58,7 +50,7 @@ pub const Inst = struct {
         /// Jumps. Uses `inst` payload.
         j,
 
-        /// Immediate and, uses i_type payload
+        /// Immediate AND, uses i_type payload
         andi,
 
         // NOTE: Maybe create a special data for compares that includes the ops
@@ -219,15 +211,6 @@ pub const Inst = struct {
         },
     };
 
-    const CompareOp = enum {
-        eq,
-        neq,
-        gt,
-        gte,
-        lt,
-        lte,
-    };
-
     // Make sure we don't accidentally make instructions bigger than expected.
     // Note that in Debug builds, Zig is allowed to insert a secret field for safety checks.
     // comptime {
@@ -268,3 +251,49 @@ pub const LoadSymbolPayload = struct {
     atom_index: u32,
     sym_index: u32,
 };
+
+/// Used in conjunction with payload to transfer a list of used registers in a compact manner.
+pub const RegisterList = struct {
+    bitset: BitSet = BitSet.initEmpty(),
+
+    const BitSet = IntegerBitSet(32);
+    const Self = @This();
+
+    fn getIndexForReg(registers: []const Register, reg: Register) BitSet.MaskInt {
+        for (registers, 0..) |cpreg, i| {
+            if (reg.id() == cpreg.id()) return @intCast(i);
+        }
+        unreachable; // register not in input register list!
+    }
+
+    pub fn push(self: *Self, registers: []const Register, reg: Register) void {
+        const index = getIndexForReg(registers, reg);
+        self.bitset.set(index);
+    }
+
+    pub fn isSet(self: Self, registers: []const Register, reg: Register) bool {
+        const index = getIndexForReg(registers, reg);
+        return self.bitset.isSet(index);
+    }
+
+    pub fn iterator(self: Self, comptime options: std.bit_set.IteratorOptions) BitSet.Iterator(options) {
+        return self.bitset.iterator(options);
+    }
+
+    pub fn count(self: Self) u32 {
+        return @intCast(self.bitset.count());
+    }
+
+    pub fn size(self: Self) u32 {
+        return @intCast(self.bitset.count() * 8);
+    }
+};
+
+const Mir = @This();
+const std = @import("std");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+
+const bits = @import("bits.zig");
+const Register = bits.Register;
+const IntegerBitSet = std.bit_set.IntegerBitSet;
src/register_manager.zig
@@ -102,7 +102,7 @@ pub fn RegisterManager(
             }
 
             const OptionalIndex = std.math.IntFittingRange(0, set.len);
-            comptime var map = [1]OptionalIndex{set.len} ** (max_id + 1 - min_id);
+            comptime var map = [1]OptionalIndex{set.len} ** (max_id - min_id + 1);
             inline for (set, 0..) |elem, elem_index| map[comptime elem.id() - min_id] = elem_index;
 
             const id_index = reg.id() -% min_id;