Commit 4ec49da5f6

Jacob Young <jacobly0@users.noreply.github.com>
2023-04-29 02:39:38
x86_64: implement a bunch of floating point stuff
1 parent 7c9891d
src/arch/x86_64/CodeGen.zig
@@ -1297,9 +1297,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .ceil,
             .round,
             .trunc_float,
-            .neg,
             => try self.airUnaryMath(inst),
 
+            .neg => try self.airNeg(inst),
+
             .add_with_overflow => try self.airAddSubWithOverflow(inst),
             .sub_with_overflow => try self.airAddSubWithOverflow(inst),
             .mul_with_overflow => try self.airMulWithOverflow(inst),
@@ -1881,7 +1882,7 @@ pub fn spillRegisters(self: *Self, registers: []const Register) !void {
 /// allocated. A second call to `copyToTmpRegister` may return the same register.
 /// This can have a side effect of spilling instructions to the stack to free up a register.
 fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register {
-    const reg: Register = try self.register_manager.allocReg(null, try self.regClassForType(ty));
+    const reg = try self.register_manager.allocReg(null, try self.regClassForType(ty));
     try self.genSetReg(reg, ty, mcv);
     return reg;
 }
@@ -1924,16 +1925,48 @@ fn airRetPtr(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    _ = ty_op;
-    return self.fail("TODO implement airFptrunc for {}", .{self.target.cpu.arch});
-    // return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+    const dst_ty = self.air.typeOfIndex(inst);
+    const src_ty = self.air.typeOf(ty_op.operand);
+    if (dst_ty.floatBits(self.target.*) != 32 or src_ty.floatBits(self.target.*) != 64 or
+        !Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+        return self.fail("TODO implement airFptrunc from {} to {}", .{
+            src_ty.fmt(self.bin_file.options.module.?),
+            dst_ty.fmt(self.bin_file.options.module.?),
+        });
+
+    const src_mcv = try self.resolveInst(ty_op.operand);
+    const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+        src_mcv
+    else
+        try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv);
+    const dst_lock = self.register_manager.lockReg(dst_mcv.register);
+    defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+    try self.genBinOpMir(.cvtsd2ss, src_ty, dst_mcv, src_mcv);
+    return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
 }
 
 fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    _ = ty_op;
-    return self.fail("TODO implement airFpext for {}", .{self.target.cpu.arch});
-    // return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+    const dst_ty = self.air.typeOfIndex(inst);
+    const src_ty = self.air.typeOf(ty_op.operand);
+    if (dst_ty.floatBits(self.target.*) != 64 or src_ty.floatBits(self.target.*) != 32 or
+        !Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+        return self.fail("TODO implement airFpext from {} to {}", .{
+            src_ty.fmt(self.bin_file.options.module.?),
+            dst_ty.fmt(self.bin_file.options.module.?),
+        });
+
+    const src_mcv = try self.resolveInst(ty_op.operand);
+    const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+        src_mcv
+    else
+        try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv);
+    const dst_lock = self.register_manager.lockReg(dst_mcv.register);
+    defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+    try self.genBinOpMir(.cvtss2sd, src_ty, dst_mcv, src_mcv);
+    return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
 }
 
 fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
@@ -3953,10 +3986,65 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void {
     return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
 }
 
+/// Lowers AIR `neg` for f32/f64 by XORing the operand with a 128-bit vector of `-0.0`.
+/// Every other float width is reported as a TODO before any value or register is set up.
+fn airNeg(self: *Self, inst: Air.Inst.Index) !void {
+    const un_op = self.air.instructions.items(.data)[inst].un_op;
+    const ty = self.air.typeOf(un_op);
+    const ty_bits = ty.floatBits(self.target.*);
+    // Reject unsupported widths first: `@divExact(128, ty_bits)` below is illegal for f80.
+    const mir_tag: Mir.Inst.Tag = switch (ty_bits) {
+        32 => .xorps,
+        64 => .xorpd,
+        else => return self.fail("TODO implement airNeg for {}", .{
+            ty.fmt(self.bin_file.options.module.?),
+        }),
+    };
+
+    var arena = std.heap.ArenaAllocator.init(self.gpa);
+    defer arena.deinit();
+
+    const ExpectedContents = union {
+        f16: Value.Payload.Float_16,
+        f32: Value.Payload.Float_32,
+        f64: Value.Payload.Float_64,
+        f80: Value.Payload.Float_80,
+        f128: Value.Payload.Float_128,
+    };
+    var stack align(@alignOf(ExpectedContents)) =
+        std.heap.stackFallback(@sizeOf(ExpectedContents), arena.allocator());
+
+    var vec_pl = Type.Payload.Array{
+        .base = .{ .tag = .vector },
+        .data = .{ .len = @divExact(128, ty_bits), .elem_type = ty },
+    };
+    const vec_ty = Type.initPayload(&vec_pl.base);
+
+    var sign_pl = Value.Payload.SubValue{
+        .base = .{ .tag = .repeated },
+        .data = try Value.floatToValue(-0.0, stack.get(), ty, self.target.*),
+    };
+    const sign_val = Value.initPayload(&sign_pl.base);
+    const sign_mcv = try self.genTypedValue(.{ .ty = vec_ty, .val = sign_val });
+
+    const src_mcv = try self.resolveInst(un_op);
+    const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, un_op, 0, src_mcv))
+        src_mcv
+    else
+        try self.copyToRegisterWithInstTracking(inst, ty, src_mcv);
+    const dst_lock = self.register_manager.lockReg(dst_mcv.register);
+    defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
+
+    try self.genBinOpMir(mir_tag, vec_ty, dst_mcv, sign_mcv);
+    return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none });
+}
+
 fn airUnaryMath(self: *Self, inst: Air.Inst.Index) !void {
     const un_op = self.air.instructions.items(.data)[inst].un_op;
     _ = un_op;
-    return self.fail("TODO implement airUnaryMath for {}", .{self.target.cpu.arch});
+    return self.fail("TODO implement airUnaryMath for {}", .{
+        self.air.instructions.items(.tag)[inst],
+    });
     //return self.finishAir(inst, result, .{ un_op, .none, .none });
 }
 
@@ -4109,7 +4197,6 @@ fn load(self: *Self, dst_mcv: MCValue, ptr_ty: Type, ptr_mcv: MCValue) InnerErro
 fn airLoad(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
     const elem_ty = self.air.typeOfIndex(inst);
-    const elem_size = elem_ty.abiSize(self.target.*);
     const result: MCValue = result: {
         if (!elem_ty.hasRuntimeBitsIgnoreComptime()) break :result .none;
 
@@ -4117,14 +4204,20 @@ fn airLoad(self: *Self, inst: Air.Inst.Index) !void {
         const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rdi, .rsi, .rcx });
         defer for (reg_locks) |lock| self.register_manager.unlockReg(lock);
 
+        const ptr_ty = self.air.typeOf(ty_op.operand);
+        const elem_size = elem_ty.abiSize(self.target.*);
+
+        const elem_rc = try self.regClassForType(elem_ty);
+        const ptr_rc = try self.regClassForType(ptr_ty);
+
         const ptr_mcv = try self.resolveInst(ty_op.operand);
-        const dst_mcv = if (elem_size <= 8 and self.reuseOperand(inst, ty_op.operand, 0, ptr_mcv))
+        const dst_mcv = if (elem_size <= 8 and elem_rc.supersetOf(ptr_rc) and
+            self.reuseOperand(inst, ty_op.operand, 0, ptr_mcv))
             // The MCValue that holds the pointer can be re-used as the value.
             ptr_mcv
         else
             try self.allocRegOrMem(inst, true);
 
-        const ptr_ty = self.air.typeOf(ty_op.operand);
         if (ptr_ty.ptrInfo().data.host_size > 0) {
             try self.packedLoad(dst_mcv, ptr_ty, ptr_mcv);
         } else {
@@ -4346,17 +4439,9 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
 
         switch (src_mcv) {
             .load_frame => |frame_addr| {
-                const field_abi_size = @intCast(u32, field_ty.abiSize(self.target.*));
-                const limb_abi_size = @min(field_abi_size, 8);
-                const limb_abi_bits = limb_abi_size * 8;
-                const field_byte_off = @intCast(i32, field_off / limb_abi_bits * limb_abi_size);
-                const field_bit_off = field_off % limb_abi_bits;
-
-                if (field_bit_off == 0) {
-                    const off_mcv = MCValue{ .load_frame = .{
-                        .index = frame_addr.index,
-                        .off = frame_addr.off + field_byte_off,
-                    } };
+                if (field_off % 8 == 0) {
+                    const off_mcv =
+                        src_mcv.address().offset(@intCast(i32, @divExact(field_off, 8))).deref();
                     if (self.reuseOperand(inst, operand, 0, src_mcv)) break :result off_mcv;
 
                     const dst_mcv = try self.allocRegOrMem(inst, true);
@@ -4364,6 +4449,12 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
                     break :result dst_mcv;
                 }
 
+                const field_abi_size = @intCast(u32, field_ty.abiSize(self.target.*));
+                const limb_abi_size = @min(field_abi_size, 8);
+                const limb_abi_bits = limb_abi_size * 8;
+                const field_byte_off = @intCast(i32, field_off / limb_abi_bits * limb_abi_size);
+                const field_bit_off = field_off % limb_abi_bits;
+
                 if (field_abi_size > 8) {
                     return self.fail("TODO implement struct_field_val with large packed field", .{});
                 }
@@ -5181,24 +5272,69 @@ fn genBinOp(
     switch (tag) {
         .add,
         .addwrap,
-        => try self.genBinOpMir(switch (lhs_ty.tag()) {
+        => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
             else => .add,
-            .f32 => .addss,
-            .f64 => .addsd,
+            .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
+                    .addss
+                else
+                    return self.fail("TODO implement genBinOp for {s} {} without sse", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+                    .addsd
+                else
+                    return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                else => return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+            },
         }, lhs_ty, dst_mcv, src_mcv),
 
         .sub,
         .subwrap,
-        => try self.genBinOpMir(switch (lhs_ty.tag()) {
+        => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
             else => .sub,
-            .f32 => .subss,
-            .f64 => .subsd,
+            .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
+                    .subss
+                else
+                    return self.fail("TODO implement genBinOp for {s} {} without sse", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+                    .subsd
+                else
+                    return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                else => return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+            },
         }, lhs_ty, dst_mcv, src_mcv),
 
-        .mul => try self.genBinOpMir(switch (lhs_ty.tag()) {
-            .f32 => .mulss,
-            .f64 => .mulsd,
+        .mul => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
             else => return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?) }),
+            .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
+                    .mulss
+                else
+                    return self.fail("TODO implement genBinOp for {s} {} without sse", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+                    .mulsd
+                else
+                    return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                else => return self.fail("TODO implement genBinOp for {s} {}", .{
+                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
+            },
         }, lhs_ty, dst_mcv, src_mcv),
 
         .div_float,
@@ -5206,12 +5342,27 @@ fn genBinOp(
         .div_trunc,
         .div_floor,
         => {
-            try self.genBinOpMir(switch (lhs_ty.tag()) {
-                .f32 => .divss,
-                .f64 => .divsd,
+            try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
                 else => return self.fail("TODO implement genBinOp for {s} {}", .{
                     @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
                 }),
+                .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                    32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
+                        .divss
+                    else
+                        return self.fail("TODO implement genBinOp for {s} {} without sse", .{
+                            @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                        }),
+                    64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+                        .divsd
+                    else
+                        return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
+                            @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                        }),
+                    else => return self.fail("TODO implement genBinOp for {s} {}", .{
+                        @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                    }),
+                },
             }, lhs_ty, dst_mcv, src_mcv);
             switch (tag) {
                 .div_float,
@@ -5222,16 +5373,18 @@ fn genBinOp(
                 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse4_1)) {
                     const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
                     const dst_alias = registerAlias(dst_mcv.register, abi_size);
-                    try self.asmRegisterRegisterImmediate(switch (lhs_ty.tag()) {
-                        .f32 => .roundss,
-                        .f64 => .roundsd,
+                    try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) {
+                        32 => .roundss,
+                        64 => .roundsd,
                         else => unreachable,
                     }, dst_alias, dst_alias, Immediate.u(switch (tag) {
                         .div_trunc => 0b1_0_11,
                         .div_floor => 0b1_0_01,
                         else => unreachable,
                     }));
-                } else return self.fail("TODO implement round without sse4_1", .{}),
+                } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{
+                    @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
+                }),
                 else => unreachable,
             }
         },
@@ -5453,39 +5606,68 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, ty: Type, dst_mcv: MCValue, s
                         )),
                     else => unreachable,
                 },
-                .register_offset,
                 .eflags,
+                .register_offset,
                 .memory,
+                .indirect,
                 .load_direct,
                 .lea_direct,
                 .load_got,
                 .lea_got,
                 .load_tlv,
                 .lea_tlv,
+                .load_frame,
                 .lea_frame,
                 => {
-                    assert(abi_size <= 8);
+                    blk: {
+                        return self.asmRegisterMemory(
+                            mir_tag,
+                            registerAlias(dst_reg, abi_size),
+                            Memory.sib(Memory.PtrSize.fromSize(abi_size), switch (src_mcv) {
+                                .memory => |addr| .{
+                                    .base = .{ .reg = .ds },
+                                    .disp = math.cast(i32, addr) orelse break :blk,
+                                },
+                                .indirect => |reg_off| .{
+                                    .base = .{ .reg = reg_off.reg },
+                                    .disp = reg_off.off,
+                                },
+                                .load_frame => |frame_addr| .{
+                                    .base = .{ .frame = frame_addr.index },
+                                    .disp = frame_addr.off,
+                                },
+                                else => break :blk,
+                            }),
+                        );
+                    }
+
                     const dst_reg_lock = self.register_manager.lockReg(dst_reg);
                     defer if (dst_reg_lock) |lock| self.register_manager.unlockReg(lock);
 
-                    const reg = try self.copyToTmpRegister(ty, src_mcv);
-                    return self.genBinOpMir(mir_tag, ty, dst_mcv, .{ .register = reg });
-                },
-                .indirect, .load_frame => try self.asmRegisterMemory(
-                    mir_tag,
-                    registerAlias(dst_reg, abi_size),
-                    Memory.sib(Memory.PtrSize.fromSize(abi_size), switch (src_mcv) {
-                        .indirect => |reg_off| .{
-                            .base = .{ .reg = reg_off.reg },
-                            .disp = reg_off.off,
+                    switch (src_mcv) {
+                        .eflags,
+                        .register_offset,
+                        .lea_direct,
+                        .lea_got,
+                        .lea_tlv,
+                        .lea_frame,
+                        => {
+                            const reg = try self.copyToTmpRegister(ty, src_mcv);
+                            return self.genBinOpMir(mir_tag, ty, dst_mcv, .{ .register = reg });
                         },
-                        .load_frame => |frame_addr| .{
-                            .base = .{ .frame = frame_addr.index },
-                            .disp = frame_addr.off,
+                        .memory,
+                        .load_direct,
+                        .load_got,
+                        .load_tlv,
+                        => {
+                            const addr_reg = try self.copyToTmpRegister(ty, src_mcv.address());
+                            return self.genBinOpMir(mir_tag, ty, dst_mcv, .{
+                                .indirect = .{ .reg = addr_reg },
+                            });
                         },
                         else => unreachable,
-                    }),
-                ),
+                    }
+                },
             }
         },
         .memory, .indirect, .load_got, .load_direct, .load_tlv, .load_frame => {
@@ -6175,10 +6357,25 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
     defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
     const src_mcv = if (flipped) lhs_mcv else rhs_mcv;
-    try self.genBinOpMir(switch (ty.tag()) {
+    try self.genBinOpMir(switch (ty.zigTypeTag()) {
         else => .cmp,
-        .f32 => .ucomiss,
-        .f64 => .ucomisd,
+        .Float => switch (ty.floatBits(self.target.*)) {
+            32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
+                .ucomiss
+            else
+                return self.fail("TODO implement airCmp for {} without sse", .{
+                    ty.fmt(self.bin_file.options.module.?),
+                }),
+            64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+                .ucomisd
+            else
+                return self.fail("TODO implement airCmp for {} without sse2", .{
+                    ty.fmt(self.bin_file.options.module.?),
+                }),
+            else => return self.fail("TODO implement airCmp for {}", .{
+                ty.fmt(self.bin_file.options.module.?),
+            }),
+        },
     }, ty, dst_mcv, src_mcv);
 
     const signedness = if (ty.isAbiInt()) ty.intInfo(self.target.*).signedness else .unsigned;
@@ -7608,7 +7805,8 @@ fn airBitCast(self: *Self, inst: Air.Inst.Index) !void {
         const dst_rc = try self.regClassForType(dst_ty);
         const src_rc = try self.regClassForType(src_ty);
         const operand = try self.resolveInst(ty_op.operand);
-        if (dst_rc.eql(src_rc) and self.reuseOperand(inst, ty_op.operand, 0, operand)) break :result operand;
+        if (dst_rc.supersetOf(src_rc) and self.reuseOperand(inst, ty_op.operand, 0, operand))
+            break :result operand;
 
         const operand_lock = switch (operand) {
             .register => |reg| self.register_manager.lockReg(reg),
@@ -7648,9 +7846,59 @@ fn airArrayToSlice(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airIntToFloat(self: *Self, inst: Air.Inst.Index) !void {
     const ty_op = self.air.instructions.items(.data)[inst].ty_op;
-    _ = ty_op;
-    return self.fail("TODO implement airIntToFloat for {}", .{self.target.cpu.arch});
-    //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+
+    const src_ty = self.air.typeOf(ty_op.operand);
+    const src_bits = @intCast(u32, src_ty.bitSize(self.target.*));
+    const src_signedness =
+        if (src_ty.isAbiInt()) src_ty.intInfo(self.target.*).signedness else .unsigned;
+    const dst_ty = self.air.typeOfIndex(inst);
+
+    const src_size = std.math.divCeil(u32, @max(switch (src_signedness) {
+        .signed => src_bits,
+        .unsigned => src_bits + 1,
+    }, 32), 8) catch unreachable;
+    if (src_size > 8) return self.fail("TODO implement airIntToFloat from {} to {}", .{
+        src_ty.fmt(self.bin_file.options.module.?),
+        dst_ty.fmt(self.bin_file.options.module.?),
+    });
+
+    const src_mcv = try self.resolveInst(ty_op.operand);
+    const src_reg = switch (src_mcv) {
+        .register => |reg| reg,
+        else => try self.copyToTmpRegister(src_ty, src_mcv),
+    };
+    const src_lock = self.register_manager.lockRegAssumeUnused(src_reg);
+    defer self.register_manager.unlockReg(src_lock);
+
+    if (src_bits < src_size * 8) try self.truncateRegister(src_ty, src_reg);
+
+    const dst_reg = try self.register_manager.allocReg(inst, try self.regClassForType(dst_ty));
+    const dst_mcv = MCValue{ .register = dst_reg };
+    const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg);
+    defer self.register_manager.unlockReg(dst_lock);
+
+    try self.asmRegisterRegister(switch (dst_ty.floatBits(self.target.*)) {
+        32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
+            .cvtsi2ss
+        else
+            return self.fail("TODO implement airIntToFloat from {} to {} without sse", .{
+                src_ty.fmt(self.bin_file.options.module.?),
+                dst_ty.fmt(self.bin_file.options.module.?),
+            }),
+        64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
+            .cvtsi2sd
+        else
+            return self.fail("TODO implement airIntToFloat from {} to {} without sse2", .{
+                src_ty.fmt(self.bin_file.options.module.?),
+                dst_ty.fmt(self.bin_file.options.module.?),
+            }),
+        else => return self.fail("TODO implement airIntToFloat from {} to {}", .{
+            src_ty.fmt(self.bin_file.options.module.?),
+            dst_ty.fmt(self.bin_file.options.module.?),
+        }),
+    }, dst_reg.to128(), registerAlias(src_reg, src_size));
+
+    return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
 }
 
 fn airFloatToInt(self: *Self, inst: Air.Inst.Index) !void {
@@ -8717,6 +8965,7 @@ fn resolveCallingConventionValues(
         },
         .C => {
             var param_reg_i: usize = 0;
+            var param_sse_reg_i: usize = 0;
             result.stack_align = 16;
 
             switch (self.target.os.tag) {
@@ -8734,26 +8983,39 @@ fn resolveCallingConventionValues(
                 // TODO: is this even possible for C calling convention?
                 result.return_value = InstTracking.init(.none);
             } else {
-                const ret_reg = abi.getCAbiIntReturnRegs(self.target.*)[0];
-                const ret_ty_size = @intCast(u31, ret_ty.abiSize(self.target.*));
-                if (ret_ty_size <= 8) {
-                    const aliased_reg = registerAlias(ret_reg, ret_ty_size);
-                    result.return_value = .{ .short = .{ .register = aliased_reg }, .long = .none };
-                } else {
-                    const ret_indirect_reg = abi.getCAbiIntParamRegs(self.target.*)[param_reg_i];
-                    param_reg_i += 1;
-                    result.return_value = .{
-                        .short = .{ .indirect = .{ .reg = ret_reg } },
-                        .long = .{ .indirect = .{ .reg = ret_indirect_reg } },
-                    };
+                const classes = switch (self.target.os.tag) {
+                    .windows => &[1]abi.Class{abi.classifyWindows(ret_ty, self.target.*)},
+                    else => mem.sliceTo(&abi.classifySystemV(ret_ty, self.target.*, .ret), .none),
+                };
+                if (classes.len > 1) {
+                    return self.fail("TODO handle multiple classes per type", .{});
                 }
+                const ret_reg = abi.getCAbiIntReturnRegs(self.target.*)[0];
+                result.return_value = switch (classes[0]) {
+                    .integer => InstTracking.init(.{ .register = registerAlias(
+                        ret_reg,
+                        @intCast(u32, ret_ty.abiSize(self.target.*)),
+                    ) }),
+                    .float, .sse => InstTracking.init(.{ .register = .xmm0 }),
+                    .memory => ret: {
+                        const ret_indirect_reg = abi.getCAbiIntParamRegs(self.target.*)[param_reg_i];
+                        param_reg_i += 1;
+                        break :ret .{
+                            .short = .{ .indirect = .{ .reg = ret_reg } },
+                            .long = .{ .indirect = .{ .reg = ret_indirect_reg } },
+                        };
+                    },
+                    else => |class| return self.fail("TODO handle calling convention class {s}", .{
+                        @tagName(class),
+                    }),
+                };
             }
 
             // Input params
             for (param_types, result.args) |ty, *arg| {
                 assert(ty.hasRuntimeBitsIgnoreComptime());
 
-                const classes: []const abi.Class = switch (self.target.os.tag) {
+                const classes = switch (self.target.os.tag) {
                     .windows => &[1]abi.Class{abi.classifyWindows(ty, self.target.*)},
                     else => mem.sliceTo(&abi.classifySystemV(ty, self.target.*, .arg), .none),
                 };
@@ -8761,13 +9023,29 @@ fn resolveCallingConventionValues(
                     return self.fail("TODO handle multiple classes per type", .{});
                 }
                 switch (classes[0]) {
-                    .integer => blk: {
-                        if (param_reg_i >= abi.getCAbiIntParamRegs(self.target.*).len) break :blk;
-                        const param_reg = abi.getCAbiIntParamRegs(self.target.*)[param_reg_i];
+                    .integer => if (param_reg_i < abi.getCAbiIntParamRegs(self.target.*).len) {
+                        arg.* = .{ .register = abi.getCAbiIntParamRegs(self.target.*)[param_reg_i] };
                         param_reg_i += 1;
-                        arg.* = .{ .register = param_reg };
                         continue;
                     },
+                    .float, .sse => switch (self.target.os.tag) {
+                        .windows => if (param_reg_i < 4) {
+                            arg.* = .{ .register = @intToEnum(
+                                Register,
+                                @enumToInt(Register.xmm0) + param_reg_i,
+                            ) };
+                            param_reg_i += 1;
+                            continue;
+                        },
+                        else => if (param_sse_reg_i < 8) {
+                            arg.* = .{ .register = @intToEnum(
+                                Register,
+                                @enumToInt(Register.xmm0) + param_sse_reg_i,
+                            ) };
+                            param_sse_reg_i += 1;
+                            continue;
+                        },
+                    },
                     .memory => {}, // fallthrough
                     else => |class| return self.fail("TODO handle calling convention class {s}", .{
                         @tagName(class),
src/arch/x86_64/encoder.zig
@@ -323,7 +323,7 @@ pub const Instruction = struct {
         var rex = Rex{};
         rex.present = inst.encoding.data.mode == .rex;
         switch (inst.encoding.data.mode) {
-            .long, .sse2_long => rex.w = true,
+            .long, .sse_long, .sse2_long => rex.w = true,
             else => {},
         }
 
src/arch/x86_64/Encoding.zig
@@ -58,7 +58,7 @@ pub fn findByMnemonic(
     next: for (mnemonic_to_encodings_map[@enumToInt(mnemonic)]) |data| {
         switch (data.mode) {
             .rex => if (!rex_required) continue,
-            .long, .sse2_long => {},
+            .long, .sse_long, .sse2_long => {},
             else => if (rex_required) continue,
         }
         for (input_ops, data.ops) |input_op, data_op|
@@ -90,7 +90,7 @@ pub fn findByOpcode(opc: []const u8, prefixes: struct {
         if (prefixes.rex.w) {
             switch (data.mode) {
                 .short, .fpu, .sse, .sse2, .sse4_1, .none => continue,
-                .long, .sse2_long, .rex => {},
+                .long, .sse_long, .sse2_long, .rex => {},
             }
         } else if (prefixes.rex.present and !prefixes.rex.isSet()) {
             switch (data.mode) {
@@ -138,7 +138,7 @@ pub fn modRmExt(encoding: Encoding) u3 {
 pub fn operandBitSize(encoding: Encoding) u64 {
     switch (encoding.data.mode) {
         .short => return 16,
-        .long, .sse2_long => return 64,
+        .long, .sse_long, .sse2_long => return 64,
         else => {},
     }
     const bit_size: u64 = switch (encoding.data.op_en) {
@@ -163,7 +163,7 @@ pub fn format(
     _ = options;
     _ = fmt;
     switch (encoding.data.mode) {
-        .long, .sse2_long => try writer.writeAll("REX.W + "),
+        .long, .sse_long, .sse2_long => try writer.writeAll("REX.W + "),
         else => {},
     }
 
@@ -269,21 +269,25 @@ pub const Mnemonic = enum {
     // SSE
     addss,
     cmpss,
+    cvtsi2ss,
     divss,
     maxss, minss,
     movss,
     mulss,
     subss,
     ucomiss,
+    xorps,
     // SSE2
     addsd,
     //cmpsd,
+    cvtsd2ss, cvtsi2sd, cvtss2sd,
     divsd,
     maxsd, minsd,
     movq, //movd, movsd,
     mulsd,
     subsd,
     ucomisd,
+    xorpd,
     // SSE4.1
     roundss,
     roundsd,
@@ -318,7 +322,7 @@ pub const Op = enum {
     m,
     moffs,
     sreg,
-    xmm, xmm_m32, xmm_m64,
+    xmm, xmm_m32, xmm_m64, xmm_m128,
     // zig fmt: on
 
     pub fn fromOperand(operand: Instruction.Operand) Op {
@@ -400,7 +404,7 @@ pub const Op = enum {
             .imm32, .imm32s, .eax, .r32, .m32, .rm32, .rel32, .xmm_m32 => 32,
             .imm64, .rax, .r64, .m64, .rm64, .xmm_m64 => 64,
             .m80 => 80,
-            .m128, .xmm => 128,
+            .m128, .xmm, .xmm_m128 => 128,
         };
     }
 
@@ -423,8 +427,8 @@ pub const Op = enum {
             .al, .ax, .eax, .rax,
             .r8, .r16, .r32, .r64,
             .rm8, .rm16, .rm32, .rm64,
-            .xmm, .xmm_m32, .xmm_m64,
-            =>  true,
+            .xmm, .xmm_m32, .xmm_m64, .xmm_m128,
+            => true,
             else => false,
         };
         // zig fmt: on
@@ -449,7 +453,7 @@ pub const Op = enum {
             .rm8, .rm16, .rm32, .rm64,
             .m8, .m16, .m32, .m64, .m80, .m128,
             .m,
-            .xmm_m32, .xmm_m64,
+            .xmm_m32, .xmm_m64, .xmm_m128,
             =>  true,
             else => false,
         };
@@ -470,13 +474,13 @@ pub const Op = enum {
             .r8, .r16, .r32, .r64 => .general_purpose,
             .rm8, .rm16, .rm32, .rm64 => .general_purpose,
             .sreg => .segment,
-            .xmm, .xmm_m32, .xmm_m64 => .floating_point,
+            .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point,
         };
     }
 
     pub fn isFloatingPointRegister(op: Op) bool {
         return switch (op) {
-            .xmm, .xmm_m32, .xmm_m64 => true,
+            .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => true, // every xmm operand form, including the new xmm_m128, yields true
             else => false,
         };
     }
@@ -535,6 +539,7 @@ pub const Mode = enum {
     rex,
     long,
     sse,
+    sse_long,
     sse2,
     sse2_long,
     sse4_1,
src/arch/x86_64/encodings.zig
@@ -834,6 +834,9 @@ pub const table = [_]Entry{
 
     .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .sse },
 
+    .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .sse },
+    .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .sse_long },
+
     .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .sse },
 
     .{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .sse },
@@ -849,11 +852,20 @@ pub const table = [_]Entry{
 
     .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .sse },
 
+    .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .sse },
+
     // SSE2
     .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .sse2 },
 
     .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .sse2 },
 
+    .{ .cvtsd2ss, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .sse2 },
+
+    .{ .cvtsi2sd, .rm, &.{ .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .sse2 },
+    .{ .cvtsi2sd, .rm, &.{ .xmm, .rm64 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .sse2_long },
+
+    .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .sse2 },
+
     .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .sse2 },
 
     .{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .sse2 },
@@ -878,6 +890,8 @@ pub const table = [_]Entry{
 
     .{ .ucomisd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x2e }, 0, .sse2 },
 
+    .{ .xorpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .sse2 },
+
     // SSE4.1
     .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .sse4_1 },
     .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .sse4_1 },
src/arch/x86_64/Lower.zig
@@ -95,6 +95,7 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction {
 
         .addss,
         .cmpss,
+        .cvtsi2ss,
         .divss,
         .maxss,
         .minss,
@@ -103,8 +104,12 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction {
         .roundss,
         .subss,
         .ucomiss,
+        .xorps,
         .addsd,
         .cmpsd,
+        .cvtsd2ss,
+        .cvtsi2sd,
+        .cvtss2sd,
         .divsd,
         .maxsd,
         .minsd,
@@ -113,6 +118,7 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction {
         .roundsd,
         .subsd,
         .ucomisd,
+        .xorpd,
         => try lower.mirGeneric(inst),
 
         .cmps,
src/arch/x86_64/Mir.zig
@@ -170,6 +170,8 @@ pub const Inst = struct {
         addss,
         /// Compare scalar single-precision floating-point values
         cmpss,
+        /// Convert doubleword integer to scalar single-precision floating-point value
+        cvtsi2ss,
         /// Divide scalar single-precision floating-point values
         divss,
         /// Return maximum single-precision floating-point value
@@ -186,10 +188,18 @@ pub const Inst = struct {
         subss,
         /// Unordered compare scalar single-precision floating-point values
         ucomiss,
+        /// Bitwise logical xor of packed single precision floating-point values
+        xorps,
         /// Add double precision floating point values
         addsd,
         /// Compare scalar double-precision floating-point values
         cmpsd,
+        /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value
+        cvtsd2ss,
+        /// Convert doubleword integer to scalar double-precision floating-point value
+        cvtsi2sd,
+        /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
+        cvtss2sd,
         /// Divide scalar double-precision floating-point values
         divsd,
         /// Return maximum double-precision floating-point value
@@ -206,6 +216,8 @@ pub const Inst = struct {
         subsd,
         /// Unordered compare scalar double-precision floating-point values
         ucomisd,
+        /// Bitwise logical xor of packed double precision floating-point values
+        xorpd,
 
         /// Compare string operands
         cmps,
src/codegen.zig
@@ -291,6 +291,20 @@ pub fn generateSymbol(
             },
         },
         .Pointer => switch (typed_value.val.tag()) {
+            .null_value => {
+                switch (target.cpu.arch.ptrBitWidth()) {
+                    32 => {
+                        mem.writeInt(u32, try code.addManyAsArray(4), 0, endian);
+                        if (typed_value.ty.isSlice()) try code.appendNTimes(0xaa, 4);
+                    },
+                    64 => {
+                        mem.writeInt(u64, try code.addManyAsArray(8), 0, endian);
+                        if (typed_value.ty.isSlice()) try code.appendNTimes(0xaa, 8);
+                    },
+                    else => unreachable,
+                }
+                return Result.ok;
+            },
             .zero, .one, .int_u64, .int_big_positive => {
                 switch (target.cpu.arch.ptrBitWidth()) {
                     32 => {
@@ -397,30 +411,15 @@ pub fn generateSymbol(
                     },
                 }
             },
-            .elem_ptr => {
-                const elem_ptr = typed_value.val.castTag(.elem_ptr).?.data;
-                const elem_size = typed_value.ty.childType().abiSize(target);
-                const addend = @intCast(u32, elem_ptr.index * elem_size);
-                const array_ptr = elem_ptr.array_ptr;
-
-                switch (array_ptr.tag()) {
-                    .decl_ref => {
-                        const decl_index = array_ptr.castTag(.decl_ref).?.data;
-                        return lowerDeclRef(bin_file, src_loc, typed_value, decl_index, code, debug_output, .{
-                            .parent_atom_index = reloc_info.parent_atom_index,
-                            .addend = (reloc_info.addend orelse 0) + addend,
-                        });
-                    },
-                    else => return Result{
-                        .fail = try ErrorMsg.create(
-                            bin_file.allocator,
-                            src_loc,
-                            "TODO implement generateSymbol for pointer type value: '{s}'",
-                            .{@tagName(typed_value.val.tag())},
-                        ),
-                    },
-                }
-            },
+            .elem_ptr => return lowerParentPtr(
+                bin_file,
+                src_loc,
+                typed_value,
+                typed_value.val,
+                code,
+                debug_output,
+                reloc_info,
+            ),
             else => return Result{
                 .fail = try ErrorMsg.create(
                     bin_file.allocator,
@@ -838,9 +837,62 @@ pub fn generateSymbol(
     }
 }
 
+fn lowerParentPtr( // Lowers a pointer value by following its chain of parent pointers until a decl_ref is reached.
+    bin_file: *link.File,
+    src_loc: Module.SrcLoc,
+    typed_value: TypedValue, // forwarded unchanged through every recursion step and on to lowerDeclRef
+    parent_ptr: Value, // pointer value examined at this step; generateSymbol starts the walk with typed_value.val
+    code: *std.ArrayList(u8),
+    debug_output: DebugInfoOutput,
+    reloc_info: RelocInfo, // carries the addend accumulated so far along the chain
+) CodeGenError!Result {
+    const target = bin_file.options.target;
+
+    switch (parent_ptr.tag()) {
+        .elem_ptr => { // array element: recurse into the array pointer, adding index * ABI size of the element type to the addend
+            const elem_ptr = parent_ptr.castTag(.elem_ptr).?.data;
+            return lowerParentPtr(
+                bin_file,
+                src_loc,
+                typed_value,
+                elem_ptr.array_ptr,
+                code,
+                debug_output,
+                reloc_info.offset(@intCast(u32, elem_ptr.index * elem_ptr.elem_ty.abiSize(target))),
+            );
+        },
+        .decl_ref => { // end of the chain: hand off to lowerDeclRef with the accumulated reloc_info
+            const decl_index = parent_ptr.castTag(.decl_ref).?.data;
+            return lowerDeclRef(
+                bin_file,
+                src_loc,
+                typed_value,
+                decl_index,
+                code,
+                debug_output,
+                reloc_info,
+            );
+        },
+        else => |t| { // any other parent tag is not lowered; a failure Result naming the tag is returned instead
+            return Result{
+                .fail = try ErrorMsg.create(
+                    bin_file.allocator,
+                    src_loc,
+                    "TODO implement lowerParentPtr for type '{s}'",
+                    .{@tagName(t)},
+                ),
+            };
+        },
+    }
+}
+
 const RelocInfo = struct {
     parent_atom_index: u32,
     addend: ?u32 = null,
+
+    fn offset(ri: RelocInfo, addend: u32) RelocInfo { // returns a copy with the same parent_atom_index and addend increased by the argument
+        return .{ .parent_atom_index = ri.parent_atom_index, .addend = (ri.addend orelse 0) + addend }; // a null addend is treated as 0
+    }
 };
 
 fn lowerDeclRef(
@@ -1095,6 +1147,9 @@ pub fn genTypedValue(
             .Slice => {},
             else => {
                 switch (typed_value.val.tag()) {
+                    .null_value => {
+                        return GenResult.mcv(.{ .immediate = 0 });
+                    },
                     .int_u64 => {
                         return GenResult.mcv(.{ .immediate = typed_value.val.toUnsignedInt(target) });
                     },