Commit 6d6cf59847

Andrew Kelley <andrew@ziglang.org>
2021-10-11 20:00:32
stage2: support nested structs and arrays and sret
* Add AIR instructions: ret_ptr, ret_load
  - This allows Sema to be blissfully unaware of the backend's decision to
    implement by-val/by-ref semantics for struct/union/array types. Backends
    can lower these simply as alloc, load, and ret instructions, or they can
    take advantage of them to use a result pointer.
* Add AIR instruction: array_elem_val
  - Allows for better codegen for `Sema.elemVal`.
* Implement calculation of ABI alignment and ABI size for unions.
* Before appending the following AIR instructions to a block,
  resolveTypeLayout is called on the type:
  - call - return type
  - ret - return type
  - store_ptr - elem type
* Sema: fix a memory leak in `zirArrayInit` and other cleanups to this function.
* x86_64: implement the full x86_64 C ABI according to the spec.
* Type: implement `intInfo` for error sets.
* Type: implement `intTagType` for tagged unions.

The Zig type tag `Fn` is now used exclusively for function bodies. Function
pointers are modeled as `*const T` where `T` is a `Fn` type (see the sketch
below).

* The `call` AIR instruction now allows a function pointer operand as well as
  a function operand.
* Sema now has a coercion from function body to function pointer.
* Function type syntax, e.g. `fn()void`, now returns a Zig type tag of Pointer
  with child Fn, rather than Fn directly.
  - I think this should probably be reverted. Will discuss the lang spec
    before doing this. The idea being that function pointers would need to be
    specified as `*const fn() void` rather than `fn() void`.

LLVM backend:
* Enable calling the panic handler (previously this just emitted
  `@breakpoint()` since the backend could not handle the panic function).
* Implement sret.
* Introduce `isByRef` and implement it for structs and arrays. Types that are
  `isByRef` are now passed as pointers to functions, and e.g. `elem_val` will
  return a pointer instead of doing a load.
* Move the function-type creation code from `resolveLlvmFunction` to
  `llvmType` where it belongs; now there is only one instance of this logic
  instead of two.
* Add the `nonnull` attribute to non-optional pointer parameters.
* Fix `resolveGlobalDecl` not using fully-qualified names and not using the
  `decl_map`.
* Implement `genTypedValue` for pointer-like optionals.
* Fix a memory leak when lowering the `block` instruction and OOM occurs.
* Implement volatile checks where relevant.
1 parent f42725c
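
The function-pointer change is easiest to see in source form. A minimal
sketch, assuming the `*const fn () void` spelling proposed in the note above
(`callIt` and `hello` are illustrative names, not code from this commit):

    const std = @import("std");

    fn hello() void {
        std.debug.print("hello\n", .{});
    }

    // `fn () void` is now a function body type; a runtime-known callee
    // is carried as a pointer to a body.
    fn callIt(f: *const fn () void) void {
        f(); // the `call` AIR instruction accepts a function pointer operand
    }

    pub fn main() void {
        callIt(hello); // Sema coerces the function body to a function pointer
    }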
src/arch/x86_64/abi.zig
@@ -0,0 +1,337 @@
+const std = @import("std");
+const Type = @import("../../type.zig").Type;
+const Target = std.Target;
+const assert = std.debug.assert;
+
+pub const Class = enum { integer, sse, sseup, x87, x87up, complex_x87, memory, none };
+
+pub fn classifyWindows(ty: Type, target: Target) Class {
+    // https://docs.microsoft.com/en-gb/cpp/build/x64-calling-convention?view=vs-2017
+    // "There's a strict one-to-one correspondence between a function call's arguments
+    // and the registers used for those arguments. Any argument that doesn't fit in 8
+    // bytes, or isn't 1, 2, 4, or 8 bytes, must be passed by reference. A single argument
+    // is never spread across multiple registers."
+    // "Structs and unions of size 8, 16, 32, or 64 bits, and __m64 types, are passed
+    // as if they were integers of the same size."
+    switch (ty.abiSize(target)) {
+        1, 2, 4, 8 => {},
+        else => return .memory,
+    }
+    return switch (ty.zigTypeTag()) {
+        .Int, .Bool, .Enum, .Void, .NoReturn, .ErrorSet, .Struct, .Union => .integer,
+        .Optional => if (ty.isPtrLikeOptional()) return .integer else return .memory,
+        .Float, .Vector => .sse,
+        else => unreachable,
+    };
+}
+
+/// There is a maximum of 8 possible return slots. Returned values are in
+/// the beginning of the array; unused slots are filled with .none.
+pub fn classifySystemV(ty: Type, target: Target) [8]Class {
+    const memory_class = [_]Class{
+        .memory, .none, .none, .none,
+        .none,   .none, .none, .none,
+    };
+    var result = [1]Class{.none} ** 8;
+    switch (ty.zigTypeTag()) {
+        .Int, .Enum, .ErrorSet => {
+            const bits = ty.intInfo(target).bits;
+            if (bits <= 64) {
+                result[0] = .integer;
+                return result;
+            }
+            if (bits <= 128) {
+                result[0] = .integer;
+                result[1] = .integer;
+                return result;
+            }
+            if (bits <= 192) {
+                result[0] = .integer;
+                result[1] = .integer;
+                result[2] = .integer;
+                return result;
+            }
+            if (bits <= 256) {
+                result[0] = .integer;
+                result[1] = .integer;
+                result[2] = .integer;
+                result[3] = .integer;
+                return result;
+            }
+            return memory_class;
+        },
+        .Bool, .Void, .NoReturn => {
+            result[0] = .integer;
+            return result;
+        },
+        .Float => switch (ty.floatBits(target)) {
+            16, 32, 64 => {
+                result[0] = .sse;
+                return result;
+            },
+            128 => {
+                // "Arguments of types__float128,_Decimal128and__m128are
+                // split into two halves.  The least significant ones belong
+                // to class SSE, the mostsignificant one to class SSEUP."
+                result[0] = .sse;
+                result[1] = .sseup;
+                return result;
+            },
+            else => {
+                // "The 64-bit mantissa of arguments of typelong double
+                // belongs to classX87, the 16-bit exponent plus 6 bytes
+                // of padding belongs to class X87UP."
+                result[0] = .x87;
+                result[1] = .x87up;
+                return result;
+            },
+        },
+        .Vector => {
+            const elem_ty = ty.childType();
+            const bits = elem_ty.bitSize(target) * ty.arrayLen();
+            if (bits <= 64) return .{
+                .sse,  .none, .none, .none,
+                .none, .none, .none, .none,
+            };
+            if (bits <= 128) return .{
+                .sse,  .sseup, .none, .none,
+                .none, .none,  .none, .none,
+            };
+            if (bits <= 192) return .{
+                .sse,  .sseup, .sseup, .none,
+                .none, .none,  .none,  .none,
+            };
+            if (bits <= 256) return .{
+                .sse,  .sseup, .sseup, .sseup,
+                .none, .none,  .none,  .none,
+            };
+            if (bits <= 320) return .{
+                .sse,   .sseup, .sseup, .sseup,
+                .sseup, .none,  .none,  .none,
+            };
+            if (bits <= 384) return .{
+                .sse,   .sseup, .sseup, .sseup,
+                .sseup, .sseup, .none,  .none,
+            };
+            if (bits <= 448) return .{
+                .sse,   .sseup, .sseup, .sseup,
+                .sseup, .sseup, .sseup, .none,
+            };
+            if (bits <= 512) return .{
+                .sse,   .sseup, .sseup, .sseup,
+                .sseup, .sseup, .sseup, .sseup,
+            };
+            return memory_class;
+        },
+        .Optional => {
+            if (ty.isPtrLikeOptional()) {
+                result[0] = .integer;
+                return result;
+            }
+            return memory_class;
+        },
+        .Struct => {
+            // "If the size of an object is larger than eight eightbytes, or
+            // it contains unaligned fields, it has class MEMORY"
+            // "If the size of the aggregate exceeds a single eightbyte, each is classified
+            // separately.".
+            const ty_size = ty.abiSize(target);
+            if (ty_size > 64)
+                return memory_class;
+
+            var result_i: usize = 0; // out of 8
+            var byte_i: usize = 0; // out of 8
+            const fields = ty.structFields();
+            for (fields.values()) |field| {
+                if (field.abi_align.tag() != .abi_align_default) {
+                    const field_alignment = field.abi_align.toUnsignedInt();
+                    if (field_alignment < field.ty.abiAlignment(target)) {
+                        return memory_class;
+                    }
+                }
+                const field_size = field.ty.abiSize(target);
+                const field_class_array = classifySystemV(field.ty, target);
+                const field_class = std.mem.sliceTo(&field_class_array, .none);
+                if (byte_i + field_size <= 8) {
+                    // Combine this field with the previous one.
+                    combine: {
+                        // "If both classes are equal, this is the resulting class."
+                        if (result[result_i] == field_class[0]) {
+                            break :combine;
+                        }
+
+                        // "If one of the classes is NO_CLASS, the resulting class
+                        // is the other class."
+                        if (result[result_i] == .none) {
+                            result[result_i] = field_class[0];
+                            break :combine;
+                        }
+                        assert(field_class[0] != .none);
+
+                        // "If one of the classes is MEMORY, the result is the MEMORY class."
+                        if (result[result_i] == .memory or field_class[0] == .memory) {
+                            result[result_i] = .memory;
+                            break :combine;
+                        }
+
+                        // "If one of the classes is INTEGER, the result is the INTEGER."
+                        if (result[result_i] == .integer or field_class[0] == .integer) {
+                            result[result_i] = .integer;
+                            break :combine;
+                        }
+
+                        // "If one of the classes is X87, X87UP, COMPLEX_X87 class,
+                        // MEMORY is used as class."
+                        if (result[result_i] == .x87 or
+                            result[result_i] == .x87up or
+                            result[result_i] == .complex_x87 or
+                            field_class[0] == .x87 or
+                            field_class[0] == .x87up or
+                            field_class[0] == .complex_x87)
+                        {
+                            result[result_i] = .memory;
+                            break :combine;
+                        }
+
+                        // "Otherwise class SSE is used."
+                        result[result_i] = .sse;
+                    }
+                    byte_i += field_size;
+                    if (byte_i == 8) {
+                        byte_i = 0;
+                        result_i += 1;
+                    }
+                } else {
+                    // Cannot combine this field with the previous one.
+                    if (byte_i != 0) {
+                        byte_i = 0;
+                        result_i += 1;
+                    }
+                    std.mem.copy(Class, result[result_i..], field_class);
+                    result_i += field_class.len;
+                    // If there are any bytes leftover, we have to try to combine
+                    // the next field with them.
+                    byte_i = field_size % 8;
+                    if (byte_i != 0) result_i -= 1;
+                }
+            }
+
+            // Post-merger cleanup
+
+            // "If one of the classes is MEMORY, the whole argument is passed in memory"
+            // "If X87UP is not preceded by X87, the whole argument is passed in memory."
+            var found_sseup = false;
+            for (result) |item, i| switch (item) {
+                .memory => return memory_class,
+                .x87up => if (i == 0 or result[i - 1] != .x87) return memory_class,
+                .sseup => found_sseup = true,
+                else => continue,
+            };
+            // "If the size of the aggregate exceeds two eightbytes and the first eight-
+            // byte isn’t SSE or any other eightbyte isn’t SSEUP, the whole argument
+            // is passed in memory."
+            if (ty_size > 16 and (result[0] != .sse or !found_sseup)) return memory_class;
+
+            // "If SSEUP is not preceded by SSE or SSEUP, it is converted to SSE."
+            for (result) |*item, i| {
+                if (item.* == .sseup) switch (result[i - 1]) {
+                    .sse, .sseup => continue,
+                    else => item.* = .sse,
+                };
+            }
+            return result;
+        },
+        .Union => {
+            // "If the size of an object is larger than eight eightbytes, or
+            // it contains unaligned fields, it has class MEMORY"
+            // "If the size of the aggregate exceeds a single eightbyte, each is classified
+            // separately.".
+            const ty_size = ty.abiSize(target);
+            if (ty_size > 64)
+                return memory_class;
+
+            const fields = ty.unionFields();
+            for (fields.values()) |field| {
+                if (field.abi_align.tag() != .abi_align_default) {
+                    const field_alignment = field.abi_align.toUnsignedInt();
+                    if (field_alignment < field.ty.abiAlignment(target)) {
+                        return memory_class;
+                    }
+                }
+                // Combine this field with the previous one.
+                const field_class = classifySystemV(field.ty, target);
+                for (result) |*result_item, i| {
+                    const field_item = field_class[i];
+                    // "If both classes are equal, this is the resulting class."
+                    if (result_item.* == field_item) {
+                        continue;
+                    }
+
+                    // "If one of the classes is NO_CLASS, the resulting class
+                    // is the other class."
+                    if (result_item.* == .none) {
+                        result_item.* = field_item;
+                        continue;
+                    }
+                    if (field_item == .none) {
+                        continue;
+                    }
+
+                    // "If one of the classes is MEMORY, the result is the MEMORY class."
+                    if (result_item.* == .memory or field_item == .memory) {
+                        result_item.* = .memory;
+                        continue;
+                    }
+
+                    // "If one of the classes is INTEGER, the result is the INTEGER."
+                    if (result_item.* == .integer or field_item == .integer) {
+                        result_item.* = .integer;
+                        continue;
+                    }
+
+                    // "If one of the classes is X87, X87UP, COMPLEX_X87 class,
+                    // MEMORY is used as class."
+                    if (result_item.* == .x87 or
+                        result_item.* == .x87up or
+                        result_item.* == .complex_x87 or
+                        field_item == .x87 or
+                        field_item == .x87up or
+                        field_item == .complex_x87)
+                    {
+                        result_item.* = .memory;
+                        continue;
+                    }
+
+                    // "Otherwise class SSE is used."
+                    result_item.* = .sse;
+                }
+            }
+
+            // Post-merger cleanup
+
+            // "If one of the classes is MEMORY, the whole argument is passed in memory"
+            // "If X87UP is not preceded by X87, the whole argument is passed in memory."
+            var found_sseup = false;
+            for (result) |item, i| switch (item) {
+                .memory => return memory_class,
+                .x87up => if (i == 0 or result[i - 1] != .x87) return memory_class,
+                .sseup => found_sseup = true,
+                else => continue,
+            };
+            // "If the size of the aggregate exceeds two eightbytes and the first eight-
+            // byte isn’t SSE or any other eightbyte isn’t SSEUP, the whole argument
+            // is passed in memory."
+            if (ty_size > 16 and (result[0] != .sse or !found_sseup)) return memory_class;
+
+            // "If SSEUP is not preceded by SSE or SSEUP, it is converted to SSE."
+            for (result) |*item, i| {
+                if (item.* == .sseup) switch (result[i - 1]) {
+                    .sse, .sseup => continue,
+                    else => item.* = .sse,
+                };
+            }
+            return result;
+        },
+        else => unreachable,
+    }
+}
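
Both aggregate cases above reduce to the psABI's pairwise merge of eightbyte
classes. A standalone sketch of just that merge rule, with a few spot checks
(`combine` is an illustrative helper, not part of this commit):

    const std = @import("std");

    const Class = enum { integer, sse, sseup, x87, x87up, complex_x87, memory, none };

    fn combine(a: Class, b: Class) Class {
        // "If both classes are equal, this is the resulting class."
        if (a == b) return a;
        // "If one of the classes is NO_CLASS, the resulting class is the other class."
        if (a == .none) return b;
        if (b == .none) return a;
        // "If one of the classes is MEMORY, the result is the MEMORY class."
        if (a == .memory or b == .memory) return .memory;
        // "If one of the classes is INTEGER, the result is the INTEGER."
        if (a == .integer or b == .integer) return .integer;
        // "If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as class."
        switch (a) {
            .x87, .x87up, .complex_x87 => return .memory,
            else => {},
        }
        switch (b) {
            .x87, .x87up, .complex_x87 => return .memory,
            else => {},
        }
        // "Otherwise class SSE is used."
        return .sse;
    }

    test "psABI eightbyte merge" {
        try std.testing.expectEqual(Class.integer, combine(.integer, .sse));
        try std.testing.expectEqual(Class.sse, combine(.none, .sse));
        try std.testing.expectEqual(Class.memory, combine(.x87, .sse));
    }

For example, `struct { a: i32, b: f32 }` occupies a single eightbyte whose two
halves classify as INTEGER and SSE; merging yields INTEGER, so the whole
struct is passed in a general-purpose register.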
src/codegen/llvm/bindings.zig
@@ -163,6 +163,18 @@ pub const Value = opaque {
 
     pub const deleteFunction = LLVMDeleteFunction;
     extern fn LLVMDeleteFunction(Fn: *const Value) void;
+
+    pub const addSretAttr = ZigLLVMAddSretAttr;
+    extern fn ZigLLVMAddSretAttr(fn_ref: *const Value, ArgNo: c_uint, type_val: *const Type) void;
+
+    pub const setCallSret = ZigLLVMSetCallSret;
+    extern fn ZigLLVMSetCallSret(Call: *const Value, return_type: *const Type) void;
+
+    pub const getParam = LLVMGetParam;
+    extern fn LLVMGetParam(Fn: *const Value, Index: c_uint) *const Value;
+
+    pub const setInitializer = LLVMSetInitializer;
+    extern fn LLVMSetInitializer(GlobalVar: *const Value, ConstantVal: *const Value) void;
 };
 
 pub const Type = opaque {
@@ -292,12 +304,6 @@ pub const VerifierFailureAction = enum(c_int) {
 pub const constNeg = LLVMConstNeg;
 extern fn LLVMConstNeg(ConstantVal: *const Value) *const Value;
 
-pub const setInitializer = LLVMSetInitializer;
-extern fn LLVMSetInitializer(GlobalVar: *const Value, ConstantVal: *const Value) void;
-
-pub const getParam = LLVMGetParam;
-extern fn LLVMGetParam(Fn: *const Value, Index: c_uint) *const Value;
-
 pub const getEnumAttributeKindForName = LLVMGetEnumAttributeKindForName;
 extern fn LLVMGetEnumAttributeKindForName(Name: [*]const u8, SLen: usize) c_uint;
 
src/codegen/c.zig
@@ -384,12 +384,6 @@ pub const DeclGen = struct {
                 }
             },
             .Fn => switch (val.tag()) {
-                .null_value, .zero => try writer.writeAll("NULL"),
-                .one => try writer.writeAll("1"),
-                .decl_ref => {
-                    const decl = val.castTag(.decl_ref).?.data;
-                    return dg.renderDeclValue(writer, ty, val, decl);
-                },
                 .function => {
                     const decl = val.castTag(.function).?.data.owner_decl;
                     return dg.renderDeclValue(writer, ty, val, decl);
@@ -1026,6 +1020,7 @@ fn genBody(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, OutO
             .is_non_null_ptr => try airIsNull(f, inst, "!=", "[0]"),
 
             .alloc            => try airAlloc(f, inst),
+            .ret_ptr          => try airRetPtr(f, inst),
             .assembly         => try airAsm(f, inst),
             .block            => try airBlock(f, inst),
             .bitcast          => try airBitcast(f, inst),
@@ -1036,6 +1031,7 @@ fn genBody(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, OutO
             .bool_to_int      => try airBoolToInt(f, inst),
             .load             => try airLoad(f, inst),
             .ret              => try airRet(f, inst),
+            .ret_load         => try airRetLoad(f, inst),
             .store            => try airStore(f, inst),
             .loop             => try airLoop(f, inst),
             .cond_br          => try airCondBr(f, inst),
@@ -1081,6 +1077,7 @@ fn genBody(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, OutO
             .ptr_elem_ptr       => try airPtrElemPtr(f, inst),
             .slice_elem_val     => try airSliceElemVal(f, inst, "["),
             .ptr_slice_elem_val => try airSliceElemVal(f, inst, "[0]["),
+            .array_elem_val     => try airArrayElemVal(f, inst),
 
             .unwrap_errunion_payload     => try airUnwrapErrUnionPay(f, inst),
             .unwrap_errunion_err         => try airUnwrapErrUnionErr(f, inst),
@@ -1148,6 +1145,22 @@ fn airSliceElemVal(f: *Function, inst: Air.Inst.Index, prefix: []const u8) !CVal
     return local;
 }
 
+fn airArrayElemVal(f: *Function, inst: Air.Inst.Index) !CValue {
+    if (f.liveness.isUnused(inst)) return CValue.none;
+
+    const bin_op = f.air.instructions.items(.data)[inst].bin_op;
+    const array = try f.resolveInst(bin_op.lhs);
+    const index = try f.resolveInst(bin_op.rhs);
+    const writer = f.object.writer();
+    const local = try f.allocLocal(f.air.typeOfIndex(inst), .Const);
+    try writer.writeAll(" = ");
+    try f.writeCValue(writer, array);
+    try writer.writeAll("[");
+    try f.writeCValue(writer, index);
+    try writer.writeAll("];\n");
+    return local;
+}
+
 fn airAlloc(f: *Function, inst: Air.Inst.Index) !CValue {
     const writer = f.object.writer();
     const inst_ty = f.air.typeOfIndex(inst);
@@ -1161,6 +1174,18 @@ fn airAlloc(f: *Function, inst: Air.Inst.Index) !CValue {
     return CValue{ .local_ref = local.local };
 }
 
+fn airRetPtr(f: *Function, inst: Air.Inst.Index) !CValue {
+    const writer = f.object.writer();
+    const inst_ty = f.air.typeOfIndex(inst);
+
+    // First line: the variable used as data storage.
+    const elem_type = inst_ty.elemType();
+    const local = try f.allocLocal(elem_type, .Mut);
+    try writer.writeAll(";\n");
+
+    return CValue{ .local_ref = local.local };
+}
+
 fn airArg(f: *Function) CValue {
     const i = f.next_arg_index;
     f.next_arg_index += 1;
@@ -1212,6 +1237,21 @@ fn airRet(f: *Function, inst: Air.Inst.Index) !CValue {
     return CValue.none;
 }
 
+fn airRetLoad(f: *Function, inst: Air.Inst.Index) !CValue {
+    const un_op = f.air.instructions.items(.data)[inst].un_op;
+    const writer = f.object.writer();
+    const ptr_ty = f.air.typeOf(un_op);
+    const ret_ty = ptr_ty.childType();
+    if (!ret_ty.hasCodeGenBits()) {
+        try writer.writeAll("return;\n");
+        // Nothing to load for a 0-bit return type; stop before emitting
+        // a second, invalid `return *...` statement below.
+        return CValue.none;
+    }
+    const ptr = try f.resolveInst(un_op);
+    try writer.writeAll("return *");
+    try f.writeCValue(writer, ptr);
+    try writer.writeAll(";\n");
+    return CValue.none;
+}
+
 fn airIntCast(f: *Function, inst: Air.Inst.Index) !CValue {
     if (f.liveness.isUnused(inst))
         return CValue.none;
@@ -1559,7 +1599,12 @@ fn airCall(f: *Function, inst: Air.Inst.Index) !CValue {
     const pl_op = f.air.instructions.items(.data)[inst].pl_op;
     const extra = f.air.extraData(Air.Call, pl_op.payload);
     const args = @bitCast([]const Air.Inst.Ref, f.air.extra[extra.end..][0..extra.data.args_len]);
-    const fn_ty = f.air.typeOf(pl_op.operand);
+    const callee_ty = f.air.typeOf(pl_op.operand);
+    const fn_ty = switch (callee_ty.zigTypeTag()) {
+        .Fn => callee_ty,
+        .Pointer => callee_ty.childType(),
+        else => unreachable,
+    };
     const ret_ty = fn_ty.fnReturnType();
     const unused_result = f.liveness.isUnused(inst);
     const writer = f.object.writer();
@@ -1574,16 +1619,21 @@ fn airCall(f: *Function, inst: Air.Inst.Index) !CValue {
         try writer.writeAll(" = ");
     }
 
-    if (f.air.value(pl_op.operand)) |func_val| {
-        const fn_decl = if (func_val.castTag(.extern_fn)) |extern_fn|
-            extern_fn.data
-        else if (func_val.castTag(.function)) |func_payload|
-            func_payload.data.owner_decl
-        else
-            unreachable;
-
-        try f.object.dg.renderDeclName(fn_decl, writer);
-    } else {
+    callee: {
+        known: {
+            const fn_decl = fn_decl: {
+                const callee_val = f.air.value(pl_op.operand) orelse break :known;
+                break :fn_decl switch (callee_val.tag()) {
+                    .extern_fn => callee_val.castTag(.extern_fn).?.data,
+                    .function => callee_val.castTag(.function).?.data.owner_decl,
+                    .decl_ref => callee_val.castTag(.decl_ref).?.data,
+                    else => break :known,
+                };
+            };
+            try f.object.dg.renderDeclName(fn_decl, writer);
+            break :callee;
+        }
+        // Fall back to function pointer call.
         const callee = try f.resolveInst(pl_op.operand);
         try f.writeCValue(writer, callee);
     }
src/codegen/llvm.zig
@@ -21,6 +21,8 @@ const Type = @import("../type.zig").Type;
 
 const LazySrcLoc = Module.LazySrcLoc;
 
+const Error = error{ OutOfMemory, CodegenFail };
+
 pub fn targetTriple(allocator: *Allocator, target: std.Target) ![:0]u8 {
     const llvm_arch = switch (target.cpu.arch) {
         .arm => "arm",
@@ -410,10 +412,18 @@ pub const Object = struct {
 
         // This gets the LLVM values from the function and stores them in `dg.args`.
         const fn_info = decl.ty.fnInfo();
-        var args = try dg.gpa.alloc(*const llvm.Value, fn_info.param_types.len);
+        const ret_ty_by_ref = isByRef(fn_info.return_type);
+        const ret_ptr = if (ret_ty_by_ref) llvm_func.getParam(0) else null;
+
+        var args = std.ArrayList(*const llvm.Value).init(dg.gpa);
+        defer args.deinit();
 
-        for (args) |*arg, i| {
-            arg.* = llvm.getParam(llvm_func, @intCast(c_uint, i));
+        const param_offset: c_uint = @boolToInt(ret_ptr != null);
+        for (fn_info.param_types) |param_ty| {
+            if (!param_ty.hasCodeGenBits()) continue;
+
+            const llvm_arg_i = @intCast(c_uint, args.items.len) + param_offset;
+            try args.append(llvm_func.getParam(llvm_arg_i));
         }
 
         // Remove all the basic blocks of a function in order to start over, generating
@@ -434,7 +444,8 @@ pub const Object = struct {
             .context = dg.context,
             .dg = &dg,
             .builder = builder,
-            .args = args,
+            .ret_ptr = ret_ptr,
+            .args = args.toOwnedSlice(),
             .arg_index = 0,
             .func_inst_table = .{},
             .entry_block = entry_block,
@@ -556,7 +567,7 @@ pub const DeclGen = struct {
     gpa: *Allocator,
     err_msg: ?*Module.ErrorMsg,
 
-    fn todo(self: *DeclGen, comptime format: []const u8, args: anytype) error{ OutOfMemory, CodegenFail } {
+    fn todo(self: *DeclGen, comptime format: []const u8, args: anytype) Error {
         @setCold(true);
         assert(self.err_msg == null);
         const src_loc = @as(LazySrcLoc, .{ .node_offset = 0 }).toSrcLoc(self.decl);
@@ -591,50 +602,33 @@ pub const DeclGen = struct {
             };
 
             const llvm_init = try self.genTypedValue(.{ .ty = decl.ty, .val = init_val });
-            llvm.setInitializer(global, llvm_init);
+            global.setInitializer(llvm_init);
         }
     }
 
     /// If the llvm function does not exist, create it.
     /// Note that this can be called before the function's semantic analysis has
     /// completed, so if any attributes rely on that, they must be done in updateFunc, not here.
-    fn resolveLlvmFunction(self: *DeclGen, decl: *Module.Decl) !*const llvm.Value {
-        const gop = try self.object.decl_map.getOrPut(self.gpa, decl);
+    fn resolveLlvmFunction(dg: *DeclGen, decl: *Module.Decl) !*const llvm.Value {
+        const gop = try dg.object.decl_map.getOrPut(dg.gpa, decl);
         if (gop.found_existing) return gop.value_ptr.*;
 
         assert(decl.has_tv);
         const zig_fn_type = decl.ty;
         const fn_info = zig_fn_type.fnInfo();
-        const return_type = fn_info.return_type;
-
-        const llvm_param_buffer = try self.gpa.alloc(*const llvm.Type, fn_info.param_types.len);
-        defer self.gpa.free(llvm_param_buffer);
-
-        var llvm_params_len: c_uint = 0;
-        for (fn_info.param_types) |param_ty| {
-            if (param_ty.hasCodeGenBits()) {
-                llvm_param_buffer[llvm_params_len] = try self.llvmType(param_ty);
-                llvm_params_len += 1;
-            }
-        }
+        const target = dg.module.getTarget();
+        const sret = firstParamSRet(fn_info, target);
 
-        const llvm_ret_ty = if (!return_type.hasCodeGenBits())
-            self.context.voidType()
-        else
-            try self.llvmType(return_type);
+        const return_type = fn_info.return_type;
+        const raw_llvm_ret_ty = try dg.llvmType(return_type);
 
-        const fn_type = llvm.functionType(
-            llvm_ret_ty,
-            llvm_param_buffer.ptr,
-            llvm_params_len,
-            .False,
-        );
-        const llvm_addrspace = self.llvmAddressSpace(decl.@"addrspace");
+        const fn_type = try dg.llvmType(zig_fn_type);
 
-        const fqn = try decl.getFullyQualifiedName(self.gpa);
-        defer self.gpa.free(fqn);
+        const fqn = try decl.getFullyQualifiedName(dg.gpa);
+        defer dg.gpa.free(fqn);
 
-        const llvm_fn = self.llvmModule().addFunctionInAddressSpace(fqn, fn_type, llvm_addrspace);
+        const llvm_addrspace = dg.llvmAddressSpace(decl.@"addrspace");
+        const llvm_fn = dg.llvmModule().addFunctionInAddressSpace(fqn, fn_type, llvm_addrspace);
         gop.value_ptr.* = llvm_fn;
 
         const is_extern = decl.val.tag() == .extern_fn;
@@ -643,53 +637,76 @@ pub const DeclGen = struct {
             llvm_fn.setUnnamedAddr(.True);
         }
 
-        if (self.module.comp.bin_file.options.skip_linker_dependencies) {
+        if (sret) {
+            dg.addArgAttr(llvm_fn, 0, "nonnull"); // Sret pointers must not be address 0
+            dg.addArgAttr(llvm_fn, 0, "noalias");
+            llvm_fn.addSretAttr(0, raw_llvm_ret_ty);
+        }
+
+        // Set parameter attributes.
+        var llvm_param_i: c_uint = @boolToInt(sret);
+        for (fn_info.param_types) |param_ty| {
+            if (!param_ty.hasCodeGenBits()) continue;
+
+            if (isByRef(param_ty)) {
+                dg.addArgAttr(llvm_fn, llvm_param_i, "nonnull");
+                // TODO readonly, noalias, align
+            }
+            llvm_param_i += 1;
+        }
+
+        if (dg.module.comp.bin_file.options.skip_linker_dependencies) {
             // The intent here is for compiler-rt and libc functions to not generate
             // infinite recursion. For example, if we are compiling the memcpy function,
             // and llvm detects that the body is equivalent to memcpy, it may replace the
             // body of memcpy with a call to memcpy, which would then cause a stack
             // overflow instead of performing memcpy.
-            self.addFnAttr(llvm_fn, "nobuiltin");
+            dg.addFnAttr(llvm_fn, "nobuiltin");
         }
 
         // TODO: more attributes. see codegen.cpp `make_fn_llvm_value`.
-        const target = self.module.getTarget();
         if (fn_info.cc == .Naked) {
-            self.addFnAttr(llvm_fn, "naked");
+            dg.addFnAttr(llvm_fn, "naked");
         } else {
             llvm_fn.setFunctionCallConv(toLlvmCallConv(fn_info.cc, target));
         }
 
         // Function attributes that are independent of analysis results of the function body.
-        if (!self.module.comp.bin_file.options.red_zone) {
-            self.addFnAttr(llvm_fn, "noredzone");
+        if (!dg.module.comp.bin_file.options.red_zone) {
+            dg.addFnAttr(llvm_fn, "noredzone");
         }
-        self.addFnAttr(llvm_fn, "nounwind");
-        if (self.module.comp.unwind_tables) {
-            self.addFnAttr(llvm_fn, "uwtable");
+        dg.addFnAttr(llvm_fn, "nounwind");
+        if (dg.module.comp.unwind_tables) {
+            dg.addFnAttr(llvm_fn, "uwtable");
         }
-        if (self.module.comp.bin_file.options.optimize_mode == .ReleaseSmall) {
-            self.addFnAttr(llvm_fn, "minsize");
-            self.addFnAttr(llvm_fn, "optsize");
+        if (dg.module.comp.bin_file.options.optimize_mode == .ReleaseSmall) {
+            dg.addFnAttr(llvm_fn, "minsize");
+            dg.addFnAttr(llvm_fn, "optsize");
         }
-        if (self.module.comp.bin_file.options.tsan) {
-            self.addFnAttr(llvm_fn, "sanitize_thread");
+        if (dg.module.comp.bin_file.options.tsan) {
+            dg.addFnAttr(llvm_fn, "sanitize_thread");
         }
         // TODO add target-cpu and target-features fn attributes
         if (return_type.isNoReturn()) {
-            self.addFnAttr(llvm_fn, "noreturn");
+            dg.addFnAttr(llvm_fn, "noreturn");
         }
 
         return llvm_fn;
     }
 
-    fn resolveGlobalDecl(self: *DeclGen, decl: *Module.Decl) error{ OutOfMemory, CodegenFail }!*const llvm.Value {
-        const llvm_module = self.object.llvm_module;
-        if (llvm_module.getNamedGlobal(decl.name)) |val| return val;
-        // TODO: remove this redundant `llvmType`, it is also called in `genTypedValue`.
-        const llvm_type = try self.llvmType(decl.ty);
-        const llvm_addrspace = self.llvmAddressSpace(decl.@"addrspace");
-        return llvm_module.addGlobalInAddressSpace(llvm_type, decl.name, llvm_addrspace);
+    fn resolveGlobalDecl(dg: *DeclGen, decl: *Module.Decl) Error!*const llvm.Value {
+        const gop = try dg.object.decl_map.getOrPut(dg.gpa, decl);
+        if (gop.found_existing) return gop.value_ptr.*;
+        errdefer assert(dg.object.decl_map.remove(decl));
+
+        const fqn = try decl.getFullyQualifiedName(dg.gpa);
+        defer dg.gpa.free(fqn);
+
+        const llvm_type = try dg.llvmType(decl.ty);
+        const llvm_addrspace = dg.llvmAddressSpace(decl.@"addrspace");
+        const llvm_global = dg.object.llvm_module.addGlobalInAddressSpace(llvm_type, fqn, llvm_addrspace);
+        gop.value_ptr.* = llvm_global;
+        return llvm_global;
     }
 
     fn llvmAddressSpace(self: DeclGen, address_space: std.builtin.AddressSpace) c_uint {
@@ -708,87 +725,87 @@ pub const DeclGen = struct {
         };
     }
 
-    fn llvmType(self: *DeclGen, t: Type) error{ OutOfMemory, CodegenFail }!*const llvm.Type {
-        const gpa = self.gpa;
+    fn llvmType(dg: *DeclGen, t: Type) Error!*const llvm.Type {
+        const gpa = dg.gpa;
         log.debug("llvmType for {}", .{t});
         switch (t.zigTypeTag()) {
-            .Void, .NoReturn => return self.context.voidType(),
+            .Void, .NoReturn => return dg.context.voidType(),
             .Int => {
-                const info = t.intInfo(self.module.getTarget());
-                return self.context.intType(info.bits);
+                const info = t.intInfo(dg.module.getTarget());
+                return dg.context.intType(info.bits);
             },
             .Enum => {
                 var buffer: Type.Payload.Bits = undefined;
                 const int_ty = t.intTagType(&buffer);
-                const bit_count = int_ty.intInfo(self.module.getTarget()).bits;
-                return self.context.intType(bit_count);
+                const bit_count = int_ty.intInfo(dg.module.getTarget()).bits;
+                return dg.context.intType(bit_count);
             },
-            .Float => switch (t.floatBits(self.module.getTarget())) {
-                16 => return self.context.halfType(),
-                32 => return self.context.floatType(),
-                64 => return self.context.doubleType(),
-                80 => return self.context.x86FP80Type(),
-                128 => return self.context.fp128Type(),
+            .Float => switch (t.floatBits(dg.module.getTarget())) {
+                16 => return dg.context.halfType(),
+                32 => return dg.context.floatType(),
+                64 => return dg.context.doubleType(),
+                80 => return dg.context.x86FP80Type(),
+                128 => return dg.context.fp128Type(),
                 else => unreachable,
             },
-            .Bool => return self.context.intType(1),
+            .Bool => return dg.context.intType(1),
             .Pointer => {
                 if (t.isSlice()) {
                     var buf: Type.SlicePtrFieldTypeBuffer = undefined;
                     const ptr_type = t.slicePtrFieldType(&buf);
 
                     const fields: [2]*const llvm.Type = .{
-                        try self.llvmType(ptr_type),
-                        try self.llvmType(Type.initTag(.usize)),
+                        try dg.llvmType(ptr_type),
+                        try dg.llvmType(Type.initTag(.usize)),
                     };
-                    return self.context.structType(&fields, fields.len, .False);
+                    return dg.context.structType(&fields, fields.len, .False);
                 } else {
-                    const elem_type = try self.llvmType(t.elemType());
-                    const llvm_addrspace = self.llvmAddressSpace(t.ptrAddressSpace());
+                    const elem_type = try dg.llvmType(t.elemType());
+                    const llvm_addrspace = dg.llvmAddressSpace(t.ptrAddressSpace());
                     return elem_type.pointerType(llvm_addrspace);
                 }
             },
             .Array => {
-                const elem_type = try self.llvmType(t.elemType());
+                const elem_type = try dg.llvmType(t.elemType());
                 const total_len = t.arrayLen() + @boolToInt(t.sentinel() != null);
                 return elem_type.arrayType(@intCast(c_uint, total_len));
             },
             .Optional => {
                 var buf: Type.Payload.ElemType = undefined;
                 const child_type = t.optionalChild(&buf);
-                const payload_llvm_ty = try self.llvmType(child_type);
+                const payload_llvm_ty = try dg.llvmType(child_type);
 
                 if (t.isPtrLikeOptional()) {
                     return payload_llvm_ty;
                 }
 
                 const fields: [2]*const llvm.Type = .{
-                    payload_llvm_ty, self.context.intType(1),
+                    payload_llvm_ty, dg.context.intType(1),
                 };
-                return self.context.structType(&fields, fields.len, .False);
+                return dg.context.structType(&fields, fields.len, .False);
             },
             .ErrorUnion => {
                 const error_type = t.errorUnionSet();
                 const payload_type = t.errorUnionPayload();
-                const llvm_error_type = try self.llvmType(error_type);
+                const llvm_error_type = try dg.llvmType(error_type);
                 if (!payload_type.hasCodeGenBits()) {
                     return llvm_error_type;
                 }
-                const llvm_payload_type = try self.llvmType(payload_type);
+                const llvm_payload_type = try dg.llvmType(payload_type);
 
                 const fields: [2]*const llvm.Type = .{ llvm_error_type, llvm_payload_type };
-                return self.context.structType(&fields, fields.len, .False);
+                return dg.context.structType(&fields, fields.len, .False);
             },
             .ErrorSet => {
-                return self.context.intType(16);
+                return dg.context.intType(16);
             },
             .Struct => {
-                const gop = try self.object.type_map.getOrPut(gpa, t);
+                const gop = try dg.object.type_map.getOrPut(gpa, t);
                 if (gop.found_existing) return gop.value_ptr.*;
 
                 // The Type memory is ephemeral; since we want to store a longer-lived
                 // reference, we need to copy it here.
-                gop.key_ptr.* = try t.copy(&self.object.type_map_arena.allocator);
+                gop.key_ptr.* = try t.copy(&dg.object.type_map_arena.allocator);
 
                 const struct_obj = t.castTag(.@"struct").?.data;
                 assert(struct_obj.haveFieldTypes());
@@ -796,7 +813,7 @@ pub const DeclGen = struct {
                 const name = try struct_obj.getFullyQualifiedName(gpa);
                 defer gpa.free(name);
 
-                const llvm_struct_ty = self.context.structCreateNamed(name);
+                const llvm_struct_ty = dg.context.structCreateNamed(name);
                 gop.value_ptr.* = llvm_struct_ty; // must be done before any recursive calls
 
                 var llvm_field_types: std.ArrayListUnmanaged(*const llvm.Type) = .{};
@@ -805,7 +822,7 @@ pub const DeclGen = struct {
 
                 for (struct_obj.fields.values()) |field| {
                     if (!field.ty.hasCodeGenBits()) continue;
-                    llvm_field_types.appendAssumeCapacity(try self.llvmType(field.ty));
+                    llvm_field_types.appendAssumeCapacity(try dg.llvmType(field.ty));
                 }
 
                 llvm_struct_ty.structSetBody(
@@ -821,42 +838,56 @@ pub const DeclGen = struct {
                 assert(union_obj.haveFieldTypes());
 
                 const enum_tag_ty = union_obj.tag_ty;
-                const enum_tag_llvm_ty = try self.llvmType(enum_tag_ty);
+                const enum_tag_llvm_ty = try dg.llvmType(enum_tag_ty);
 
                 if (union_obj.onlyTagHasCodegenBits()) {
                     return enum_tag_llvm_ty;
                 }
 
-                const target = self.module.getTarget();
+                const target = dg.module.getTarget();
                 const most_aligned_field_index = union_obj.mostAlignedField(target);
                 const most_aligned_field = union_obj.fields.values()[most_aligned_field_index];
                 // TODO handle when the most aligned field is different than the
                 // biggest sized field.
 
                 const llvm_fields = [_]*const llvm.Type{
-                    try self.llvmType(most_aligned_field.ty),
+                    try dg.llvmType(most_aligned_field.ty),
                     enum_tag_llvm_ty,
                 };
-                return self.context.structType(&llvm_fields, llvm_fields.len, .False);
+                return dg.context.structType(&llvm_fields, llvm_fields.len, .False);
             },
             .Fn => {
-                const ret_ty = try self.llvmType(t.fnReturnType());
-                const params_len = t.fnParamLen();
-                const llvm_params = try gpa.alloc(*const llvm.Type, params_len);
-                defer gpa.free(llvm_params);
-                for (llvm_params) |*llvm_param, i| {
-                    llvm_param.* = try self.llvmType(t.fnParamType(i));
+                const fn_info = t.fnInfo();
+                const target = dg.module.getTarget();
+                const sret = firstParamSRet(fn_info, target);
+                const return_type = fn_info.return_type;
+                const raw_llvm_ret_ty = try dg.llvmType(return_type);
+                const llvm_ret_ty = if (!return_type.hasCodeGenBits() or sret)
+                    dg.context.voidType()
+                else
+                    raw_llvm_ret_ty;
+
+                var llvm_params = std.ArrayList(*const llvm.Type).init(dg.gpa);
+                defer llvm_params.deinit();
+
+                if (sret) {
+                    try llvm_params.append(raw_llvm_ret_ty.pointerType(0));
+                }
+
+                for (fn_info.param_types) |param_ty| {
+                    if (!param_ty.hasCodeGenBits()) continue;
+
+                    const raw_llvm_ty = try dg.llvmType(param_ty);
+                    const actual_llvm_ty = if (!isByRef(param_ty)) raw_llvm_ty else raw_llvm_ty.pointerType(0);
+                    try llvm_params.append(actual_llvm_ty);
                 }
-                const is_var_args = t.fnIsVarArgs();
-                const llvm_fn_ty = llvm.functionType(
-                    ret_ty,
-                    llvm_params.ptr,
-                    @intCast(c_uint, llvm_params.len),
-                    llvm.Bool.fromBool(is_var_args),
+
+                return llvm.functionType(
+                    llvm_ret_ty,
+                    llvm_params.items.ptr,
+                    @intCast(c_uint, llvm_params.items.len),
+                    llvm.Bool.fromBool(fn_info.is_var_args),
                 );
-                // TODO make .Fn not both a pointer type and a prototype
-                const llvm_addrspace = self.llvmAddressSpace(.generic);
-                return llvm_fn_ty.pointerType(llvm_addrspace);
             },
             .ComptimeInt => unreachable,
             .ComptimeFloat => unreachable,
@@ -871,11 +902,11 @@ pub const DeclGen = struct {
             .Frame,
             .AnyFrame,
             .Vector,
-            => return self.todo("implement llvmType for type '{}'", .{t}),
+            => return dg.todo("implement llvmType for type '{}'", .{t}),
         }
     }
 
-    fn genTypedValue(self: *DeclGen, tv: TypedValue) error{ OutOfMemory, CodegenFail }!*const llvm.Value {
+    fn genTypedValue(self: *DeclGen, tv: TypedValue) Error!*const llvm.Value {
         if (tv.val.isUndef()) {
             const llvm_type = try self.llvmType(tv.ty);
             return llvm_type.getUndef();
@@ -961,9 +992,12 @@ pub const DeclGen = struct {
                     } else {
                         const decl = tv.val.castTag(.decl_ref).?.data;
                         decl.alive = true;
-                        const val = try self.resolveGlobalDecl(decl);
                         const llvm_type = try self.llvmType(tv.ty);
-                        return val.constBitCast(llvm_type);
+                        const llvm_val = if (decl.ty.zigTypeTag() == .Fn)
+                            try self.resolveLlvmFunction(decl)
+                        else
+                            try self.resolveGlobalDecl(decl);
+                        return llvm_val.constBitCast(llvm_type);
                     }
                 },
                 .variable => {
@@ -1047,17 +1081,23 @@ pub const DeclGen = struct {
                 return self.todo("handle more array values", .{});
             },
             .Optional => {
+                var buf: Type.Payload.ElemType = undefined;
+                const payload_ty = tv.ty.optionalChild(&buf);
+
                 if (tv.ty.isPtrLikeOptional()) {
-                    return self.todo("implement const of optional pointer", .{});
+                    if (tv.val.castTag(.opt_payload)) |payload| {
+                        return self.genTypedValue(.{ .ty = payload_ty, .val = payload.data });
+                    } else {
+                        const llvm_ty = try self.llvmType(tv.ty);
+                        return llvm_ty.constNull();
+                    }
                 }
-                var buf: Type.Payload.ElemType = undefined;
-                const payload_type = tv.ty.optionalChild(&buf);
                 const is_pl = !tv.val.isNull();
                 const llvm_i1 = self.context.intType(1);
 
                 const fields: [2]*const llvm.Value = .{
                     try self.genTypedValue(.{
-                        .ty = payload_type,
+                        .ty = payload_ty,
                         .val = if (tv.val.castTag(.opt_payload)) |pl| pl.data else Value.initTag(.undef),
                     }),
                     if (is_pl) llvm_i1.constAllOnes() else llvm_i1.constNull(),
@@ -1068,7 +1108,6 @@ pub const DeclGen = struct {
                 const fn_decl = switch (tv.val.tag()) {
                     .extern_fn => tv.val.castTag(.extern_fn).?.data,
                     .function => tv.val.castTag(.function).?.data.owner_decl,
-                    .decl_ref => tv.val.castTag(.decl_ref).?.data,
                     else => unreachable,
                 };
                 fn_decl.alive = true;
@@ -1153,10 +1192,14 @@ pub const DeclGen = struct {
         }
     }
 
-    fn addAttr(dg: *DeclGen, val: *const llvm.Value, index: llvm.AttributeIndex, name: []const u8) void {
+    fn addAttr(dg: DeclGen, val: *const llvm.Value, index: llvm.AttributeIndex, name: []const u8) void {
         return dg.addAttrInt(val, index, name, 0);
     }
 
+    fn addArgAttr(dg: DeclGen, fn_val: *const llvm.Value, param_index: u32, attr_name: []const u8) void {
+        return dg.addAttr(fn_val, param_index + 1, attr_name);
+    }
+
     fn removeAttr(val: *const llvm.Value, index: llvm.AttributeIndex, name: []const u8) void {
         const kind_id = llvm.getEnumAttributeKindForName(name.ptr, name.len);
         assert(kind_id != 0);
@@ -1164,7 +1207,7 @@ pub const DeclGen = struct {
     }
 
     fn addAttrInt(
-        dg: *DeclGen,
+        dg: DeclGen,
         val: *const llvm.Value,
         index: llvm.AttributeIndex,
         name: []const u8,
@@ -1176,7 +1219,7 @@ pub const DeclGen = struct {
         val.addAttributeAtIndex(index, llvm_attr);
     }
 
-    fn addFnAttr(dg: *DeclGen, val: *const llvm.Value, name: []const u8) void {
+    fn addFnAttr(dg: DeclGen, val: *const llvm.Value, name: []const u8) void {
         dg.addAttr(val, std.math.maxInt(llvm.AttributeIndex), name);
     }
 
@@ -1184,7 +1227,7 @@ pub const DeclGen = struct {
         removeAttr(fn_val, std.math.maxInt(llvm.AttributeIndex), name);
     }
 
-    fn addFnAttrInt(dg: *DeclGen, fn_val: *const llvm.Value, name: []const u8, int: u64) void {
+    fn addFnAttrInt(dg: DeclGen, fn_val: *const llvm.Value, name: []const u8, int: u64) void {
         return dg.addAttrInt(fn_val, std.math.maxInt(llvm.AttributeIndex), name, int);
     }
 
@@ -1227,8 +1270,12 @@ pub const FuncGen = struct {
     /// in other instructions. This table is cleared before every function is generated.
     func_inst_table: std.AutoHashMapUnmanaged(Air.Inst.Index, *const llvm.Value),
 
+    /// If the return type isByRef, this is the result pointer. Otherwise null.
+    ret_ptr: ?*const llvm.Value,
     /// These fields are used to refer to the LLVM value of the function parameters
     /// in an Arg instruction.
+    /// This list may be shorter than the list according to the zig type system;
+    /// it omits 0-bit types.
     args: []*const llvm.Value,
     arg_index: usize,
 
@@ -1258,7 +1305,7 @@ pub const FuncGen = struct {
         self.blocks.deinit(self.gpa);
     }
 
-    fn todo(self: *FuncGen, comptime format: []const u8, args: anytype) error{ OutOfMemory, CodegenFail } {
+    fn todo(self: *FuncGen, comptime format: []const u8, args: anytype) Error {
         @setCold(true);
         return self.dg.todo(format, args);
     }
@@ -1269,13 +1316,25 @@ pub const FuncGen = struct {
 
     fn resolveInst(self: *FuncGen, inst: Air.Inst.Ref) !*const llvm.Value {
         if (self.air.value(inst)) |val| {
-            return self.dg.genTypedValue(.{ .ty = self.air.typeOf(inst), .val = val });
+            const ty = self.air.typeOf(inst);
+            const llvm_val = try self.dg.genTypedValue(.{ .ty = ty, .val = val });
+            if (!isByRef(ty)) return llvm_val;
+
+            // We have an LLVM value but we need to create a global constant and
+            // set the value as its initializer, and then return a pointer to the global.
+            const target = self.dg.module.getTarget();
+            const global = self.dg.object.llvm_module.addGlobal(llvm_val.typeOf(), "");
+            global.setInitializer(llvm_val);
+            global.setLinkage(.Private);
+            global.setGlobalConstant(.True);
+            global.setAlignment(ty.abiAlignment(target));
+            return global;
         }
         const inst_index = Air.refToIndex(inst).?;
         return self.func_inst_table.get(inst_index).?;
     }
 
-    fn genBody(self: *FuncGen, body: []const Air.Inst.Index) error{ OutOfMemory, CodegenFail }!void {
+    fn genBody(self: *FuncGen, body: []const Air.Inst.Index) Error!void {
         const air_tags = self.air.instructions.items(.tag);
         for (body) |inst| {
             const opt_value: ?*const llvm.Value = switch (air_tags[inst]) {
@@ -1320,6 +1379,7 @@ pub const FuncGen = struct {
                 .is_err_ptr      => try self.airIsErr(inst, .NE, true),
 
                 .alloc          => try self.airAlloc(inst),
+                .ret_ptr        => try self.airRetPtr(inst),
                 .arg            => try self.airArg(inst),
                 .bitcast        => try self.airBitCast(inst),
                 .bool_to_int    => try self.airBoolToInt(inst),
@@ -1338,6 +1398,7 @@ pub const FuncGen = struct {
                 .loop           => try self.airLoop(inst),
                 .not            => try self.airNot(inst),
                 .ret            => try self.airRet(inst),
+                .ret_load       => try self.airRetLoad(inst),
                 .store          => try self.airStore(inst),
                 .assembly       => try self.airAssembly(inst),
                 .slice_ptr      => try self.airSliceField(inst, 0),
@@ -1370,6 +1431,7 @@ pub const FuncGen = struct {
                 .struct_field_ptr_index_2 => try self.airStructFieldPtrIndex(inst, 2),
                 .struct_field_ptr_index_3 => try self.airStructFieldPtrIndex(inst, 3),
 
+                .array_elem_val     => try self.airArrayElemVal(inst),
                 .slice_elem_val     => try self.airSliceElemVal(inst),
                 .ptr_slice_elem_val => try self.airPtrSliceElemVal(inst),
                 .ptr_elem_val       => try self.airPtrElemVal(inst),
@@ -1405,40 +1467,73 @@ pub const FuncGen = struct {
         const pl_op = self.air.instructions.items(.data)[inst].pl_op;
         const extra = self.air.extraData(Air.Call, pl_op.payload);
         const args = @bitCast([]const Air.Inst.Ref, self.air.extra[extra.end..][0..extra.data.args_len]);
-        const zig_fn_type = self.air.typeOf(pl_op.operand);
-        const return_type = zig_fn_type.fnReturnType();
+        const callee_ty = self.air.typeOf(pl_op.operand);
+        const zig_fn_ty = switch (callee_ty.zigTypeTag()) {
+            .Fn => callee_ty,
+            .Pointer => callee_ty.childType(),
+            else => unreachable,
+        };
+        const fn_info = zig_fn_ty.fnInfo();
+        const return_type = fn_info.return_type;
+        const llvm_ret_ty = try self.dg.llvmType(return_type);
         const llvm_fn = try self.resolveInst(pl_op.operand);
         const target = self.dg.module.getTarget();
+        const sret = firstParamSRet(fn_info, target);
 
-        const llvm_param_vals = try self.gpa.alloc(*const llvm.Value, args.len);
-        defer self.gpa.free(llvm_param_vals);
+        var llvm_args = std.ArrayList(*const llvm.Value).init(self.gpa);
+        defer llvm_args.deinit();
+
+        const ret_ptr = if (!sret) null else blk: {
+            const ret_ptr = self.buildAlloca(llvm_ret_ty);
+            ret_ptr.setAlignment(return_type.abiAlignment(target));
+            try llvm_args.append(ret_ptr);
+            break :blk ret_ptr;
+        };
 
         for (args) |arg, i| {
-            llvm_param_vals[i] = try self.resolveInst(arg);
+            const param_ty = fn_info.param_types[i];
+            if (!param_ty.hasCodeGenBits()) continue;
+
+            try llvm_args.append(try self.resolveInst(arg));
         }
 
         const call = self.builder.buildCall(
             llvm_fn,
-            llvm_param_vals.ptr,
-            @intCast(c_uint, args.len),
-            toLlvmCallConv(zig_fn_type.fnCallingConvention(), target),
+            llvm_args.items.ptr,
+            @intCast(c_uint, llvm_args.items.len),
+            toLlvmCallConv(zig_fn_ty.fnCallingConvention(), target),
             .Auto,
             "",
         );
 
         if (return_type.isNoReturn()) {
             _ = self.builder.buildUnreachable();
+            return null;
+        } else if (self.liveness.isUnused(inst) or !return_type.hasCodeGenBits()) {
+            return null;
+        } else if (sret) {
+            call.setCallSret(llvm_ret_ty);
+            return ret_ptr;
+        } else {
+            return call;
         }
-
-        // No need to store the LLVM value if the return type is void or noreturn
-        if (!return_type.hasCodeGenBits()) return null;
-
-        return call;
     }
 
     fn airRet(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
         const un_op = self.air.instructions.items(.data)[inst].un_op;
-        if (!self.air.typeOf(un_op).hasCodeGenBits()) {
+        const ret_ty = self.air.typeOf(un_op);
+        if (self.ret_ptr) |ret_ptr| {
+            const operand = try self.resolveInst(un_op);
+            var ptr_ty_payload: Type.Payload.ElemType = .{
+                .base = .{ .tag = .single_mut_pointer },
+                .data = ret_ty,
+            };
+            const ptr_ty = Type.initPayload(&ptr_ty_payload.base);
+            self.store(ret_ptr, ptr_ty, operand, .NotAtomic);
+            _ = self.builder.buildRetVoid();
+            return null;
+        }
+        if (!ret_ty.hasCodeGenBits()) {
             _ = self.builder.buildRetVoid();
             return null;
         }
@@ -1447,6 +1542,20 @@ pub const FuncGen = struct {
         return null;
     }
 
+    fn airRetLoad(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
+        const un_op = self.air.instructions.items(.data)[inst].un_op;
+        const ptr_ty = self.air.typeOf(un_op);
+        const ret_ty = ptr_ty.childType();
+        if (!ret_ty.hasCodeGenBits() or isByRef(ret_ty)) {
+            _ = self.builder.buildRetVoid();
+            return null;
+        }
+        const ptr = try self.resolveInst(un_op);
+        const loaded = self.builder.buildLoad(ptr, "");
+        _ = self.builder.buildRet(loaded);
+        return null;
+    }
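
As a rough illustration (a sketch, not from this commit), a function body like the one below may now lower through the new result-pointer AIR: `ret_ptr` obtains the slot, the field values are stored through it, and `ret_load` finishes the body. This backend turns that into either a store through the sret pointer or a plain load + ret:

    const Vec2 = struct { x: f32, y: f32 };

    fn origin() Vec2 {
        // Sema emits `ret_ptr`, writes through it, then ends with `ret_load`.
        return Vec2{ .x = 0.0, .y = 0.0 };
    }

    test "result pointer round trip" {
        try @import("std").testing.expect(origin().x == 0.0);
    }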
+
     fn airCmp(self: *FuncGen, inst: Air.Inst.Index, op: math.CompareOperator) !?*const llvm.Value {
         if (self.liveness.isUnused(inst))
             return null;
@@ -1491,19 +1600,18 @@ pub const FuncGen = struct {
         const body = self.air.extra[extra.end..][0..extra.data.body_len];
         const parent_bb = self.context.createBasicBlock("Block");
 
-        // 5 breaks to a block seems like a reasonable default.
-        var break_bbs = try BreakBasicBlocks.initCapacity(self.gpa, 5);
-        var break_vals = try BreakValues.initCapacity(self.gpa, 5);
+        var break_bbs: BreakBasicBlocks = .{};
+        defer break_bbs.deinit(self.gpa);
+
+        var break_vals: BreakValues = .{};
+        defer break_vals.deinit(self.gpa);
+
         try self.blocks.putNoClobber(self.gpa, inst, .{
             .parent_bb = parent_bb,
             .break_bbs = &break_bbs,
             .break_vals = &break_vals,
         });
-        defer {
-            assert(self.blocks.remove(inst));
-            break_bbs.deinit(self.gpa);
-            break_vals.deinit(self.gpa);
-        }
+        defer assert(self.blocks.remove(inst));
 
         try self.genBody(body);
 
@@ -1514,7 +1622,18 @@ pub const FuncGen = struct {
         const inst_ty = self.air.typeOfIndex(inst);
         if (!inst_ty.hasCodeGenBits()) return null;
 
-        const phi_node = self.builder.buildPhi(try self.dg.llvmType(inst_ty), "");
+        const raw_llvm_ty = try self.dg.llvmType(inst_ty);
+
+        // If the zig tag type is a function, this represents an actual function body, not
+        // a pointer to it. LLVM IR allows the call instruction to use function bodies instead
+        // of function pointers; however, the phi makes it a runtime value, so the
+        // LLVM type has to be wrapped in a pointer.
+        const llvm_ty = if (inst_ty.zigTypeTag() == .Fn)
+            raw_llvm_ty.pointerType(0)
+        else
+            raw_llvm_ty;
+
+        const phi_node = self.builder.buildPhi(llvm_ty, "");
         phi_node.addIncoming(
             break_vals.items.ptr,
             break_bbs.items.ptr,
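
The function-body special case above is what lets a runtime selection between two functions work; a hedged sketch (note the commit's caveat that plain `fn (...)` type syntax currently denotes the pointer type):

    fn inc(x: i32) i32 { return x + 1; }
    fn dec(x: i32) i32 { return x - 1; }

    test "phi over function values" {
        var cond = true;
        // Both branches yield a function; at runtime the merged value is a
        // function pointer, so the phi uses the pointer-wrapped LLVM type.
        const f = if (cond) inc else dec;
        try @import("std").testing.expect(f(1) == 2);
    }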
@@ -1657,25 +1776,23 @@ pub const FuncGen = struct {
     }
 
     fn airSliceElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
-        const is_volatile = false; // TODO
-        if (!is_volatile and self.liveness.isUnused(inst))
-            return null;
-
         const bin_op = self.air.instructions.items(.data)[inst].bin_op;
-        const lhs = try self.resolveInst(bin_op.lhs);
-        const rhs = try self.resolveInst(bin_op.rhs);
-        const base_ptr = self.builder.buildExtractValue(lhs, 0, "");
-        const indices: [1]*const llvm.Value = .{rhs};
+        const slice_ty = self.air.typeOf(bin_op.lhs);
+        if (!slice_ty.isVolatilePtr() and self.liveness.isUnused(inst)) return null;
+
+        const slice = try self.resolveInst(bin_op.lhs);
+        const index = try self.resolveInst(bin_op.rhs);
+        const base_ptr = self.builder.buildExtractValue(slice, 0, "");
+        const indices: [1]*const llvm.Value = .{index};
         const ptr = self.builder.buildInBoundsGEP(base_ptr, &indices, indices.len, "");
-        return self.builder.buildLoad(ptr, "");
+        return self.load(ptr, slice_ty);
     }
 
     fn airPtrSliceElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
-        const is_volatile = false; // TODO
-        if (!is_volatile and self.liveness.isUnused(inst))
-            return null;
-
         const bin_op = self.air.instructions.items(.data)[inst].bin_op;
+        const slice_ty = self.air.typeOf(bin_op.lhs).childType();
+        if (!slice_ty.isVolatilePtr() and self.liveness.isUnused(inst)) return null;
+
         const lhs = try self.resolveInst(bin_op.lhs);
         const rhs = try self.resolveInst(bin_op.rhs);
 
@@ -1686,18 +1803,35 @@ pub const FuncGen = struct {
 
         const indices: [1]*const llvm.Value = .{rhs};
         const ptr = self.builder.buildInBoundsGEP(base_ptr, &indices, indices.len, "");
-        return self.builder.buildLoad(ptr, "");
+        return self.load(ptr, slice_ty);
     }
 
-    fn airPtrElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
-        const is_volatile = false; // TODO
-        if (!is_volatile and self.liveness.isUnused(inst))
-            return null;
+    fn airArrayElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
+        if (self.liveness.isUnused(inst)) return null;
 
         const bin_op = self.air.instructions.items(.data)[inst].bin_op;
+        const array_ty = self.air.typeOf(bin_op.lhs);
+        const array_llvm_val = try self.resolveInst(bin_op.lhs);
+        const rhs = try self.resolveInst(bin_op.rhs);
+        assert(isByRef(array_ty));
+        const indices: [2]*const llvm.Value = .{ self.context.intType(32).constNull(), rhs };
+        const elem_ptr = self.builder.buildInBoundsGEP(array_llvm_val, &indices, indices.len, "");
+        const elem_ty = array_ty.childType();
+        if (isByRef(elem_ty)) {
+            return elem_ptr;
+        } else {
+            return self.builder.buildLoad(elem_ptr, "");
+        }
+    }
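
The new instruction covers indexing an array value directly, as in this small sketch:

    test "array_elem_val" {
        var i: usize = 2;
        const arr = [_]u8{ 10, 20, 30, 40 };
        // `arr` is an array value, not a pointer; with a runtime index, Sema
        // emits `array_elem_val` and this backend GEPs into the by-ref array.
        try @import("std").testing.expect(arr[i] == 30);
    }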
+
+    fn airPtrElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
+        const bin_op = self.air.instructions.items(.data)[inst].bin_op;
+        const ptr_ty = self.air.typeOf(bin_op.lhs);
+        if (!ptr_ty.isVolatilePtr() and self.liveness.isUnused(inst)) return null;
+
         const base_ptr = try self.resolveInst(bin_op.lhs);
         const rhs = try self.resolveInst(bin_op.rhs);
-        const ptr = if (self.air.typeOf(bin_op.lhs).isSinglePointer()) ptr: {
+        const ptr = if (ptr_ty.isSinglePointer()) ptr: {
             // If this is a single-item pointer to an array, we need another index in the GEP.
             const indices: [2]*const llvm.Value = .{ self.context.intType(32).constNull(), rhs };
             break :ptr self.builder.buildInBoundsGEP(base_ptr, &indices, indices.len, "");
@@ -1705,7 +1839,7 @@ pub const FuncGen = struct {
             const indices: [1]*const llvm.Value = .{rhs};
             break :ptr self.builder.buildInBoundsGEP(base_ptr, &indices, indices.len, "");
         };
-        return self.builder.buildLoad(ptr, "");
+        return self.load(ptr, ptr_ty);
     }
 
     fn airPtrElemPtr(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
@@ -1727,17 +1861,16 @@ pub const FuncGen = struct {
     }
 
     fn airPtrPtrElemVal(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
-        const is_volatile = false; // TODO
-        if (!is_volatile and self.liveness.isUnused(inst))
-            return null;
-
         const bin_op = self.air.instructions.items(.data)[inst].bin_op;
+        const ptr_ty = self.air.typeOf(bin_op.lhs).childType();
+        if (!ptr_ty.isVolatilePtr() and self.liveness.isUnused(inst)) return null;
+
         const lhs = try self.resolveInst(bin_op.lhs);
         const rhs = try self.resolveInst(bin_op.rhs);
         const base_ptr = self.builder.buildLoad(lhs, "");
         const indices: [1]*const llvm.Value = .{rhs};
         const ptr = self.builder.buildInBoundsGEP(base_ptr, &indices, indices.len, "");
-        return self.builder.buildLoad(ptr, "");
+        return self.load(ptr, ptr_ty);
     }
 
     fn airStructFieldPtr(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
@@ -1770,9 +1903,19 @@ pub const FuncGen = struct {
         const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
         const struct_field = self.air.extraData(Air.StructField, ty_pl.payload).data;
         const struct_ty = self.air.typeOf(struct_field.struct_operand);
-        const struct_byval = try self.resolveInst(struct_field.struct_operand);
+        const struct_llvm_val = try self.resolveInst(struct_field.struct_operand);
         const field_index = llvmFieldIndex(struct_ty, struct_field.field_index);
-        return self.builder.buildExtractValue(struct_byval, field_index, "");
+        if (isByRef(struct_ty)) {
+            const field_ptr = self.builder.buildStructGEP(struct_llvm_val, field_index, "");
+            const field_ty = struct_ty.structFieldType(struct_field.field_index);
+            if (isByRef(field_ty)) {
+                return field_ptr;
+            } else {
+                return self.builder.buildLoad(field_ptr, "");
+            }
+        } else {
+            return self.builder.buildExtractValue(struct_llvm_val, field_index, "");
+        }
     }
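
For by-ref structs, field extraction now goes through a GEP; a hedged sketch of the pattern:

    const std = @import("std");

    const Pair = struct { a: u32, b: [16]u8 };

    fn makePair() Pair {
        return Pair{ .a = 7, .b = [_]u8{0} ** 16 };
    }

    test "struct_field_val on a by-ref struct" {
        // The call result is a struct value; since `Pair` is isByRef, the
        // backend holds a pointer and lowers `.a` as struct GEP + load.
        try std.testing.expect(makePair().a == 7);
    }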
 
     fn airNot(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
@@ -2465,17 +2608,21 @@ pub const FuncGen = struct {
         self.arg_index += 1;
 
         const inst_ty = self.air.typeOfIndex(inst);
-        const ptr_val = self.buildAlloca(try self.dg.llvmType(inst_ty));
-        _ = self.builder.buildStore(arg_val, ptr_val);
-        return self.builder.buildLoad(ptr_val, "");
+        if (isByRef(inst_ty)) {
+            // TODO declare debug variable
+            return arg_val;
+        } else {
+            const ptr_val = self.buildAlloca(try self.dg.llvmType(inst_ty));
+            _ = self.builder.buildStore(arg_val, ptr_val);
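+            // The stack copy exists only for the future debug variable (see
+            // the TODO below); the SSA value remains the instruction result.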
+            // TODO declare debug variable
+            return arg_val;
+        }
     }
 
     fn airAlloc(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
         if (self.liveness.isUnused(inst)) return null;
-        // buildAlloca expects the pointee type, not the pointer type, so assert that
-        // a Payload.PointerSimple is passed to the alloc instruction.
         const ptr_ty = self.air.typeOfIndex(inst);
-        const pointee_type = ptr_ty.elemType();
+        const pointee_type = ptr_ty.childType();
         if (!pointee_type.hasCodeGenBits()) return null;
         const pointee_llvm_ty = try self.dg.llvmType(pointee_type);
         const target = self.dg.module.getTarget();
@@ -2484,6 +2631,19 @@ pub const FuncGen = struct {
         return alloca_inst;
     }
 
+    fn airRetPtr(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
+        if (self.liveness.isUnused(inst)) return null;
+        const ptr_ty = self.air.typeOfIndex(inst);
+        const ret_ty = ptr_ty.childType();
+        if (!ret_ty.hasCodeGenBits()) return null;
+        if (self.ret_ptr) |ret_ptr| return ret_ptr;
+        const ret_llvm_ty = try self.dg.llvmType(ret_ty);
+        const target = self.dg.module.getTarget();
+        const alloca_inst = self.buildAlloca(ret_llvm_ty);
+        alloca_inst.setAlignment(ptr_ty.ptrAlignment(target));
+        return alloca_inst;
+    }
+
     /// Use this instead of builder.buildAlloca, because this function makes sure to
     /// put the alloca instruction at the top of the function!
     fn buildAlloca(self: *FuncGen, t: *const llvm.Type) *const llvm.Value {
@@ -2513,7 +2673,7 @@ pub const FuncGen = struct {
         const dest_ptr = try self.resolveInst(bin_op.lhs);
         const ptr_ty = self.air.typeOf(bin_op.lhs);
         const src_operand = try self.resolveInst(bin_op.rhs);
-        _ = self.store(dest_ptr, ptr_ty, src_operand);
+        self.store(dest_ptr, ptr_ty, src_operand, .NotAtomic);
         return null;
     }
 
@@ -2658,11 +2818,11 @@ pub const FuncGen = struct {
         if (opt_abi_ty) |abi_ty| {
             // operand needs widening and truncating
             const casted_ptr = self.builder.buildBitCast(ptr, abi_ty.pointerType(0), "");
-            const load_inst = self.load(casted_ptr, ptr_ty);
+            const load_inst = self.load(casted_ptr, ptr_ty).?;
             load_inst.setOrdering(ordering);
             return self.builder.buildTrunc(load_inst, try self.dg.llvmType(operand_ty), "");
         }
-        const load_inst = self.load(ptr, ptr_ty);
+        const load_inst = self.load(ptr, ptr_ty).?;
         load_inst.setOrdering(ordering);
         return load_inst;
     }
@@ -2673,10 +2833,11 @@ pub const FuncGen = struct {
         ordering: llvm.AtomicOrdering,
     ) !?*const llvm.Value {
         const bin_op = self.air.instructions.items(.data)[inst].bin_op;
-        var ptr = try self.resolveInst(bin_op.lhs);
         const ptr_ty = self.air.typeOf(bin_op.lhs);
+        const operand_ty = ptr_ty.childType();
+        if (!operand_ty.hasCodeGenBits()) return null;
+        var ptr = try self.resolveInst(bin_op.lhs);
         var element = try self.resolveInst(bin_op.rhs);
-        const operand_ty = ptr_ty.elemType();
         const opt_abi_ty = self.dg.getAtomicAbiType(operand_ty, false);
 
         if (opt_abi_ty) |abi_ty| {
@@ -2688,8 +2849,7 @@ pub const FuncGen = struct {
                 element = self.builder.buildZExt(element, abi_ty, "");
             }
         }
-        const store_inst = self.store(ptr, ptr_ty, element);
-        store_inst.setOrdering(ordering);
+        self.store(ptr, ptr_ty, element, ordering);
         return null;
     }
 
@@ -2724,10 +2884,9 @@ pub const FuncGen = struct {
         const src_ptr = try self.resolveInst(extra.lhs);
         const src_ptr_ty = self.air.typeOf(extra.lhs);
         const len = try self.resolveInst(extra.rhs);
-        const u8_llvm_ty = self.context.intType(8);
-        const ptr_u8_llvm_ty = u8_llvm_ty.pointerType(0);
-        const dest_ptr_u8 = self.builder.buildBitCast(dest_ptr, ptr_u8_llvm_ty, "");
-        const src_ptr_u8 = self.builder.buildBitCast(src_ptr, ptr_u8_llvm_ty, "");
+        const llvm_ptr_u8 = self.context.intType(8).pointerType(0);
+        const dest_ptr_u8 = self.builder.buildBitCast(dest_ptr, llvm_ptr_u8, "");
+        const src_ptr_u8 = self.builder.buildBitCast(src_ptr, llvm_ptr_u8, "");
         const is_volatile = src_ptr_ty.isVolatilePtr() or dest_ptr_ty.isVolatilePtr();
         const target = self.dg.module.getTarget();
         _ = self.builder.buildMemCpy(
@@ -2843,7 +3002,10 @@ pub const FuncGen = struct {
         return self.llvmModule().getIntrinsicDeclaration(id, null, 0);
     }
 
-    fn load(self: *FuncGen, ptr: *const llvm.Value, ptr_ty: Type) *const llvm.Value {
+    fn load(self: *FuncGen, ptr: *const llvm.Value, ptr_ty: Type) ?*const llvm.Value {
+        const pointee_ty = ptr_ty.childType();
+        if (!pointee_ty.hasCodeGenBits()) return null;
+        if (isByRef(pointee_ty)) return ptr;
         const llvm_inst = self.builder.buildLoad(ptr, "");
         const target = self.dg.module.getTarget();
         llvm_inst.setAlignment(ptr_ty.ptrAlignment(target));
@@ -2856,12 +3018,31 @@ pub const FuncGen = struct {
         ptr: *const llvm.Value,
         ptr_ty: Type,
         elem: *const llvm.Value,
-    ) *const llvm.Value {
-        const llvm_inst = self.builder.buildStore(elem, ptr);
+        ordering: llvm.AtomicOrdering,
+    ) void {
+        const elem_ty = ptr_ty.childType();
+        if (!elem_ty.hasCodeGenBits()) {
+            return;
+        }
         const target = self.dg.module.getTarget();
-        llvm_inst.setAlignment(ptr_ty.ptrAlignment(target));
-        llvm_inst.setVolatile(llvm.Bool.fromBool(ptr_ty.isVolatilePtr()));
-        return llvm_inst;
+        if (!isByRef(elem_ty)) {
+            const store_inst = self.builder.buildStore(elem, ptr);
+            store_inst.setOrdering(ordering);
+            store_inst.setAlignment(ptr_ty.ptrAlignment(target));
+            store_inst.setVolatile(llvm.Bool.fromBool(ptr_ty.isVolatilePtr()));
+            return;
+        }
+        assert(ordering == .NotAtomic);
+        const llvm_ptr_u8 = self.context.intType(8).pointerType(0);
+        const size_bytes = elem_ty.abiSize(target);
+        _ = self.builder.buildMemCpy(
+            self.builder.buildBitCast(ptr, llvm_ptr_u8, ""),
+            ptr_ty.ptrAlignment(target),
+            self.builder.buildBitCast(elem, llvm_ptr_u8, ""),
+            elem_ty.abiAlignment(target),
+            self.context.intType(Type.usize.intInfo(target).bits).constInt(size_bytes, .False),
+            ptr_ty.isVolatilePtr(),
+        );
     }
 };
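
The memcpy branch in `store` is what an aggregate copy like this sketch exercises:

    const std = @import("std");

    const Block = struct { bytes: [64]u8 };

    test "by-ref store lowers to memcpy" {
        var src = Block{ .bytes = [_]u8{1} ** 64 };
        var dst = Block{ .bytes = [_]u8{0} ** 64 };
        // `Block` is isByRef, so this assignment becomes a memcpy of
        // abiSize(Block) bytes instead of an LLVM aggregate store.
        dst = src;
        try std.testing.expect(dst.bytes[0] == 1);
    }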
 
@@ -3113,3 +3294,54 @@ fn llvmFieldIndex(ty: Type, index: u32) c_uint {
     }
     return result;
 }
+
+fn firstParamSRet(fn_info: Type.Payload.Function.Data, target: std.Target) bool {
+    switch (fn_info.cc) {
+        .Unspecified, .Inline => return isByRef(fn_info.return_type),
+        .C => {},
+        else => return false,
+    }
+    switch (target.cpu.arch) {
+        .mips, .mipsel => return false,
+        .x86_64 => switch (target.os.tag) {
+            .windows => return @import("../arch/x86_64/abi.zig").classifyWindows(fn_info.return_type, target) == .memory,
+            else => return @import("../arch/x86_64/abi.zig").classifySystemV(fn_info.return_type, target)[0] == .memory,
+        },
+        else => return false, // TODO investigate C ABI for other architectures
+    }
+}
+
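+/// Whether the LLVM backend passes and returns values of this type behind a
+/// pointer. Aggregates become by-ref once they have runtime bits; error
+/// unions and optionals follow their payload type.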
+fn isByRef(ty: Type) bool {
+    switch (ty.zigTypeTag()) {
+        .Type,
+        .ComptimeInt,
+        .ComptimeFloat,
+        .EnumLiteral,
+        .Undefined,
+        .Null,
+        .BoundFn,
+        .Opaque,
+        => unreachable,
+
+        .NoReturn,
+        .Void,
+        .Bool,
+        .Int,
+        .Float,
+        .Pointer,
+        .ErrorSet,
+        .Fn,
+        .Enum,
+        .Vector,
+        .AnyFrame,
+        => return false,
+
+        .Array, .Struct, .Frame => return ty.hasCodeGenBits(),
+        .Union => return ty.hasCodeGenBits(),
+        .ErrorUnion => return isByRef(ty.errorUnionPayload()),
+        .Optional => {
+            var buf: Type.Payload.ElemType = undefined;
+            return isByRef(ty.optionalChild(&buf));
+        },
+    }
+}
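
Taken together: under the C calling convention, a large struct return is classified `.memory` on x86_64 SysV and therefore goes through the hidden sret parameter. A hedged sketch:

    // 32 bytes: classified `.memory` by classifySystemV, so `firstParamSRet`
    // returns true and the export takes a hidden result pointer.
    const Four = extern struct { a: u64, b: u64, c: u64, d: u64 };

    export fn make_four() Four {
        return Four{ .a = 1, .b = 2, .c = 3, .d = 4 };
    }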
src/Air.zig
@@ -110,6 +110,10 @@ pub const Inst = struct {
         /// Allocates stack local memory.
         /// Uses the `ty` field.
         alloc,
+        /// If the function will pass the result by-ref, this instruction returns the
+        /// result pointer. Otherwise it is equivalent to `alloc`.
+        /// Uses the `ty` field.
+        ret_ptr,
         /// Inline assembly. Uses the `ty_pl` field. Payload is `Asm`.
         assembly,
         /// Bitwise AND. `&`.
@@ -160,6 +164,7 @@ pub const Inst = struct {
         /// Function call.
         /// Result type is the return type of the function being called.
         /// Uses the `pl_op` field with the `Call` payload. operand is the callee.
+        /// Triggers `resolveTypeLayout` on the return type of the callee.
         call,
         /// Count leading zeroes of an integer according to its representation in twos complement.
         /// Result type will always be an unsigned integer big enough to fit the answer.
@@ -257,7 +262,16 @@ pub const Inst = struct {
         /// Return a value from a function.
         /// Result type is always noreturn; no instructions in a block follow this one.
         /// Uses the `un_op` field.
+        /// Triggers `resolveTypeLayout` on the return type.
         ret,
+        /// This instruction communicates that the function's result value is inside
+        /// the operand, which is a pointer. If the function will pass the result by-ref,
+        /// the pointer operand is a `ret_ptr` instruction. Otherwise, this instruction
+        /// is equivalent to a `load` on the operand, followed by a `ret` on the loaded value.
+        /// Result type is always noreturn; no instructions in a block follow this one.
+        /// Uses the `un_op` field.
+        /// Triggers `resolveTypeLayout` on the return type.
+        ret_load,
         /// Write a value to a pointer. LHS is pointer, RHS is value.
         /// Result type is always void.
         /// Uses the `bin_op` field.
@@ -341,6 +355,10 @@ pub const Inst = struct {
         /// Given a slice value, return the pointer.
         /// Uses the `ty_op` field.
         slice_ptr,
+        /// Given an array value and element index, return the element value at that index.
+        /// Result type is the element type of the array operand.
+        /// Uses the `bin_op` field.
+        array_elem_val,
         /// Given a slice value and element index, return the element value at that index.
         /// Result type is the element type of the slice operand.
         /// Uses the `bin_op` field.
@@ -644,7 +662,9 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
 
         .const_ty => return Type.initTag(.type),
 
-        .alloc => return datas[inst].ty,
+        .alloc,
+        .ret_ptr,
+        => return datas[inst].ty,
 
         .assembly,
         .block,
@@ -690,6 +710,7 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
         .cond_br,
         .switch_br,
         .ret,
+        .ret_load,
         .unreach,
         => return Type.initTag(.noreturn),
 
@@ -714,10 +735,14 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
 
         .call => {
             const callee_ty = air.typeOf(datas[inst].pl_op.operand);
-            return callee_ty.fnReturnType();
+            switch (callee_ty.zigTypeTag()) {
+                .Fn => return callee_ty.fnReturnType(),
+                .Pointer => return callee_ty.childType().fnReturnType(),
+                else => unreachable,
+            }
         },
 
-        .slice_elem_val, .ptr_elem_val => {
+        .slice_elem_val, .ptr_elem_val, .array_elem_val => {
             const ptr_ty = air.typeOf(datas[inst].bin_op.lhs);
             return ptr_ty.elemType();
         },
src/codegen.zig
@@ -855,6 +855,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     .shr      => try self.airShr(inst),
 
                     .alloc           => try self.airAlloc(inst),
+                    .ret_ptr         => try self.airRetPtr(inst),
                     .arg             => try self.airArg(inst),
                     .assembly        => try self.airAsm(inst),
                     .bitcast         => try self.airBitCast(inst),
@@ -883,6 +884,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     .not             => try self.airNot(inst),
                     .ptrtoint        => try self.airPtrToInt(inst),
                     .ret             => try self.airRet(inst),
+                    .ret_load        => try self.airRetLoad(inst),
                     .store           => try self.airStore(inst),
                     .struct_field_ptr=> try self.airStructFieldPtr(inst),
                     .struct_field_val=> try self.airStructFieldVal(inst),
@@ -914,6 +916,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
                     .slice_ptr       => try self.airSlicePtr(inst),
                     .slice_len       => try self.airSliceLen(inst),
 
+                    .array_elem_val      => try self.airArrayElemVal(inst),
                     .slice_elem_val      => try self.airSliceElemVal(inst),
                     .ptr_slice_elem_val  => try self.airPtrSliceElemVal(inst),
                     .ptr_elem_val        => try self.airPtrElemVal(inst),
@@ -1185,6 +1188,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             return self.finishAir(inst, .{ .ptr_stack_offset = stack_offset }, .{ .none, .none, .none });
         }
 
+        fn airRetPtr(self: *Self, inst: Air.Inst.Index) !void {
+            const stack_offset = try self.allocMemPtr(inst);
+            return self.finishAir(inst, .{ .ptr_stack_offset = stack_offset }, .{ .none, .none, .none });
+        }
+
         fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
             const ty_op = self.air.instructions.items(.data)[inst].ty_op;
             const result: MCValue = if (self.liveness.isUnused(inst)) .dead else switch (arch) {
@@ -1557,6 +1565,14 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none });
         }
 
+        fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) !void {
+            const bin_op = self.air.instructions.items(.data)[inst].bin_op;
+            const result: MCValue = if (self.liveness.isUnused(inst)) .dead else switch (arch) {
+                else => return self.fail("TODO implement array_elem_val for {}", .{self.target.cpu.arch}),
+            };
+            return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none });
+        }
+
         fn airPtrSliceElemVal(self: *Self, inst: Air.Inst.Index) !void {
             const is_volatile = false; // TODO
             const bin_op = self.air.instructions.items(.data)[inst].bin_op;
@@ -3213,6 +3229,14 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type {
             return self.finishAir(inst, .dead, .{ un_op, .none, .none });
         }
 
+        fn airRetLoad(self: *Self, inst: Air.Inst.Index) !void {
+            const un_op = self.air.instructions.items(.data)[inst].un_op;
+            const ptr = try self.resolveInst(un_op);
+            _ = ptr;
+            return self.fail("TODO implement airRetLoad for {}", .{self.target.cpu.arch});
+            //return self.finishAir(inst, .dead, .{ un_op, .none, .none });
+        }
+
         fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
             const bin_op = self.air.instructions.items(.data)[inst].bin_op;
             if (self.liveness.isUnused(inst))
src/Liveness.zig
@@ -250,6 +250,7 @@ fn analyzeInst(
         .bool_and,
         .bool_or,
         .store,
+        .array_elem_val,
         .slice_elem_val,
         .ptr_slice_elem_val,
         .ptr_elem_val,
@@ -270,6 +271,7 @@ fn analyzeInst(
 
         .arg,
         .alloc,
+        .ret_ptr,
         .constant,
         .const_ty,
         .breakpoint,
@@ -322,6 +324,7 @@ fn analyzeInst(
         .ptrtoint,
         .bool_to_int,
         .ret,
+        .ret_load,
         => {
             const operand = inst_datas[inst].un_op;
             return trackOperands(a, new_set, inst, main_tomb, .{ operand, .none, .none });
src/Module.zig
@@ -785,7 +785,7 @@ pub const Struct = struct {
     /// The Decl that corresponds to the struct itself.
     owner_decl: *Decl,
     /// Set of field names in declaration order.
-    fields: std.StringArrayHashMapUnmanaged(Field),
+    fields: Fields,
     /// Represents the declarations inside this struct.
     namespace: Namespace,
     /// Offset from `owner_decl`, points to the struct AST node.
@@ -805,6 +805,8 @@ pub const Struct = struct {
     /// is necessary to determine whether it has bits at runtime.
     known_has_bits: bool,
 
+    pub const Fields = std.StringArrayHashMapUnmanaged(Field);
+
     /// The `Type` and `Value` memory is owned by the arena of the Struct's owner_decl.
     pub const Field = struct {
         /// Uses `noreturn` to indicate `anytype`.
@@ -935,7 +937,7 @@ pub const Union = struct {
     /// This will be set to the null type until status is `have_field_types`.
     tag_ty: Type,
     /// Set of field names in declaration order.
-    fields: std.StringArrayHashMapUnmanaged(Field),
+    fields: Fields,
     /// Represents the declarations inside this union.
     namespace: Namespace,
     /// Offset from `owner_decl`, points to the union decl AST node.
@@ -958,6 +960,8 @@ pub const Union = struct {
         abi_align: Value,
     };
 
+    pub const Fields = std.StringArrayHashMapUnmanaged(Field);
+
     pub fn getFullyQualifiedName(s: *Union, gpa: *Allocator) ![]u8 {
         return s.owner_decl.getFullyQualifiedName(gpa);
     }
@@ -992,14 +996,18 @@ pub const Union = struct {
 
     pub fn mostAlignedField(u: Union, target: Target) u32 {
         assert(u.haveFieldTypes());
-        var most_alignment: u64 = 0;
+        var most_alignment: u32 = 0;
         var most_index: usize = undefined;
         for (u.fields.values()) |field, i| {
             if (!field.ty.hasCodeGenBits()) continue;
-            const field_align = if (field.abi_align.tag() == .abi_align_default)
-                field.ty.abiAlignment(target)
-            else
-                field.abi_align.toUnsignedInt();
+
+            const field_align = a: {
+                if (field.abi_align.tag() == .abi_align_default) {
+                    break :a field.ty.abiAlignment(target);
+                } else {
+                    break :a @intCast(u32, field.abi_align.toUnsignedInt());
+                }
+            };
             if (field_align > most_alignment) {
                 most_alignment = field_align;
                 most_index = i;
@@ -1007,6 +1015,69 @@ pub const Union = struct {
         }
         return @intCast(u32, most_index);
     }
+
+    pub fn abiAlignment(u: Union, target: Target, have_tag: bool) u32 {
+        var max_align: u32 = 0;
+        if (have_tag) max_align = u.tag_ty.abiAlignment(target);
+        for (u.fields.values()) |field| {
+            if (!field.ty.hasCodeGenBits()) continue;
+
+            const field_align = a: {
+                if (field.abi_align.tag() == .abi_align_default) {
+                    break :a field.ty.abiAlignment(target);
+                } else {
+                    break :a @intCast(u32, field.abi_align.toUnsignedInt());
+                }
+            };
+            max_align = @maximum(max_align, field_align);
+        }
+        assert(max_align != 0);
+        return max_align;
+    }
+
+    pub fn abiSize(u: Union, target: Target, have_tag: bool) u64 {
+        assert(u.haveFieldTypes());
+        const is_packed = u.layout == .Packed;
+        if (is_packed) @panic("TODO packed unions");
+
+        var payload_size: u64 = 0;
+        var payload_align: u32 = 0;
+        for (u.fields.values()) |field| {
+            if (!field.ty.hasCodeGenBits()) continue;
+
+            const field_align = a: {
+                if (field.abi_align.tag() == .abi_align_default) {
+                    break :a field.ty.abiAlignment(target);
+                } else {
+                    break :a @intCast(u32, field.abi_align.toUnsignedInt());
+                }
+            };
+            payload_size = @maximum(payload_size, field.ty.abiSize(target));
+            payload_align = @maximum(payload_align, field_align);
+        }
+        if (!have_tag) {
+            return std.mem.alignForwardGeneric(u64, payload_size, payload_align);
+        }
+        // Put the tag before or after the payload depending on which one's
+        // alignment is greater.
+        const tag_size = u.tag_ty.abiSize(target);
+        const tag_align = u.tag_ty.abiAlignment(target);
+        var size: u64 = 0;
+        if (tag_align >= payload_align) {
+            // {Tag, Payload}
+            size += tag_size;
+            size = std.mem.alignForwardGeneric(u64, size, payload_align);
+            size += payload_size;
+            size = std.mem.alignForwardGeneric(u64, size, tag_align);
+        } else {
+            // {Payload, Tag}
+            size += payload_size;
+            size = std.mem.alignForwardGeneric(u64, size, tag_align);
+            size += tag_size;
+            size = std.mem.alignForwardGeneric(u64, size, payload_align);
+        }
+        return size;
+    }
 };
 
 /// Some Fn struct memory is owned by the Decl's TypedValue.Managed arena allocator.
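
A worked example of the size computation above (a sketch; the exact numbers assume a 1-byte tag and an 8-byte-aligned payload):

    const std = @import("std");

    const U = union(enum) { a: u8, b: u64 };

    test "tagged union abi size" {
        // payload: size 8, align 8; tag: size 1, align 1 -> {Payload, Tag}:
        // 8 + 1 = 9, rounded up to the payload alignment of 8 -> 16.
        try std.testing.expect(@sizeOf(U) == 16);
        try std.testing.expect(@alignOf(U) == 8);
    }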
src/print_air.zig
@@ -128,6 +128,7 @@ const Writer = struct {
             .bool_and,
             .bool_or,
             .store,
+            .array_elem_val,
             .slice_elem_val,
             .ptr_slice_elem_val,
             .ptr_elem_val,
@@ -150,6 +151,7 @@ const Writer = struct {
             .ptrtoint,
             .bool_to_int,
             .ret,
+            .ret_load,
             => try w.writeUnOp(s, inst),
 
             .breakpoint,
@@ -158,6 +160,7 @@ const Writer = struct {
 
             .const_ty,
             .alloc,
+            .ret_ptr,
             => try w.writeTy(s, inst),
 
             .not,
src/Sema.zig
@@ -1814,7 +1814,7 @@ fn zirRetPtr(
         .pointee_type = sema.fn_ret_ty,
         .@"addrspace" = target_util.defaultAddressSpace(sema.mod.getTarget(), .local),
     });
-    return block.addTy(.alloc, ptr_type);
+    return block.addTy(.ret_ptr, ptr_type);
 }
 
 fn zirRef(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
@@ -3331,9 +3331,20 @@ fn analyzeCall(
 ) CompileError!Air.Inst.Ref {
     const mod = sema.mod;
 
-    const func_ty = sema.typeOf(func);
-    if (func_ty.zigTypeTag() != .Fn)
-        return sema.fail(block, func_src, "type '{}' not a function", .{func_ty});
+    const callee_ty = sema.typeOf(func);
+    const func_ty = func_ty: {
+        switch (callee_ty.zigTypeTag()) {
+            .Fn => break :func_ty callee_ty,
+            .Pointer => {
+                const ptr_info = callee_ty.ptrInfo().data;
+                if (ptr_info.size == .One and ptr_info.pointee_type.zigTypeTag() == .Fn) {
+                    break :func_ty ptr_info.pointee_type;
+                }
+            },
+            else => {},
+        }
+        return sema.fail(block, func_src, "type '{}' not a function", .{callee_ty});
+    };
 
     const func_ty_info = func_ty.fnInfo();
     const cc = func_ty_info.cc;
@@ -3393,6 +3404,7 @@ fn analyzeCall(
     const result: Air.Inst.Ref = if (is_inline_call) res: {
         const func_val = try sema.resolveConstValue(block, func_src, func);
         const module_fn = switch (func_val.tag()) {
+            .decl_ref => func_val.castTag(.decl_ref).?.data.val.castTag(.function).?.data,
             .function => func_val.castTag(.function).?.data,
             .extern_fn => return sema.fail(block, call_src, "{s} call of extern function", .{
                 @as([]const u8, if (is_comptime_call) "comptime" else "inline"),
@@ -3610,7 +3622,11 @@ fn analyzeCall(
         break :res res2;
     } else if (func_ty_info.is_generic) res: {
         const func_val = try sema.resolveConstValue(block, func_src, func);
-        const module_fn = func_val.castTag(.function).?.data;
+        const module_fn = switch (func_val.tag()) {
+            .function => func_val.castTag(.function).?.data,
+            .decl_ref => func_val.castTag(.decl_ref).?.data.val.castTag(.function).?.data,
+            else => unreachable,
+        };
         // Check the Module's generic function map with an adapted context, so that we
         // can match against `uncasted_args` rather than doing the work below to create a
         // generic Scope only to junk it if it matches an existing instantiation.
@@ -3880,6 +3896,8 @@ fn analyzeCall(
         }
 
         try sema.requireRuntimeBlock(block, call_src);
+        try sema.resolveTypeLayout(block, call_src, func_ty_info.return_type);
+
         try sema.air_extra.ensureUnusedCapacity(gpa, @typeInfo(Air.Call).Struct.fields.len +
             args.len);
         const func_inst = try block.addInst(.{
@@ -3954,6 +3972,8 @@ fn finishGenericCall(
             }
             total_i += 1;
         }
+
+        try sema.resolveTypeLayout(block, call_src, new_fn_ty.fnReturnType());
     }
     try sema.air_extra.ensureUnusedCapacity(sema.gpa, @typeInfo(Air.Call).Struct.fields.len +
         runtime_args_len);
@@ -4787,7 +4807,12 @@ fn funcCommon(
     }
 
     if (body_inst == 0) {
-        return sema.addType(fn_ty);
+        const fn_ptr_ty = try Type.ptr(sema.arena, .{
+            .pointee_type = fn_ty,
+            .@"addrspace" = .generic,
+            .mutable = false,
+        });
+        return sema.addType(fn_ptr_ty);
     }
 
     const is_inline = fn_ty.fnCallingConvention() == .Inline;
@@ -8366,13 +8391,15 @@ fn zirRetLoad(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Zir
 
     const inst_data = sema.code.instructions.items(.data)[inst].un_node;
     const src = inst_data.src();
-    // TODO: when implementing functions that accept a result location pointer,
-    // this logic will be updated to only do a load in case that the function's return
-    // type in fact does not need a result location pointer. Until then we assume
-    // the `ret_ptr` is the same as an `alloc` and do a load here.
     const ret_ptr = sema.resolveInst(inst_data.operand);
-    const operand = try sema.analyzeLoad(block, src, ret_ptr, src);
-    return sema.analyzeRet(block, operand, src, false);
+
+    if (block.is_comptime or block.inlining != null) {
+        const operand = try sema.analyzeLoad(block, src, ret_ptr, src);
+        return sema.analyzeRet(block, operand, src, false);
+    }
+    try sema.requireRuntimeBlock(block, src);
+    _ = try block.addUnOp(.ret_load, ret_ptr);
+    return always_noreturn;
 }
 
 fn analyzeRet(
@@ -8398,6 +8425,7 @@ fn analyzeRet(
         return always_noreturn;
     }
 
+    try sema.resolveTypeLayout(block, src, sema.fn_ret_ty);
     _ = try block.addUnOp(.ret, operand);
     return always_noreturn;
 }
@@ -8653,56 +8681,76 @@ fn zirStructInitAnon(sema: *Sema, block: *Block, inst: Zir.Inst.Index, is_ref: b
     return sema.fail(block, src, "TODO: Sema.zirStructInitAnon", .{});
 }
 
-fn zirArrayInit(sema: *Sema, block: *Block, inst: Zir.Inst.Index, is_ref: bool) CompileError!Air.Inst.Ref {
+fn zirArrayInit(
+    sema: *Sema,
+    block: *Block,
+    inst: Zir.Inst.Index,
+    is_ref: bool,
+) CompileError!Air.Inst.Ref {
+    const gpa = sema.gpa;
     const inst_data = sema.code.instructions.items(.data)[inst].pl_node;
     const src = inst_data.src();
 
     const extra = sema.code.extraData(Zir.Inst.MultiOp, inst_data.payload_index);
     const args = sema.code.refSlice(extra.end, extra.data.operands_len);
+    assert(args.len != 0);
+
+    const resolved_args = try gpa.alloc(Air.Inst.Ref, args.len);
+    defer gpa.free(resolved_args);
 
-    var resolved_args = try sema.mod.gpa.alloc(Air.Inst.Ref, args.len);
     for (args) |arg, i| resolved_args[i] = sema.resolveInst(arg);
 
-    var all_args_comptime = for (resolved_args) |arg| {
-        if ((try sema.resolveMaybeUndefVal(block, src, arg)) == null) break false;
-    } else true;
+    const elem_ty = sema.typeOf(resolved_args[0]);
+
+    const array_ty = try Type.Tag.array.create(sema.arena, .{
+        .len = resolved_args.len,
+        .elem_type = elem_ty,
+    });
+
+    const opt_runtime_src: ?LazySrcLoc = for (resolved_args) |arg| {
+        const arg_src = src; // TODO better source location
+        const comptime_known = try sema.isComptimeKnown(block, arg_src, arg);
+        if (!comptime_known) break arg_src;
+    } else null;
 
-    if (all_args_comptime) {
+    const runtime_src = opt_runtime_src orelse {
         var anon_decl = try block.startAnonDecl();
         defer anon_decl.deinit();
-        assert(!(resolved_args.len == 0));
-        const final_ty = try Type.Tag.array.create(anon_decl.arena(), .{
-            .len = resolved_args.len,
-            .elem_type = try sema.typeOf(resolved_args[0]).copy(anon_decl.arena()),
-        });
-        const buf = try anon_decl.arena().alloc(Value, resolved_args.len);
+
+        const elem_vals = try anon_decl.arena().alloc(Value, resolved_args.len);
         for (resolved_args) |arg, i| {
-            buf[i] = try (try sema.resolveMaybeUndefVal(block, src, arg)).?.copy(anon_decl.arena());
+            // We checked that all args are comptime above.
+            const arg_val = (sema.resolveMaybeUndefVal(block, src, arg) catch unreachable).?;
+            elem_vals[i] = try arg_val.copy(anon_decl.arena());
         }
 
-        const val = try Value.Tag.array.create(anon_decl.arena(), buf);
-        if (is_ref)
-            return sema.analyzeDeclRef(try anon_decl.finish(final_ty, val))
-        else
-            return sema.analyzeDeclVal(block, .unneeded, try anon_decl.finish(final_ty, val));
-    }
+        const val = try Value.Tag.array.create(anon_decl.arena(), elem_vals);
+        const decl = try anon_decl.finish(try array_ty.copy(anon_decl.arena()), val);
+        if (is_ref) {
+            return sema.analyzeDeclRef(decl);
+        } else {
+            return sema.analyzeDeclVal(block, .unneeded, decl);
+        }
+    };
 
-    assert(!(resolved_args.len == 0));
-    const array_ty = try Type.Tag.array.create(sema.arena, .{ .len = resolved_args.len, .elem_type = sema.typeOf(resolved_args[0]) });
-    const final_ty = try Type.ptr(sema.arena, .{
+    try sema.requireRuntimeBlock(block, runtime_src);
+
+    const alloc_ty = try Type.ptr(sema.arena, .{
         .pointee_type = array_ty,
         .@"addrspace" = target_util.defaultAddressSpace(sema.mod.getTarget(), .local),
     });
-    const alloc = try block.addTy(.alloc, final_ty);
+    const alloc = try block.addTy(.alloc, alloc_ty);
 
     for (resolved_args) |arg, i| {
-        const pointer_to_array_at_index = try block.addBinOp(.ptr_elem_ptr, alloc, try sema.addIntUnsigned(Type.initTag(.u64), i));
-        _ = try block.addBinOp(.store, pointer_to_array_at_index, arg);
+        const index = try sema.addIntUnsigned(Type.initTag(.u64), i);
+        const elem_ptr = try block.addBinOp(.ptr_elem_ptr, alloc, index);
+        _ = try block.addBinOp(.store, elem_ptr, arg);
+    }
+    if (is_ref) {
+        return alloc;
+    } else {
+        return sema.analyzeLoad(block, .unneeded, alloc, .unneeded);
     }
-    return if (is_ref)
-        alloc
-    else
-        try sema.analyzeLoad(block, .unneeded, alloc, .unneeded);
 }
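
The rewritten logic splits array initialization into two paths, roughly as in this sketch:

    const std = @import("std");

    test "array init: comptime vs runtime elements" {
        // All elements comptime-known: becomes an anonymous const Decl.
        const fixed = [_]u8{ 1, 2, 3 };
        // Any runtime element: falls back to alloc + ptr_elem_ptr + store.
        var x: u8 = 9;
        const mixed = [_]u8{ 1, x, 3 };
        try std.testing.expect(fixed[1] == 2 and mixed[1] == 9);
    }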
 
 fn zirArrayInitAnon(sema: *Sema, block: *Block, inst: Zir.Inst.Index, is_ref: bool) CompileError!Air.Inst.Ref {
@@ -10111,7 +10159,8 @@ fn panicWithMsg(
     const arena = sema.arena;
 
     const this_feature_is_implemented_in_the_backend =
-        mod.comp.bin_file.options.object_format == .c;
+        mod.comp.bin_file.options.object_format == .c or
+        mod.comp.bin_file.options.use_llvm;
     if (!this_feature_is_implemented_in_the_backend) {
         // TODO implement this feature in all the backends and then delete this branch
         _ = try block.addNoOp(.breakpoint);
@@ -10579,8 +10628,9 @@ fn fieldCallBind(
                 const struct_ty = try sema.resolveTypeFields(block, src, concrete_ty);
                 const struct_obj = struct_ty.castTag(.@"struct").?.data;
 
-                const field_index = struct_obj.fields.getIndex(field_name) orelse
+                const field_index_usize = struct_obj.fields.getIndex(field_name) orelse
                     break :find_field;
+                const field_index = @intCast(u32, field_index_usize);
                 const field = struct_obj.fields.values()[field_index];
 
                 const ptr_field_ty = try Type.ptr(arena, .{
@@ -10601,33 +10651,7 @@ fn fieldCallBind(
                 }
 
                 try sema.requireRuntimeBlock(block, src);
-                const ptr_inst = ptr_inst: {
-                    const tag: Air.Inst.Tag = switch (field_index) {
-                        0 => .struct_field_ptr_index_0,
-                        1 => .struct_field_ptr_index_1,
-                        2 => .struct_field_ptr_index_2,
-                        3 => .struct_field_ptr_index_3,
-                        else => {
-                            break :ptr_inst try block.addInst(.{
-                                .tag = .struct_field_ptr,
-                                .data = .{ .ty_pl = .{
-                                    .ty = try sema.addType(ptr_field_ty),
-                                    .payload = try sema.addExtra(Air.StructField{
-                                        .struct_operand = object_ptr,
-                                        .field_index = @intCast(u32, field_index),
-                                    }),
-                                } },
-                            });
-                        },
-                    };
-                    break :ptr_inst try block.addInst(.{
-                        .tag = tag,
-                        .data = .{ .ty_op = .{
-                            .ty = try sema.addType(ptr_field_ty),
-                            .operand = object_ptr,
-                        } },
-                    });
-                };
+                const ptr_inst = try block.addStructFieldPtr(object_ptr, field_index, ptr_field_ty);
                 return sema.analyzeLoad(block, src, ptr_inst, src);
             },
             .Union => return sema.fail(block, src, "TODO implement field calls on unions", .{}),
@@ -10982,10 +11006,24 @@ fn elemVal(
                 }
             },
         },
+        .Array => {
+            if (try sema.resolveMaybeUndefVal(block, src, array_maybe_ptr)) |array_val| {
+                const elem_ty = maybe_ptr_ty.childType();
+                const opt_index_val = try sema.resolveDefinedValue(block, elem_index_src, elem_index);
+                if (array_val.isUndef()) return sema.addConstUndef(elem_ty);
+                if (opt_index_val) |index_val| {
+                    const index = @intCast(usize, index_val.toUnsignedInt());
+                    const elem_val = try array_val.elemValue(sema.arena, index);
+                    return sema.addConstant(elem_ty, elem_val);
+                }
+            }
+            try sema.requireRuntimeBlock(block, src);
+            return block.addBinOp(.array_elem_val, array_maybe_ptr, elem_index);
+        },
         else => return sema.fail(
             block,
             array_ptr_src,
-            "expected pointer, found '{}'",
+            "expected pointer or array; found '{}'",
             .{maybe_ptr_ty},
         ),
     }
@@ -11085,6 +11123,14 @@ fn coerce(
             return sema.wrapOptional(block, dest_type, intermediate, inst_src);
         },
         .Pointer => {
+            // Function body to function pointer.
+            if (inst_ty.zigTypeTag() == .Fn) {
+                const fn_val = try sema.resolveConstValue(block, inst_src, inst);
+                const fn_decl = fn_val.castTag(.function).?.data.owner_decl;
+                const inst_as_ptr = try sema.analyzeDeclRef(fn_decl);
+                return sema.coerce(block, dest_type, inst_as_ptr, inst_src);
+            }
+
             // Coercions where the source is a single pointer to an array.
             src_array_ptr: {
                 if (!inst_ty.isSinglePointer()) break :src_array_ptr;
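
The new coercion makes code like the following sketch work, with the caveat from the commit message that `fn () u8` in type position currently denotes the pointer type:

    fn hello() u8 {
        return 42;
    }

    test "function body coerces to function pointer" {
        // `hello` is a function body; coercion takes its owner Decl's address
        // and re-coerces the resulting pointer to the destination type.
        const ptr: fn () u8 = hello;
        try @import("std").testing.expect(ptr() == 42);
    }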
@@ -11411,7 +11457,7 @@ fn storePtr2(
     if (ptr_ty.isConstPtr())
         return sema.fail(block, src, "cannot assign to constant", .{});
 
-    const elem_ty = ptr_ty.elemType();
+    const elem_ty = ptr_ty.childType();
     const operand = try sema.coerce(block, elem_ty, uncasted_operand, operand_src);
     if ((try sema.typeHasOnePossibleValue(block, src, elem_ty)) != null)
         return;
@@ -11429,6 +11475,7 @@ fn storePtr2(
     // TODO handle if the element type requires comptime
 
     try sema.requireRuntimeBlock(block, runtime_src);
+    try sema.resolveTypeLayout(block, src, elem_ty);
     _ = try block.addBinOp(air_tag, ptr, operand);
 }
 
src/type.zig
@@ -1707,32 +1707,10 @@ pub const Type = extern union {
                 const int_tag_ty = self.intTagType(&buffer);
                 return int_tag_ty.abiAlignment(target);
             },
-            .union_tagged => {
-                const union_obj = self.castTag(.union_tagged).?.data;
-                var biggest: u32 = union_obj.tag_ty.abiAlignment(target);
-                for (union_obj.fields.values()) |field| {
-                    if (!field.ty.hasCodeGenBits()) continue;
-                    const field_align = field.ty.abiAlignment(target);
-                    if (field_align > biggest) {
-                        biggest = field_align;
-                    }
-                }
-                assert(biggest != 0);
-                return biggest;
-            },
-            .@"union" => {
-                const union_obj = self.castTag(.@"union").?.data;
-                var biggest: u32 = 0;
-                for (union_obj.fields.values()) |field| {
-                    if (!field.ty.hasCodeGenBits()) continue;
-                    const field_align = field.ty.abiAlignment(target);
-                    if (field_align > biggest) {
-                        biggest = field_align;
-                    }
-                }
-                assert(biggest != 0);
-                return biggest;
-            },
+            // TODO pass `true` for have_tag when unions have a safety tag
+            .@"union" => return self.castTag(.@"union").?.data.abiAlignment(target, false),
+            .union_tagged => return self.castTag(.union_tagged).?.data.abiAlignment(target, true),
+
             .c_void,
             .void,
             .type,
@@ -1790,6 +1768,7 @@ pub const Type = extern union {
                 const is_packed = s.layout == .Packed;
                 if (is_packed) @panic("TODO packed structs");
                 var size: u64 = 0;
+                var big_align: u32 = 0;
                 for (s.fields.values()) |field| {
                     if (!field.ty.hasCodeGenBits()) continue;
 
@@ -1797,12 +1776,14 @@ pub const Type = extern union {
                         if (field.abi_align.tag() == .abi_align_default) {
                             break :a field.ty.abiAlignment(target);
                         } else {
-                            break :a field.abi_align.toUnsignedInt();
+                            break :a @intCast(u32, field.abi_align.toUnsignedInt());
                         }
                     };
+                    big_align = @maximum(big_align, field_align);
                     size = std.mem.alignForwardGeneric(u64, size, field_align);
                     size += field.ty.abiSize(target);
                 }
+                size = std.mem.alignForwardGeneric(u64, size, big_align);
                 return size;
             },
             .enum_simple, .enum_full, .enum_nonexhaustive, .enum_numbered => {
@@ -1810,9 +1791,9 @@ pub const Type = extern union {
                 const int_tag_ty = self.intTagType(&buffer);
                 return int_tag_ty.abiSize(target);
             },
-            .@"union", .union_tagged => {
-                @panic("TODO abiSize unions");
-            },
+            // TODO pass `true` for have_tag when unions have a safety tag
+            .@"union" => return self.castTag(.@"union").?.data.abiSize(target, false),
+            .union_tagged => return self.castTag(.union_tagged).?.data.abiSize(target, true),
 
             .u1,
             .u8,
@@ -2550,6 +2531,11 @@ pub const Type = extern union {
         };
     }
 
+    pub fn unionFields(ty: Type) Module.Union.Fields {
+        const union_obj = ty.cast(Payload.Union).?.data;
+        return union_obj.fields;
+    }
+
     pub fn unionFieldType(ty: Type, enum_tag: Value) Type {
         const union_obj = ty.cast(Payload.Union).?.data;
         const index = union_obj.tag_ty.enumTagFieldIndex(enum_tag).?;
@@ -2657,7 +2643,7 @@ pub const Type = extern union {
         };
     }
 
-    /// Asserts the type is an integer or enum.
+    /// Asserts the type is an integer, enum, or error set.
     pub fn intInfo(self: Type, target: Target) struct { signedness: std.builtin.Signedness, bits: u16 } {
         var ty = self;
         while (true) switch (ty.tag()) {
@@ -2700,6 +2686,11 @@ pub const Type = extern union {
                 return .{ .signedness = .unsigned, .bits = smallestUnsignedBits(field_count - 1) };
             },
 
+            .error_set, .error_set_single, .anyerror, .error_set_inferred => {
+                // TODO revisit this when error sets support custom int types
+                return .{ .signedness = .unsigned, .bits = 16 };
+            },
+
             else => unreachable,
         };
     }
@@ -3151,12 +3142,12 @@ pub const Type = extern union {
 
     /// Asserts the type is an enum or a tagged union.
-    pub fn intTagType(self: Type, buffer: *Payload.Bits) Type {
-        switch (self.tag()) {
-            .enum_full, .enum_nonexhaustive => return self.cast(Payload.EnumFull).?.data.tag_ty,
-            .enum_numbered => return self.castTag(.enum_numbered).?.data.tag_ty,
+    pub fn intTagType(ty: Type, buffer: *Payload.Bits) Type {
+        switch (ty.tag()) {
+            .enum_full, .enum_nonexhaustive => return ty.cast(Payload.EnumFull).?.data.tag_ty,
+            .enum_numbered => return ty.castTag(.enum_numbered).?.data.tag_ty,
             .enum_simple => {
-                const enum_simple = self.castTag(.enum_simple).?.data;
+                const enum_simple = ty.castTag(.enum_simple).?.data;
                 const bits = std.math.log2_int_ceil(usize, enum_simple.fields.count());
                 buffer.* = .{
                     .base = .{ .tag = .int_unsigned },
@@ -3164,6 +3155,7 @@ pub const Type = extern union {
                 };
                 return Type.initPayload(&buffer.base);
             },
+            .union_tagged => return ty.castTag(.union_tagged).?.data.tag_ty.intTagType(buffer),
             else => unreachable,
         }
     }
@@ -3317,6 +3309,16 @@ pub const Type = extern union {
         }
     }
 
+    pub fn structFields(ty: Type) Module.Struct.Fields {
+        switch (ty.tag()) {
+            .@"struct" => {
+                const struct_obj = ty.castTag(.@"struct").?.data;
+                return struct_obj.fields;
+            },
+            else => unreachable,
+        }
+    }
+
     pub fn structFieldCount(ty: Type) usize {
         switch (ty.tag()) {
             .@"struct" => {
@@ -3815,7 +3817,7 @@ pub const Type = extern union {
                 bit_offset: u16 = 0,
                 host_size: u16 = 0,
                 @"allowzero": bool = false,
-                mutable: bool = true, // TODO change this to const, not mutable
+                mutable: bool = true, // TODO rename this to const, not mutable
                 @"volatile": bool = false,
                 size: std.builtin.TypeInfo.Pointer.Size = .One,
             };
test/behavior/bugs/9584.zig
@@ -57,4 +57,5 @@ test "bug 9584" {
         .x = flags,
     };
     try b(&x);
+    comptime if (@sizeOf(A) != 1) unreachable;
 }
test/behavior/array.zig
@@ -50,3 +50,29 @@ test "array literal with inferred length" {
     try expect(hex_mult.len == 4);
     try expect(hex_mult[1] == 256);
 }
+
+test "array dot len const expr" {
+    try expect(comptime x: {
+        break :x some_array.len == 4;
+    });
+}
+
+const ArrayDotLenConstExpr = struct {
+    y: [some_array.len]u8,
+};
+const some_array = [_]u8{ 0, 1, 2, 3 };
+
+test "array literal with specified size" {
+    var array = [2]u8{ 1, 2 };
+    try expect(array[0] == 1);
+    try expect(array[1] == 2);
+}
+
+test "array len field" {
+    var arr = [4]u8{ 0, 0, 0, 0 };
+    var ptr = &arr;
+    try expect(arr.len == 4);
+    comptime try expect(arr.len == 4);
+    try expect(ptr.len == 4);
+    comptime try expect(ptr.len == 4);
+}
test/behavior/array_stage1.zig
@@ -39,17 +39,6 @@ test "void arrays" {
     try expect(array.len == 4);
 }
 
-test "array dot len const expr" {
-    try expect(comptime x: {
-        break :x some_array.len == 4;
-    });
-}
-
-const ArrayDotLenConstExpr = struct {
-    y: [some_array.len]u8,
-};
-const some_array = [_]u8{ 0, 1, 2, 3 };
-
 test "nested arrays" {
     const array_of_strings = [_][]const u8{ "hello", "this", "is", "my", "thing" };
     for (array_of_strings) |s, i| {
@@ -76,24 +65,6 @@ test "set global var array via slice embedded in struct" {
     try expect(s_array[2].b == 3);
 }
 
-test "array literal with specified size" {
-    var array = [2]u8{
-        1,
-        2,
-    };
-    try expect(array[0] == 1);
-    try expect(array[1] == 2);
-}
-
-test "array len field" {
-    var arr = [4]u8{ 0, 0, 0, 0 };
-    var ptr = &arr;
-    try expect(arr.len == 4);
-    comptime try expect(arr.len == 4);
-    try expect(ptr.len == 4);
-    comptime try expect(ptr.len == 4);
-}
-
 test "single-item pointer to array indexing and slicing" {
     try testSingleItemPtrArrayIndexSlice();
     comptime try testSingleItemPtrArrayIndexSlice();
test/behavior/struct.zig
@@ -144,3 +144,11 @@ fn makeBar2(x: i32, y: i32) Bar {
         .y = y,
     };
 }
+
+test "return empty struct from fn" {
+    _ = testReturnEmptyStructFromFn();
+}
+const EmptyStruct2 = struct {};
+fn testReturnEmptyStructFromFn() EmptyStruct2 {
+    return EmptyStruct2{};
+}
test/behavior/struct_stage1.zig
@@ -72,9 +72,6 @@ const EmptyStruct = struct {
     }
 };
 
-test "return empty struct from fn" {
-    _ = testReturnEmptyStructFromFn();
-}
 const EmptyStruct2 = struct {};
 fn testReturnEmptyStructFromFn() EmptyStruct2 {
     return EmptyStruct2{};
test/behavior.zig
@@ -15,7 +15,6 @@ test {
     _ = @import("behavior/bugs/4769_a.zig");
     _ = @import("behavior/bugs/4769_b.zig");
     _ = @import("behavior/bugs/6850.zig");
-    _ = @import("behavior/bugs/9584.zig");
     _ = @import("behavior/call.zig");
     _ = @import("behavior/cast.zig");
     _ = @import("behavior/defer.zig");
@@ -104,6 +103,7 @@ test {
         _ = @import("behavior/bugs/7047.zig");
         _ = @import("behavior/bugs/7003.zig");
         _ = @import("behavior/bugs/7250.zig");
+        _ = @import("behavior/bugs/9584.zig");
         _ = @import("behavior/byteswap.zig");
         _ = @import("behavior/byval_arg_var.zig");
         _ = @import("behavior/call_stage1.zig");