pt: Zcu.PerThread,
air_instructions: std.MultiArrayList(Air.Inst),
air_extra: std.ArrayList(u32),
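/// Feature queries: in the bootstrap compiler (`dev.env == .bootstrap`) only the C
/// backend exists, so its feature set is comptime-known and `has`/`hasAny` fold to
/// comptime constants; in all other builds the set is carried at runtime behind a pointer.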
features: if (switch (dev.env) {
    .bootstrap => @import("../codegen/c.zig").legalizeFeatures(undefined),
    else => null,
}) |bootstrap_features| struct {
    fn init(features: *const Features) @This() {
        assert(features.eql(bootstrap_features.*));
        return .{};
    }
    /// `inline` to propagate comptime-known result.
    inline fn has(_: @This(), comptime feature: Feature) bool {
        return comptime bootstrap_features.contains(feature);
    }
    /// `inline` to propagate comptime-known result.
    inline fn hasAny(_: @This(), comptime features: []const Feature) bool {
        return comptime !bootstrap_features.intersectWith(.initMany(features)).eql(.initEmpty());
    }
} else struct {
    features: *const Features,
    /// `inline` to propagate whether `dev.check` returns.
    inline fn init(features: *const Features) @This() {
        dev.check(.legalize);
        return .{ .features = features };
    }
    fn has(rt: @This(), comptime feature: Feature) bool {
        return rt.features.contains(feature);
    }
    fn hasAny(rt: @This(), comptime features: []const Feature) bool {
        return !rt.features.intersectWith(comptime .initMany(features)).eql(comptime .initEmpty());
    }
},

pub const Feature = enum {
    scalarize_add,
    scalarize_add_safe,
    scalarize_add_optimized,
    scalarize_add_wrap,
    scalarize_add_sat,
    scalarize_sub,
    scalarize_sub_safe,
    scalarize_sub_optimized,
    scalarize_sub_wrap,
    scalarize_sub_sat,
    scalarize_mul,
    scalarize_mul_safe,
    scalarize_mul_optimized,
    scalarize_mul_wrap,
    scalarize_mul_sat,
    scalarize_div_float,
    scalarize_div_float_optimized,
    scalarize_div_trunc,
    scalarize_div_trunc_optimized,
    scalarize_div_floor,
    scalarize_div_floor_optimized,
    scalarize_div_exact,
    scalarize_div_exact_optimized,
    scalarize_rem,
    scalarize_rem_optimized,
    scalarize_mod,
    scalarize_mod_optimized,
    scalarize_max,
    scalarize_min,
    scalarize_add_with_overflow,
    scalarize_sub_with_overflow,
    scalarize_mul_with_overflow,
    scalarize_shl_with_overflow,
    scalarize_bit_and,
    scalarize_bit_or,
    scalarize_shr,
    scalarize_shr_exact,
    scalarize_shl,
    scalarize_shl_exact,
    scalarize_shl_sat,
    scalarize_xor,
    scalarize_not,
    /// Scalarize `bitcast` from or to an array or vector type to `bitcast`s of the elements.
    /// This does not apply if `@bitSizeOf(Elem) == 8 * @sizeOf(Elem)`.
    /// When this feature is enabled, all remaining `bitcast`s can be lowered using the old bitcast
    /// semantics (reinterpret memory) instead of the new bitcast semantics (copy logical bits) and
    /// the behavior will be equivalent. However, the behavior of `@bitSizeOf` on arrays must be
    /// changed in `Type.zig` before enabling this feature to conform to the new bitcast semantics.
    scalarize_bitcast,
    scalarize_clz,
    scalarize_ctz,
    scalarize_popcount,
    scalarize_byte_swap,
    scalarize_bit_reverse,
    scalarize_sqrt,
    scalarize_sin,
    scalarize_cos,
    scalarize_tan,
    scalarize_exp,
    scalarize_exp2,
    scalarize_log,
    scalarize_log2,
    scalarize_log10,
    scalarize_abs,
    scalarize_floor,
    scalarize_ceil,
    scalarize_round,
    scalarize_trunc_float,
    scalarize_neg,
    scalarize_neg_optimized,
    scalarize_cmp_vector,
    scalarize_cmp_vector_optimized,
    scalarize_fptrunc,
    scalarize_fpext,
    scalarize_intcast,
    scalarize_intcast_safe,
    scalarize_trunc,
    scalarize_int_from_float,
    scalarize_int_from_float_optimized,
    scalarize_int_from_float_safe,
    scalarize_int_from_float_optimized_safe,
    scalarize_float_from_int,
    scalarize_reduce,
    scalarize_reduce_optimized,
    scalarize_shuffle_one,
    scalarize_shuffle_two,
    scalarize_select,
    scalarize_mul_add,

    /// Legalize (shift lhs, (splat rhs)) -> (shift lhs, rhs)
    unsplat_shift_rhs,
    /// Legalize reduce of a one element vector to a bitcast.
    reduce_one_elem_to_bitcast,
    /// Legalize splat to a one element vector to a bitcast.
    splat_one_elem_to_bitcast,

    /// Replace `intcast_safe` with an explicit safety check which `call`s the panic function on failure.
    /// Not compatible with `scalarize_intcast_safe`.
    expand_intcast_safe,
    /// Replace `int_from_float_safe` with an explicit safety check which `call`s the panic function on failure.
    /// Not compatible with `scalarize_int_from_float_safe`.
    expand_int_from_float_safe,
    /// Replace `int_from_float_optimized_safe` with an explicit safety check which `call`s the panic function on failure.
    /// Not compatible with `scalarize_int_from_float_optimized_safe`.
    expand_int_from_float_optimized_safe,
    /// Replace `add_safe` with an explicit safety check which `call`s the panic function on failure.
    /// Not compatible with `scalarize_add_safe`.
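    /// For example (illustrative), a scalar `add_safe` becomes an `add_with_overflow`
    /// whose overflow bit feeds a `cond_br` that either continues with the wrapped
    /// result or `call`s the panic function.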
    expand_add_safe,
    /// Replace `sub_safe` with an explicit safety check which `call`s the panic function on failure.
    /// Not compatible with `scalarize_sub_safe`.
    expand_sub_safe,
    /// Replace `mul_safe` with an explicit safety check which `call`s the panic function on failure.
    /// Not compatible with `scalarize_mul_safe`.
    expand_mul_safe,

    /// Replace `load` from a packed pointer with a non-packed `load`, `shr`, `truncate`.
    /// Currently assumes little endian and a specific integer layout where the lsb of every integer is the lsb of the
    /// first byte of memory until bit pointers know their backing type.
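    /// For example (illustrative), loading a `u3` at bit offset 2 within a `u8` host
    /// becomes: a non-packed `load` of the `u8`, `shr` by 2, then `truncate` to `u3`.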
    expand_packed_load,
    /// Replace `store` and `store_safe` to a packed pointer with a non-packed `load`/`store`, `bit_and`, `bit_or`, and `shl`.
    /// Currently assumes little endian and a specific integer layout where the lsb of every integer is the lsb of the
    /// first byte of memory until bit pointers know their backing type.
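    /// For example (illustrative), storing a `u3` at bit offset 2 within a `u8` host
    /// becomes: `load` the host `u8`, `bit_and` with a mask clearing the field's bits,
    /// `shl` the new value by 2, `bit_or` the two together, and `store` the result back.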
    expand_packed_store,
    /// Replace `struct_field_val` of a packed field with a `bitcast` to integer, `shr`, `trunc`, and `bitcast` to field type.
    expand_packed_struct_field_val,
    /// Replace `aggregate_init` of a packed struct with a sequence of `shl_exact`, `bitcast`, `intcast`, and `bit_or`.
    expand_packed_aggregate_init,

    /// Replace all arithmetic operations on 16-bit floating-point types with calls to soft-float
    /// routines in compiler_rt, including `fptrunc`/`fpext`/`float_from_int`/`int_from_float`
    /// where the operand or target type is a 16-bit floating-point type. This feature implies:
    ///
    /// * scalarization of 16-bit float vector operations
    /// * expansion of safety-checked 16-bit float operations
    ///
    /// If this feature is enabled, the following AIR instruction tags may be emitted:
    /// * `.legalize_vec_elem_val`
    /// * `.legalize_vec_store_elem`
    /// * `.legalize_compiler_rt_call`
    soft_f16,
    /// Like `soft_f16`, but for 32-bit floating-point types.
    soft_f32,
    /// Like `soft_f16`, but for 64-bit floating-point types.
    soft_f64,
    /// Like `soft_f16`, but for 80-bit floating-point types.
    soft_f80,
    /// Like `soft_f16`, but for 128-bit floating-point types.
    soft_f128,

    fn scalarize(tag: Air.Inst.Tag) Feature {
        return switch (tag) {
            else => unreachable,
            .add => .scalarize_add,
            .add_safe => .scalarize_add_safe,
            .add_optimized => .scalarize_add_optimized,
            .add_wrap => .scalarize_add_wrap,
            .add_sat => .scalarize_add_sat,
            .sub => .scalarize_sub,
            .sub_safe => .scalarize_sub_safe,
            .sub_optimized => .scalarize_sub_optimized,
            .sub_wrap => .scalarize_sub_wrap,
            .sub_sat => .scalarize_sub_sat,
            .mul => .scalarize_mul,
            .mul_safe => .scalarize_mul_safe,
            .mul_optimized => .scalarize_mul_optimized,
            .mul_wrap => .scalarize_mul_wrap,
            .mul_sat => .scalarize_mul_sat,
            .div_float => .scalarize_div_float,
            .div_float_optimized => .scalarize_div_float_optimized,
            .div_trunc => .scalarize_div_trunc,
            .div_trunc_optimized => .scalarize_div_trunc_optimized,
            .div_floor => .scalarize_div_floor,
            .div_floor_optimized => .scalarize_div_floor_optimized,
            .div_exact => .scalarize_div_exact,
            .div_exact_optimized => .scalarize_div_exact_optimized,
            .rem => .scalarize_rem,
            .rem_optimized => .scalarize_rem_optimized,
            .mod => .scalarize_mod,
            .mod_optimized => .scalarize_mod_optimized,
            .max => .scalarize_max,
            .min => .scalarize_min,
            .add_with_overflow => .scalarize_add_with_overflow,
            .sub_with_overflow => .scalarize_sub_with_overflow,
            .mul_with_overflow => .scalarize_mul_with_overflow,
            .shl_with_overflow => .scalarize_shl_with_overflow,
            .bit_and => .scalarize_bit_and,
            .bit_or => .scalarize_bit_or,
            .shr => .scalarize_shr,
            .shr_exact => .scalarize_shr_exact,
            .shl => .scalarize_shl,
            .shl_exact => .scalarize_shl_exact,
            .shl_sat => .scalarize_shl_sat,
            .xor => .scalarize_xor,
            .not => .scalarize_not,
            .bitcast => .scalarize_bitcast,
            .clz => .scalarize_clz,
            .ctz => .scalarize_ctz,
            .popcount => .scalarize_popcount,
            .byte_swap => .scalarize_byte_swap,
            .bit_reverse => .scalarize_bit_reverse,
            .sqrt => .scalarize_sqrt,
            .sin => .scalarize_sin,
            .cos => .scalarize_cos,
            .tan => .scalarize_tan,
            .exp => .scalarize_exp,
            .exp2 => .scalarize_exp2,
            .log => .scalarize_log,
            .log2 => .scalarize_log2,
            .log10 => .scalarize_log10,
            .abs => .scalarize_abs,
            .floor => .scalarize_floor,
            .ceil => .scalarize_ceil,
            .round => .scalarize_round,
            .trunc_float => .scalarize_trunc_float,
            .neg => .scalarize_neg,
            .neg_optimized => .scalarize_neg_optimized,
            .cmp_vector => .scalarize_cmp_vector,
            .cmp_vector_optimized => .scalarize_cmp_vector_optimized,
            .fptrunc => .scalarize_fptrunc,
            .fpext => .scalarize_fpext,
            .intcast => .scalarize_intcast,
            .intcast_safe => .scalarize_intcast_safe,
            .trunc => .scalarize_trunc,
            .int_from_float => .scalarize_int_from_float,
            .int_from_float_optimized => .scalarize_int_from_float_optimized,
            .int_from_float_safe => .scalarize_int_from_float_safe,
            .int_from_float_optimized_safe => .scalarize_int_from_float_optimized_safe,
            .float_from_int => .scalarize_float_from_int,
            .reduce => .scalarize_reduce,
            .reduce_optimized => .scalarize_reduce_optimized,
            .shuffle_one => .scalarize_shuffle_one,
            .shuffle_two => .scalarize_shuffle_two,
            .select => .scalarize_select,
            .mul_add => .scalarize_mul_add,
        };
    }
};

pub const Features = std.enums.EnumSet(Feature);

pub const Error = std.mem.Allocator.Error;

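/// Entry point: walks every body in `air` and rewrites instructions in place until
/// all enabled legalizations have been applied. A minimal caller sketch (the feature
/// set here is hypothetical; backends normally pass their `legalizeFeatures` result):
///
///     const features: Features = .init(.{ .expand_packed_load = true });
///     try legalize(&air, pt, &features);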
pub fn legalize(air: *Air, pt: Zcu.PerThread, features: *const Features) Error!void {
    assert(!features.eql(comptime .initEmpty())); // backend asked to run legalize, but no features were enabled
    var l: Legalize = .{
        .pt = pt,
        .air_instructions = air.instructions.toMultiArrayList(),
        .air_extra = air.extra,
        .features = .init(features),
    };
    defer air.* = l.getTmpAir();
    const main_extra = l.extraData(Air.Block, l.air_extra.items[@intFromEnum(Air.ExtraIndex.main_block)]);
    try l.legalizeBody(main_extra.end, main_extra.data.body_len);
}

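/// Reconstructs an `Air` value over the working buffers, both so the usual `Air`
/// helpers (`typeOf`, `extraData`, ...) can be reused mid-legalization and so the
/// result can be written back when `legalize` returns.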
fn getTmpAir(l: *const Legalize) Air {
    return .{
        .instructions = l.air_instructions.slice(),
        .extra = l.air_extra,
    };
}

fn typeOf(l: *const Legalize, ref: Air.Inst.Ref) Type {
    return l.getTmpAir().typeOf(ref, &l.pt.zcu.intern_pool);
}

fn typeOfIndex(l: *const Legalize, inst: Air.Inst.Index) Type {
    return l.getTmpAir().typeOfIndex(inst, &l.pt.zcu.intern_pool);
}

fn extraData(l: *const Legalize, comptime T: type, index: usize) @TypeOf(Air.extraData(undefined, T, undefined)) {
    return l.getTmpAir().extraData(T, index);
}

fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void {
    // In zig1, this function needs a lot of eval branch quota, because all of the inlined feature
    // checks are comptime-evaluated (to ensure unused features are not included in the binary).
    @setEvalBranchQuota(4000);

    const zcu = l.pt.zcu;
    const ip = &zcu.intern_pool;
    for (0..body_len) |body_index| {
        const inst: Air.Inst.Index = @enumFromInt(l.air_extra.items[body_start + body_index]);
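        // Labeled switch: `replaceInst` returns the tag of the replacement instruction,
        // and `continue :inst` re-dispatches on it so the rewritten instruction is
        // itself legalized before the loop moves on.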
        inst: switch (l.air_instructions.items(.tag)[@intFromEnum(inst)]) {
            .arg => {},
            inline .add,
            .add_optimized,
            .sub,
            .sub_optimized,
            .mul,
            .mul_optimized,
            .div_float,
            .div_float_optimized,
            .div_exact,
            .div_exact_optimized,
            .rem,
            .rem_optimized,
            .min,
            .max,
            => |air_tag| {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                const ty = l.typeOf(bin_op.lhs);
                switch (l.wantScalarizeOrSoftFloat(air_tag, ty)) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)),
                    .soft_float => continue :inst try l.compilerRtCall(
                        inst,
                        softFloatFunc(air_tag, ty, zcu),
                        &.{ bin_op.lhs, bin_op.rhs },
                        l.typeOf(bin_op.lhs),
                    ),
                }
            },
            inline .div_trunc,
            .div_trunc_optimized,
            .div_floor,
            .div_floor_optimized,
            => |air_tag| {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                switch (l.wantScalarizeOrSoftFloat(air_tag, l.typeOf(bin_op.lhs))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)),
                    .soft_float => continue :inst l.replaceInst(inst, .block, try l.softFloatDivTruncFloorBlockPayload(
                        inst,
                        bin_op.lhs,
                        bin_op.rhs,
                        air_tag,
                    )),
                }
            },
            inline .mod, .mod_optimized => |air_tag| {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                switch (l.wantScalarizeOrSoftFloat(air_tag, l.typeOf(bin_op.lhs))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)),
                    .soft_float => continue :inst l.replaceInst(inst, .block, try l.softFloatModBlockPayload(
                        inst,
                        bin_op.lhs,
                        bin_op.rhs,
                    )),
                }
            },
            inline .add_wrap,
            .add_sat,
            .sub_wrap,
            .sub_sat,
            .mul_wrap,
            .mul_sat,
            .bit_and,
            .bit_or,
            .xor,
            => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                if (l.typeOf(bin_op.lhs).isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op));
                }
            },
            .add_safe => if (l.features.has(.expand_add_safe)) {
                assert(!l.features.has(.scalarize_add_safe)); // it doesn't make sense to do both
                continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .add_with_overflow));
            } else if (l.features.has(.scalarize_add_safe)) {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                if (l.typeOf(bin_op.lhs).isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op));
                }
            },
            .sub_safe => if (l.features.has(.expand_sub_safe)) {
                assert(!l.features.has(.scalarize_sub_safe)); // it doesn't make sense to do both
                continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .sub_with_overflow));
            } else if (l.features.has(.scalarize_sub_safe)) {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                if (l.typeOf(bin_op.lhs).isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op));
                }
            },
            .mul_safe => if (l.features.has(.expand_mul_safe)) {
                assert(!l.features.has(.scalarize_mul_safe)); // it doesn't make sense to do both
                continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .mul_with_overflow));
            } else if (l.features.has(.scalarize_mul_safe)) {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                if (l.typeOf(bin_op.lhs).isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op));
                }
            },
            .ptr_add, .ptr_sub => {},
            inline .add_with_overflow,
            .sub_with_overflow,
            .mul_with_overflow,
            .shl_with_overflow,
            => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                if (ty_pl.ty.toType().fieldType(0, zcu).isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeOverflowBlockPayload(inst));
                }
            },
            .alloc => {},
            .inferred_alloc, .inferred_alloc_comptime => unreachable,
            .ret_ptr, .assembly => {},
            inline .shr,
            .shr_exact,
            .shl,
            .shl_exact,
            .shl_sat,
            => |air_tag| if (l.features.hasAny(&.{
                .unsplat_shift_rhs,
                .scalarize(air_tag),
            })) {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                if (l.typeOf(bin_op.rhs).isVector(zcu)) {
                    if (l.features.has(.unsplat_shift_rhs)) {
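                        // The rhs may be a comptime-known splat (an aggregate with
                        // `repeated_elem` storage) or a runtime `splat` instruction;
                        // either way, shift by the scalar operand instead.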
                        if (bin_op.rhs.toInterned()) |rhs_ip_index| switch (ip.indexToKey(rhs_ip_index)) {
                            else => {},
                            .aggregate => |aggregate| switch (aggregate.storage) {
                                else => {},
                                .repeated_elem => |splat| continue :inst l.replaceInst(inst, air_tag, .{ .bin_op = .{
                                    .lhs = bin_op.lhs,
                                    .rhs = Air.internedToRef(splat),
                                } }),
                            },
                        } else {
                            const rhs_inst = bin_op.rhs.toIndex().?;
                            switch (l.air_instructions.items(.tag)[@intFromEnum(rhs_inst)]) {
                                else => {},
                                .splat => continue :inst l.replaceInst(inst, air_tag, .{ .bin_op = .{
                                    .lhs = bin_op.lhs,
                                    .rhs = l.air_instructions.items(.data)[@intFromEnum(rhs_inst)].ty_op.operand,
                                } }),
                            }
                        }
                    }
                    if (l.features.has(comptime .scalarize(air_tag))) {
                        continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op));
                    }
                }
            },
            inline .not,
            .clz,
            .ctz,
            .popcount,
            .byte_swap,
            .bit_reverse,
            .intcast,
            .trunc,
            => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                if (ty_op.ty.toType().isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op));
                }
            },
            .abs => {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                switch (l.wantScalarizeOrSoftFloat(.abs, ty_op.ty.toType())) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)),
                    .soft_float => continue :inst try l.compilerRtCall(
                        inst,
                        softFloatFunc(.abs, ty_op.ty.toType(), zcu),
                        &.{ty_op.operand},
                        ty_op.ty.toType(),
                    ),
                }
            },
            .fptrunc => {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                const src_ty = l.typeOf(ty_op.operand);
                const dest_ty = ty_op.ty.toType();
                if (src_ty.zigTypeTag(zcu) == .vector) {
                    if (l.features.has(.scalarize_fptrunc) or
                        l.wantSoftFloatScalar(src_ty.childType(zcu)) or
                        l.wantSoftFloatScalar(dest_ty.childType(zcu)))
                    {
                        continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op));
                    }
                } else if (l.wantSoftFloatScalar(src_ty) or l.wantSoftFloatScalar(dest_ty)) {
                    continue :inst try l.compilerRtCall(inst, l.softFptruncFunc(src_ty, dest_ty), &.{ty_op.operand}, dest_ty);
                }
            },
            .fpext => {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                const src_ty = l.typeOf(ty_op.operand);
                const dest_ty = ty_op.ty.toType();
                if (src_ty.zigTypeTag(zcu) == .vector) {
                    if (l.features.has(.scalarize_fpext) or
                        l.wantSoftFloatScalar(src_ty.childType(zcu)) or
                        l.wantSoftFloatScalar(dest_ty.childType(zcu)))
                    {
                        continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op));
                    }
                } else if (l.wantSoftFloatScalar(src_ty) or l.wantSoftFloatScalar(dest_ty)) {
                    continue :inst try l.compilerRtCall(inst, l.softFpextFunc(src_ty, dest_ty), &.{ty_op.operand}, dest_ty);
                }
            },
            inline .int_from_float, .int_from_float_optimized => |air_tag| {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                switch (l.wantScalarizeOrSoftFloat(air_tag, l.typeOf(ty_op.operand))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)),
                    .soft_float => switch (try l.softIntFromFloat(inst)) {
                        .call => |func| continue :inst try l.compilerRtCall(inst, func, &.{ty_op.operand}, ty_op.ty.toType()),
                        .block_payload => |data| continue :inst l.replaceInst(inst, .block, data),
                    },
                }
            },
            .float_from_int => {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                const dest_ty = ty_op.ty.toType();
                switch (l.wantScalarizeOrSoftFloat(.float_from_int, dest_ty)) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)),
                    .soft_float => switch (try l.softFloatFromInt(inst)) {
                        .call => |func| continue :inst try l.compilerRtCall(inst, func, &.{ty_op.operand}, dest_ty),
                        .block_payload => |data| continue :inst l.replaceInst(inst, .block, data),
                    },
                }
            },
            .bitcast => if (l.features.has(.scalarize_bitcast)) {
                if (try l.scalarizeBitcastBlockPayload(inst)) |payload| {
                    continue :inst l.replaceInst(inst, .block, payload);
                }
            },
            .intcast_safe => if (l.features.has(.expand_intcast_safe)) {
                assert(!l.features.has(.scalarize_intcast_safe)); // it doesn't make sense to do both
                continue :inst l.replaceInst(inst, .block, try l.safeIntcastBlockPayload(inst));
            } else if (l.features.has(.scalarize_intcast_safe)) {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                if (ty_op.ty.toType().isVector(zcu)) {
                    continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op));
                }
            },
            inline .int_from_float_safe,
            .int_from_float_optimized_safe,
            => |air_tag| {
                const optimized = air_tag == .int_from_float_optimized_safe;
                const expand_feature = switch (air_tag) {
                    .int_from_float_safe => .expand_int_from_float_safe,
                    .int_from_float_optimized_safe => .expand_int_from_float_optimized_safe,
                    else => unreachable,
                };
                if (l.features.has(expand_feature)) {
                    assert(!l.features.has(.scalarize(air_tag)));
                    continue :inst l.replaceInst(inst, .block, try l.safeIntFromFloatBlockPayload(inst, optimized));
                }
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                switch (l.wantScalarizeOrSoftFloat(air_tag, l.typeOf(ty_op.operand))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)),
                    // Expand the safety check so that soft-float can rewrite the unchecked operation.
                    .soft_float => continue :inst l.replaceInst(inst, .block, try l.safeIntFromFloatBlockPayload(inst, optimized)),
                }
            },
            .block, .loop => {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                const extra = l.extraData(Air.Block, ty_pl.payload);
                try l.legalizeBody(extra.end, extra.data.body_len);
            },
            .repeat,
            .br,
            .trap,
            .breakpoint,
            .ret_addr,
            .frame_addr,
            .call,
            .call_always_tail,
            .call_never_tail,
            .call_never_inline,
            => {},
            inline .sqrt,
            .sin,
            .cos,
            .tan,
            .exp,
            .exp2,
            .log,
            .log2,
            .log10,
            .floor,
            .ceil,
            .round,
            .trunc_float,
            => |air_tag| {
                const operand = l.air_instructions.items(.data)[@intFromEnum(inst)].un_op;
                const ty = l.typeOf(operand);
                switch (l.wantScalarizeOrSoftFloat(air_tag, ty)) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .un_op)),
                    .soft_float => continue :inst try l.compilerRtCall(
                        inst,
                        softFloatFunc(air_tag, ty, zcu),
                        &.{operand},
                        l.typeOf(operand),
                    ),
                }
            },
            inline .neg, .neg_optimized => |air_tag| {
                const operand = l.air_instructions.items(.data)[@intFromEnum(inst)].un_op;
                switch (l.wantScalarizeOrSoftFloat(air_tag, l.typeOf(operand))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .un_op)),
                    .soft_float => continue :inst l.replaceInst(inst, .block, try l.softFloatNegBlockPayload(inst, operand)),
                }
            },
            .cmp_lt,
            .cmp_lt_optimized,
            .cmp_lte,
            .cmp_lte_optimized,
            .cmp_eq,
            .cmp_eq_optimized,
            .cmp_gte,
            .cmp_gte_optimized,
            .cmp_gt,
            .cmp_gt_optimized,
            .cmp_neq,
            .cmp_neq_optimized,
            => |air_tag| {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                const ty = l.typeOf(bin_op.lhs);
                if (l.wantSoftFloatScalar(ty)) {
                    continue :inst l.replaceInst(
                        inst,
                        .block,
                        try l.softFloatCmpBlockPayload(inst, ty, air_tag.toCmpOp().?, bin_op.lhs, bin_op.rhs),
                    );
                }
            },
            inline .cmp_vector, .cmp_vector_optimized => |air_tag| {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                const payload = l.extraData(Air.VectorCmp, ty_pl.payload).data;
                switch (l.wantScalarizeOrSoftFloat(air_tag, l.typeOf(payload.lhs))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .cmp_vector)),
                    .soft_float => unreachable, // the operand is not a scalar
                }
            },
            .cond_br => {
                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
                const extra = l.extraData(Air.CondBr, pl_op.payload);
                try l.legalizeBody(extra.end, extra.data.then_body_len);
                try l.legalizeBody(extra.end + extra.data.then_body_len, extra.data.else_body_len);
            },
            .switch_br, .loop_switch_br => {
                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
                const extra = l.extraData(Air.SwitchBr, pl_op.payload);
                const hint_bag_count = std.math.divCeil(usize, extra.data.cases_len + 1, 10) catch unreachable;
                var extra_index = extra.end + hint_bag_count;
                for (0..extra.data.cases_len) |_| {
                    const case_extra = l.extraData(Air.SwitchBr.Case, extra_index);
                    const case_body_start = case_extra.end + case_extra.data.items_len + case_extra.data.ranges_len * 2;
                    try l.legalizeBody(case_body_start, case_extra.data.body_len);
                    extra_index = case_body_start + case_extra.data.body_len;
                }
                try l.legalizeBody(extra_index, extra.data.else_body_len);
            },
            .switch_dispatch => {},
            .@"try", .try_cold => {
                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
                const extra = l.extraData(Air.Try, pl_op.payload);
                try l.legalizeBody(extra.end, extra.data.body_len);
            },
            .try_ptr, .try_ptr_cold => {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                const extra = l.extraData(Air.TryPtr, ty_pl.payload);
                try l.legalizeBody(extra.end, extra.data.body_len);
            },
            .dbg_stmt, .dbg_empty_stmt => {},
            .dbg_inline_block => {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                const extra = l.extraData(Air.DbgInlineBlock, ty_pl.payload);
                try l.legalizeBody(extra.end, extra.data.body_len);
            },
            .dbg_var_ptr,
            .dbg_var_val,
            .dbg_arg_inline,
            .is_null,
            .is_non_null,
            .is_null_ptr,
            .is_non_null_ptr,
            .is_err,
            .is_non_err,
            .is_err_ptr,
            .is_non_err_ptr,
            .bool_and,
            .bool_or,
            => {},
            .load => if (l.features.has(.expand_packed_load)) {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                const ptr_info = l.typeOf(ty_op.operand).ptrInfo(zcu);
                if (ptr_info.packed_offset.host_size > 0 and ptr_info.flags.vector_index == .none) {
                    continue :inst l.replaceInst(inst, .block, try l.packedLoadBlockPayload(inst));
                }
            },
            .ret, .ret_safe, .ret_load => {},
            .store, .store_safe => if (l.features.has(.expand_packed_store)) {
                const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op;
                const ptr_info = l.typeOf(bin_op.lhs).ptrInfo(zcu);
                if (ptr_info.packed_offset.host_size > 0 and ptr_info.flags.vector_index == .none) {
                    continue :inst l.replaceInst(inst, .block, try l.packedStoreBlockPayload(inst));
                }
            },
            .unreach,
            .optional_payload,
            .optional_payload_ptr,
            .optional_payload_ptr_set,
            .wrap_optional,
            .unwrap_errunion_payload,
            .unwrap_errunion_err,
            .unwrap_errunion_payload_ptr,
            .unwrap_errunion_err_ptr,
            .errunion_payload_ptr_set,
            .wrap_errunion_payload,
            .wrap_errunion_err,
            .struct_field_ptr,
            .struct_field_ptr_index_0,
            .struct_field_ptr_index_1,
            .struct_field_ptr_index_2,
            .struct_field_ptr_index_3,
            => {},
            .struct_field_val => if (l.features.has(.expand_packed_struct_field_val)) {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                const extra = l.extraData(Air.StructField, ty_pl.payload).data;
                switch (l.typeOf(extra.struct_operand).containerLayout(zcu)) {
                    .auto, .@"extern" => {},
                    .@"packed" => continue :inst l.replaceInst(inst, .block, try l.packedStructFieldValBlockPayload(inst)),
                }
            },
            .set_union_tag,
            .get_union_tag,
            .slice,
            .slice_len,
            .slice_ptr,
            .ptr_slice_len_ptr,
            .ptr_slice_ptr_ptr,
            .array_elem_val,
            .slice_elem_val,
            .slice_elem_ptr,
            .ptr_elem_val,
            .ptr_elem_ptr,
            .array_to_slice,
            => {},
            inline .reduce, .reduce_optimized => |air_tag| {
                const reduce = l.air_instructions.items(.data)[@intFromEnum(inst)].reduce;
                const vector_ty = l.typeOf(reduce.operand);
                if (l.features.has(.reduce_one_elem_to_bitcast)) {
                    switch (vector_ty.vectorLen(zcu)) {
                        0 => unreachable,
                        1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
                            .ty = .fromType(vector_ty.childType(zcu)),
                            .operand = reduce.operand,
                        } }),
                        else => {},
                    }
                }
                switch (l.wantScalarizeOrSoftFloat(air_tag, vector_ty)) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(
                        inst,
                        .block,
                        try l.scalarizeReduceBlockPayload(inst, air_tag == .reduce_optimized),
                    ),
                    .soft_float => unreachable, // the operand is not a scalar
                }
            },
            .splat => if (l.features.has(.splat_one_elem_to_bitcast)) {
                const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op;
                switch (ty_op.ty.toType().vectorLen(zcu)) {
                    0 => unreachable,
                    1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
                        .ty = ty_op.ty,
                        .operand = ty_op.operand,
                    } }),
                    else => {},
                }
            },
            .shuffle_one => {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                switch (l.wantScalarizeOrSoftFloat(.shuffle_one, ty_pl.ty.toType())) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeShuffleOneBlockPayload(inst)),
                    .soft_float => unreachable, // the operand is not a scalar
                }
            },
            .shuffle_two => {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                switch (l.wantScalarizeOrSoftFloat(.shuffle_two, ty_pl.ty.toType())) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeShuffleTwoBlockPayload(inst)),
                    .soft_float => unreachable, // the operand is not a scalar
                }
            },
            .select => {
                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
                const bin = l.extraData(Air.Bin, pl_op.payload).data;
                switch (l.wantScalarizeOrSoftFloat(.select, l.typeOf(bin.lhs))) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .select)),
                    .soft_float => unreachable, // the operand is not a scalar
                }
            },
            .memset,
            .memset_safe,
            .memcpy,
            .memmove,
            .cmpxchg_weak,
            .cmpxchg_strong,
            .atomic_load,
            .atomic_store_unordered,
            .atomic_store_monotonic,
            .atomic_store_release,
            .atomic_store_seq_cst,
            .atomic_rmw,
            .is_named_enum_value,
            .tag_name,
            .error_name,
            .error_set_has_value,
            => {},
            .aggregate_init => if (l.features.has(.expand_packed_aggregate_init)) {
                const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl;
                const agg_ty = ty_pl.ty.toType();
                switch (agg_ty.zigTypeTag(zcu)) {
                    else => {},
                    .@"union" => unreachable,
                    .@"struct" => switch (agg_ty.containerLayout(zcu)) {
                        .auto, .@"extern" => {},
                        .@"packed" => switch (agg_ty.structFieldCount(zcu)) {
                            0 => unreachable,
                            // An `aggregate_init` of a packed struct with 1 field is just a fancy bitcast.
                            1 => continue :inst l.replaceInst(inst, .bitcast, .{ .ty_op = .{
                                .ty = .fromType(agg_ty),
                                .operand = @enumFromInt(l.air_extra.items[ty_pl.payload]),
                            } }),
                            else => continue :inst l.replaceInst(inst, .block, try l.packedAggregateInitBlockPayload(inst)),
                        },
                    },
                }
            },
            .union_init, .prefetch => {},
            .mul_add => {
                const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op;
                const ty = l.typeOf(pl_op.operand);
                switch (l.wantScalarizeOrSoftFloat(.mul_add, ty)) {
                    .none => {},
                    .scalarize => continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .pl_op_bin)),
                    .soft_float => {
                        const bin = l.extraData(Air.Bin, pl_op.payload).data;
                        const func = softFloatFunc(.mul_add, ty, zcu);
                        continue :inst try l.compilerRtCall(inst, func, &.{ bin.lhs, bin.rhs, pl_op.operand }, ty);
                    },
                }
            },
            .field_parent_ptr,
            .wasm_memory_size,
            .wasm_memory_grow,
            .cmp_lt_errors_len,
            .err_return_trace,
            .set_err_return_trace,
            .addrspace_cast,
            .save_err_return_trace_index,
            .runtime_nav_ptr,
            .c_va_arg,
            .c_va_copy,
            .c_va_end,
            .c_va_start,
            .work_item_id,
            .work_group_size,
            .work_group_id,
            .legalize_vec_elem_val,
            .legalize_vec_store_elem,
            .legalize_compiler_rt_call,
            => {},
        }
    }
}

const ScalarizeForm = enum { un_op, ty_op, bin_op, pl_op_bin, cmp_vector, select };
fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, form: ScalarizeForm) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;

    const orig = l.air_instructions.get(@intFromEnum(orig_inst));
    const res_ty = l.typeOfIndex(orig_inst);
    const result_is_array = switch (res_ty.zigTypeTag(zcu)) {
        .vector => false,
        .array => true,
        else => unreachable,
    };
    const res_len = res_ty.arrayLen(zcu);
    const res_elem_ty = res_ty.childType(zcu);

    if (result_is_array) {
        // This is only allowed when legalizing an elementwise bitcast.
        assert(orig.tag == .bitcast);
        assert(form == .ty_op);
    }

    // Our output will be a loop doing elementwise stores:
    //
    // %1 = block(@Vector(N, Scalar), {
    //   %2 = alloc(*usize)
    //   %3 = alloc(*@Vector(N, Scalar))
    //   %4 = store(%2, @zero_usize)
    //   %5 = loop({
    //     %6 = load(%2)
    //     %7 = <scalar result of operation at index %6>
    //     %8 = legalize_vec_store_elem(%3, %6, %7)
    //     %9 = cmp_eq(%6, <usize, N-1>)
    //     %10 = cond_br(%9, {
    //       %11 = load(%3)
    //       %12 = br(%1, %11)
    //     }, {
    //       %13 = add(%6, @one_usize)
    //       %14 = store(%2, %13)
    //       %15 = repeat(%5)
    //     })
    //   })
    // })
    //
    // If scalarizing an elementwise bitcast, the result might be an array, in which case
    // `legalize_vec_store_elem` becomes two instructions (`ptr_elem_ptr` and `store`).
    // Therefore, there are 13 or 14 instructions in the block, plus however many are
    // needed to compute each result element for `form`.
    const inst_per_form: usize = switch (form) {
        .un_op, .ty_op => 2,
        .bin_op, .cmp_vector => 3,
        .pl_op_bin => 4,
        .select => 7,
    };
    const max_inst_per_form = 7; // maximum value in the above switch
    var inst_buf: [14 + max_inst_per_form]Air.Inst.Index = undefined;

    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);

    const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef();
    const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(res_ty)).toRef();

    _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize);

    var loop: Loop = .init(l, &main_block);
    loop.block = .init(main_block.stealRemainingCapacity());

    const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
    const elem_val: Air.Inst.Ref = switch (form) {
        .un_op => elem: {
            const orig_operand = orig.data.un_op;
            const operand = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operand, index_val).toRef();
            break :elem loop.block.addUnOp(l, orig.tag, operand).toRef();
        },
        .ty_op => elem: {
            const orig_operand = orig.data.ty_op.operand;
            const operand_is_array = switch (l.typeOf(orig_operand).zigTypeTag(zcu)) {
                .vector => false,
                .array => true,
                else => unreachable,
            };
            const operand = loop.block.addBinOp(
                l,
                if (operand_is_array) .array_elem_val else .legalize_vec_elem_val,
                orig_operand,
                index_val,
            ).toRef();
            break :elem loop.block.addTyOp(l, orig.tag, res_elem_ty, operand).toRef();
        },
        .bin_op => elem: {
            const orig_bin = orig.data.bin_op;
            const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.lhs, index_val).toRef();
            const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.rhs, index_val).toRef();
            break :elem loop.block.addBinOp(l, orig.tag, lhs, rhs).toRef();
        },
        .pl_op_bin => elem: {
            const orig_operand = orig.data.pl_op.operand;
            const orig_bin = l.extraData(Air.Bin, orig.data.pl_op.payload).data;
            const operand = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operand, index_val).toRef();
            const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.lhs, index_val).toRef();
            const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.rhs, index_val).toRef();
            break :elem loop.block.add(l, .{
                .tag = orig.tag,
                .data = .{ .pl_op = .{
                    .operand = operand,
                    .payload = try l.addExtra(Air.Bin, .{ .lhs = lhs, .rhs = rhs }),
                } },
            }).toRef();
        },
        .cmp_vector => elem: {
            const orig_payload = l.extraData(Air.VectorCmp, orig.data.ty_pl.payload).data;
            const cmp_op = orig_payload.compareOperator();
            const optimized = switch (orig.tag) {
                .cmp_vector => false,
                .cmp_vector_optimized => true,
                else => unreachable,
            };
            const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_payload.lhs, index_val).toRef();
            const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_payload.rhs, index_val).toRef();
            break :elem loop.block.addCmpScalar(l, cmp_op, lhs, rhs, optimized).toRef();
        },
        .select => elem: {
            const orig_cond = orig.data.pl_op.operand;
            const orig_bin = l.extraData(Air.Bin, orig.data.pl_op.payload).data;

            const elem_block_inst = loop.block.add(l, .{
                .tag = .block,
                .data = .{ .ty_pl = .{
                    .ty = .fromType(res_elem_ty),
                    .payload = undefined,
                } },
            });
            var elem_block: Block = .init(loop.block.stealCapacity(2));
            const cond = elem_block.addBinOp(l, .legalize_vec_elem_val, orig_cond, index_val).toRef();

            var condbr: CondBr = .init(l, cond, &elem_block, .{});

            condbr.then_block = .init(loop.block.stealCapacity(2));
            const lhs = condbr.then_block.addBinOp(l, .legalize_vec_elem_val, orig_bin.lhs, index_val).toRef();
            condbr.then_block.addBr(l, elem_block_inst, lhs);

            condbr.else_block = .init(loop.block.stealCapacity(2));
            const rhs = condbr.else_block.addBinOp(l, .legalize_vec_elem_val, orig_bin.rhs, index_val).toRef();
            condbr.else_block.addBr(l, elem_block_inst, rhs);

            try condbr.finish(l);

            const inst_data = l.air_instructions.items(.data);
            inst_data[@intFromEnum(elem_block_inst)].ty_pl.payload = try l.addBlockBody(elem_block.body());

            break :elem elem_block_inst.toRef();
        },
    };
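    // Discard the loop-body slots reserved for the widest form but not used by this
    // one, so the fixed-size buffer accounting below stays exact.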
    _ = loop.block.stealCapacity(max_inst_per_form - inst_per_form);
    if (result_is_array) {
        const elem_ptr = loop.block.add(l, .{
            .tag = .ptr_elem_ptr,
            .data = .{ .ty_pl = .{
                .ty = .fromType(try pt.singleMutPtrType(res_elem_ty)),
                .payload = try l.addExtra(Air.Bin, .{
                    .lhs = result_ptr,
                    .rhs = index_val,
                }),
            } },
        }).toRef();
        _ = loop.block.addBinOp(l, .store, elem_ptr, elem_val);
    } else {
        _ = loop.block.add(l, .{
            .tag = .legalize_vec_store_elem,
            .data = .{ .pl_op = .{
                .operand = result_ptr,
                .payload = try l.addExtra(Air.Bin, .{
                    .lhs = index_val,
                    .rhs = elem_val,
                }),
            } },
        });
        _ = loop.block.stealCapacity(1);
    }
    const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, res_len - 1))).toRef();

    var condbr: CondBr = .init(l, is_end_val, &loop.block, .{});
    condbr.then_block = .init(loop.block.stealRemainingCapacity());
    const result_val = condbr.then_block.addTyOp(l, .load, res_ty, result_ptr).toRef();
    condbr.then_block.addBr(l, orig_inst, result_val);

    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
    const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef();
    _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val);
    _ = condbr.else_block.add(l, .{
        .tag = .repeat,
        .data = .{ .repeat = .{ .loop_inst = loop.inst } },
    });

    try condbr.finish(l);

    try loop.finish(l);

    return .{ .ty_pl = .{
        .ty = .fromType(res_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}
fn scalarizeShuffleOneBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;
    const gpa = zcu.gpa;

    const shuffle = l.getTmpAir().unwrapShuffleOne(zcu, orig_inst);

    // We're going to emit something like this:
    //
    //   var x: @Vector(N, T) = all_comptime_known_elems;
    //   for (out_idxs, in_idxs) |i, j| x[i] = operand[j];
    //
    // So we must first compute `out_idxs` and `in_idxs`.

    var sfba_state = std.heap.stackFallback(512, gpa);
    const sfba = sfba_state.get();

    const out_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len);
    defer sfba.free(out_idxs_buf);

    const in_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len);
    defer sfba.free(in_idxs_buf);

    var n: usize = 0;
    for (shuffle.mask, 0..) |mask, out_idx| switch (mask.unwrap()) {
        .value => {},
        .elem => |in_idx| {
            out_idxs_buf[n] = (try pt.intValue(.usize, out_idx)).toIntern();
            in_idxs_buf[n] = (try pt.intValue(.usize, in_idx)).toIntern();
            n += 1;
        },
    };

    const init_val: Value = init: {
        const undef_val = try pt.undefValue(shuffle.result_ty.childType(zcu));
        const elems = try sfba.alloc(InternPool.Index, shuffle.mask.len);
        defer sfba.free(elems);
        for (shuffle.mask, elems) |mask, *elem| elem.* = switch (mask.unwrap()) {
            .value => |ip_index| ip_index,
            .elem => undef_val.toIntern(),
        };
        break :init try pt.aggregateValue(shuffle.result_ty, elems);
    };

    // %1 = block(@Vector(N, T), {
    //   %2 = alloc(*@Vector(N, T))
    //   %3 = alloc(*usize)
    //   %4 = store(%2, <init_val>)
    //   %5 = [addScalarizedShuffle]
    //   %6 = load(%2)
    //   %7 = br(%1, %6)
    // })

    var inst_buf: [6]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, 19);

    const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(shuffle.result_ty)).toRef();
    const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef();

    _ = main_block.addBinOp(l, .store, result_ptr, .fromValue(init_val));

    try l.addScalarizedShuffle(
        &main_block,
        shuffle.operand,
        result_ptr,
        index_ptr,
        out_idxs_buf[0..n],
        in_idxs_buf[0..n],
    );

    const result_val = main_block.addTyOp(l, .load, shuffle.result_ty, result_ptr).toRef();
    main_block.addBr(l, orig_inst, result_val);

    return .{ .ty_pl = .{
        .ty = .fromType(shuffle.result_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}
fn scalarizeShuffleTwoBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;
    const gpa = zcu.gpa;

    const shuffle = l.getTmpAir().unwrapShuffleTwo(zcu, orig_inst);

    // We're going to emit something like this:
    //
    //   var x: @Vector(N, T) = undefined;
    //   for (out_idxs_a, in_idxs_a) |i, j| x[i] = operand_a[j];
    //   for (out_idxs_b, in_idxs_b) |i, j| x[i] = operand_b[j];
    //
    // The AIR will look like this:
    //
    //   %1 = block(@Vector(N, T), {
    //     %2 = alloc(*@Vector(N, T))
    //     %3 = alloc(*usize)
    //     %4 = store(%2, <@Vector(N, T), undefined>)
    //     %5 = [addScalarizedShuffle]
    //     %6 = [addScalarizedShuffle]
    //     %7 = load(%2)
    //     %8 = br(%1, %7)
    //   })

    var sfba_state = std.heap.stackFallback(512, gpa);
    const sfba = sfba_state.get();

    const out_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len);
    defer sfba.free(out_idxs_buf);

    const in_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len);
    defer sfba.free(in_idxs_buf);

    // Iterate `shuffle.mask` before doing anything, because modifying AIR invalidates it.
    const out_idxs_a, const in_idxs_a, const out_idxs_b, const in_idxs_b = idxs: {
        var n: usize = 0;
        for (shuffle.mask, 0..) |mask, out_idx| switch (mask.unwrap()) {
            .undef, .b_elem => {},
            .a_elem => |in_idx| {
                out_idxs_buf[n] = (try pt.intValue(.usize, out_idx)).toIntern();
                in_idxs_buf[n] = (try pt.intValue(.usize, in_idx)).toIntern();
                n += 1;
            },
        };
        const a_len = n;
        for (shuffle.mask, 0..) |mask, out_idx| switch (mask.unwrap()) {
            .undef, .a_elem => {},
            .b_elem => |in_idx| {
                out_idxs_buf[n] = (try pt.intValue(.usize, out_idx)).toIntern();
                in_idxs_buf[n] = (try pt.intValue(.usize, in_idx)).toIntern();
1241                n += 1;
1242            },
1243        };
1244        break :idxs .{
1245            out_idxs_buf[0..a_len],
1246            in_idxs_buf[0..a_len],
1247            out_idxs_buf[a_len..n],
1248            in_idxs_buf[a_len..n],
1249        };
1250    };
1251
1252    var inst_buf: [7]Air.Inst.Index = undefined;
1253    var main_block: Block = .init(&inst_buf);
1254    try l.air_instructions.ensureUnusedCapacity(gpa, 33);
1255
1256    const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(shuffle.result_ty)).toRef();
1257    const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef();
1258
1259    _ = main_block.addBinOp(l, .store, result_ptr, .fromValue(try pt.undefValue(shuffle.result_ty)));
1260
1261    if (out_idxs_a.len == 0) {
1262        _ = main_block.stealCapacity(1);
1263    } else {
1264        try l.addScalarizedShuffle(
1265            &main_block,
1266            shuffle.operand_a,
1267            result_ptr,
1268            index_ptr,
1269            out_idxs_a,
1270            in_idxs_a,
1271        );
1272    }
1273
1274    if (out_idxs_b.len == 0) {
1275        _ = main_block.stealCapacity(1);
1276    } else {
1277        try l.addScalarizedShuffle(
1278            &main_block,
1279            shuffle.operand_b,
1280            result_ptr,
1281            index_ptr,
1282            out_idxs_b,
1283            in_idxs_b,
1284        );
1285    }
1286
1287    const result_val = main_block.addTyOp(l, .load, shuffle.result_ty, result_ptr).toRef();
1288    main_block.addBr(l, orig_inst, result_val);
1289
1290    return .{ .ty_pl = .{
1291        .ty = .fromType(shuffle.result_ty),
1292        .payload = try l.addBlockBody(main_block.body()),
1293    } };
1294}
1295/// Adds code to `parent_block` which behaves like this loop:
1296///
1297///   for (out_idxs, in_idxs) |i, j| result_vec_ptr[i] = operand_vec[j];
1298///
1299/// The actual AIR adds exactly one instruction to `parent_block` itself and 14 instructions
1300/// overall, and is as follows:
1301///
1302///   %1 = block(void, {
1303///     %2 = store(index_ptr, @zero_usize)
1304///     %3 = loop({
1305///       %4 = load(index_ptr)
1306///       %5 = ptr_elem_val(out_idxs_ptr, %4)
1307///       %6 = ptr_elem_val(in_idxs_ptr, %4)
1308///       %7 = legalize_vec_elem_val(operand_vec, %6)
1309///       %8 = legalize_vec_store_elem(result_vec_ptr, %4, %7)
1310///       %9 = cmp_eq(%4, <usize, out_idxs.len-1>)
1311///       %10 = cond_br(%9, {
1312///         %11 = br(%1, @void_value)
1313///       }, {
1314///         %12 = add(%4, @one_usize)
1315///         %13 = store(index_ptr, %12)
1316///         %14 = repeat(%3)
1317///       })
1318///     })
1319///   })
1320///
1321/// The caller is responsible for reserving space in `l.air_instructions`.
1322fn addScalarizedShuffle(
1323    l: *Legalize,
1324    parent_block: *Block,
1325    operand_vec: Air.Inst.Ref,
1326    result_vec_ptr: Air.Inst.Ref,
1327    index_ptr: Air.Inst.Ref,
1328    out_idxs: []const InternPool.Index,
1329    in_idxs: []const InternPool.Index,
1330) Error!void {
1331    const pt = l.pt;
1332
1333    assert(out_idxs.len == in_idxs.len);
1334    const n = out_idxs.len;
1335
1336    const idxs_ty = try pt.arrayType(.{ .len = n, .child = .usize_type });
1337    const idxs_ptr_ty = try pt.singleConstPtrType(idxs_ty);
1338    const manyptr_usize_ty = try pt.manyConstPtrType(.usize);
1339
1340    const out_idxs_ptr = try pt.intern(.{ .ptr = .{
1341        .ty = manyptr_usize_ty.toIntern(),
1342        .base_addr = .{ .uav = .{
1343            .val = (try pt.aggregateValue(idxs_ty, out_idxs)).toIntern(),
1344            .orig_ty = idxs_ptr_ty.toIntern(),
1345        } },
1346        .byte_offset = 0,
1347    } });
1348    const in_idxs_ptr = try pt.intern(.{ .ptr = .{
1349        .ty = manyptr_usize_ty.toIntern(),
1350        .base_addr = .{ .uav = .{
1351            .val = (try pt.aggregateValue(idxs_ty, in_idxs)).toIntern(),
1352            .orig_ty = idxs_ptr_ty.toIntern(),
1353        } },
1354        .byte_offset = 0,
1355    } });
1356
1357    const main_block_inst = parent_block.add(l, .{
1358        .tag = .block,
1359        .data = .{ .ty_pl = .{
1360            .ty = .void_type,
1361            .payload = undefined,
1362        } },
1363    });
1364
1365    var inst_buf: [13]Air.Inst.Index = undefined;
1366    var main_block: Block = .init(&inst_buf);
1367
1368    _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize);
1369
1370    var loop: Loop = .init(l, &main_block);
1371    loop.block = .init(main_block.stealRemainingCapacity());
1372
1373    const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
1374    const in_idx_val = loop.block.addBinOp(l, .ptr_elem_val, .fromIntern(in_idxs_ptr), index_val).toRef();
1375    const out_idx_val = loop.block.addBinOp(l, .ptr_elem_val, .fromIntern(out_idxs_ptr), index_val).toRef();
1376
1377    const elem_val = loop.block.addBinOp(l, .legalize_vec_elem_val, operand_vec, in_idx_val).toRef();
1378    _ = loop.block.add(l, .{
1379        .tag = .legalize_vec_store_elem,
1380        .data = .{ .pl_op = .{
1381            .operand = result_vec_ptr,
1382            .payload = try l.addExtra(Air.Bin, .{
1383                .lhs = out_idx_val,
1384                .rhs = elem_val,
1385            }),
1386        } },
1387    });
1388
1389    const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, n - 1))).toRef();
1390    var condbr: CondBr = .init(l, is_end_val, &loop.block, .{});
1391    condbr.then_block = .init(loop.block.stealRemainingCapacity());
1392    condbr.then_block.addBr(l, main_block_inst, .void_value);
1393
1394    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
1395    const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef();
1396    _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val);
1397    _ = condbr.else_block.add(l, .{
1398        .tag = .repeat,
1399        .data = .{ .repeat = .{ .loop_inst = loop.inst } },
1400    });
1401
1402    try condbr.finish(l);
1403    try loop.finish(l);
1404
1405    const inst_data = l.air_instructions.items(.data);
1406    inst_data[@intFromEnum(main_block_inst)].ty_pl.payload = try l.addBlockBody(main_block.body());
1407}
1408fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?Air.Inst.Data {
1409    const pt = l.pt;
1410    const zcu = pt.zcu;
1411
1412    const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op;
1413
1414    const dest_ty = ty_op.ty.toType();
1415    const dest_legal = switch (dest_ty.zigTypeTag(zcu)) {
1416        else => true,
1417        .array, .vector => legal: {
1418            if (dest_ty.arrayLen(zcu) == 1) break :legal true;
1419            const dest_elem_ty = dest_ty.childType(zcu);
1420            break :legal dest_elem_ty.bitSize(zcu) == 8 * dest_elem_ty.abiSize(zcu);
1421        },
1422    };
1423
1424    const operand_ty = l.typeOf(ty_op.operand);
1425    const operand_legal = switch (operand_ty.zigTypeTag(zcu)) {
1426        else => true,
1427        .array, .vector => legal: {
1428            if (operand_ty.arrayLen(zcu) == 1) break :legal true;
1429            const operand_elem_ty = operand_ty.childType(zcu);
1430            break :legal operand_elem_ty.bitSize(zcu) == 8 * operand_elem_ty.abiSize(zcu);
1431        },
1432    };
1433
1434    if (dest_legal and operand_legal) return null;
1435
1436    if (!operand_legal and !dest_legal and operand_ty.arrayLen(zcu) == dest_ty.arrayLen(zcu)) {
1437        // from_ty and to_ty are both arrays or vectors of types with the same bit size,
1438        // so we can do an elementwise bitcast.
1439        return try l.scalarizeBlockPayload(orig_inst, .ty_op);
1440    }
1441
1442    // Fallback path. Our strategy is to use an unsigned integer type as an intermediate
1443    // "bag of bits" representation which can be manipulated by bitwise operations.
1444
1445    const num_bits: u16 = @intCast(dest_ty.bitSize(zcu));
1446    assert(operand_ty.bitSize(zcu) == num_bits);
1447    const uint_ty = try pt.intType(.unsigned, num_bits);
1448    const shift_ty = try pt.intType(.unsigned, std.math.log2_int_ceil(u16, num_bits));
1449
1450    var inst_buf: [39]Air.Inst.Index = undefined;
1451    var main_block: Block = .init(&inst_buf);
1452    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
1453
1454    // First, convert `operand_ty` to `uint_ty` (`uN`).
1455
1456    const uint_val: Air.Inst.Ref = uint_val: {
1457        if (operand_legal) {
1458            _ = main_block.stealCapacity(19);
1459            break :uint_val main_block.addBitCast(l, uint_ty, ty_op.operand);
1460        }
1461
1462        // %1 = block({
1463        //   %2 = alloc(*usize)
1464        //   %3 = alloc(*uN)
1465        //   %4 = store(%2, <usize, operand_len>)
1466        //   %5 = store(%3, <uN, 0>)
1467        //   %6 = loop({
1468        //     %7 = load(%2)
1469        //     %8 = array_elem_val(orig_operand, %7)
1470        //     %9 = bitcast(uE, %8)
1471        //     %10 = intcast(uN, %9)
1472        //     %11 = load(%3)
1473        //     %12 = shl_exact(%11, <uS, E>)
1474        //     %13 = bit_or(%12, %10)
1475        //     %14 = cmp_eq(%4, @zero_usize)
1476        //     %15 = cond_br(%14, {
1477        //       %16 = br(%1, %13)
1478        //     }, {
1479        //       %17 = store(%3, %13)
1480        //       %18 = sub(%7, @one_usize)
1481        //       %19 = store(%2, %18)
1482        //       %20 = repeat(%6)
1483        //     })
1484        //   })
1485        // })
1486
1487        const elem_bits = operand_ty.childType(zcu).bitSize(zcu);
1488        const elem_bits_val = try pt.intValue(shift_ty, elem_bits);
1489        const elem_uint_ty = try pt.intType(.unsigned, @intCast(elem_bits));
1490
1491        const uint_block_inst = main_block.add(l, .{
1492            .tag = .block,
1493            .data = .{ .ty_pl = .{
1494                .ty = .fromType(uint_ty),
1495                .payload = undefined,
1496            } },
1497        });
1498        var uint_block: Block = .init(main_block.stealCapacity(19));
1499
1500        const index_ptr = uint_block.addTy(l, .alloc, .ptr_usize).toRef();
1501        const result_ptr = uint_block.addTy(l, .alloc, try pt.singleMutPtrType(uint_ty)).toRef();
1502        _ = uint_block.addBinOp(
1503            l,
1504            .store,
1505            index_ptr,
1506            .fromValue(try pt.intValue(.usize, operand_ty.arrayLen(zcu))),
1507        );
1508        _ = uint_block.addBinOp(l, .store, result_ptr, .fromValue(try pt.intValue(uint_ty, 0)));
1509
1510        var loop: Loop = .init(l, &uint_block);
1511        loop.block = .init(uint_block.stealRemainingCapacity());
1512
1513        const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
1514        const raw_elem = loop.block.addBinOp(
1515            l,
1516            if (operand_ty.zigTypeTag(zcu) == .vector) .legalize_vec_elem_val else .array_elem_val,
1517            ty_op.operand,
1518            index_val,
1519        ).toRef();
1520        const elem_uint = loop.block.addBitCast(l, elem_uint_ty, raw_elem);
1521        const elem_extended = loop.block.addTyOp(l, .intcast, uint_ty, elem_uint).toRef();
1522        const old_result = loop.block.addTyOp(l, .load, uint_ty, result_ptr).toRef();
1523        const shifted_result = loop.block.addBinOp(l, .shl_exact, old_result, .fromValue(elem_bits_val)).toRef();
1524        const new_result = loop.block.addBinOp(l, .bit_or, shifted_result, elem_extended).toRef();
1525
1526        const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .zero_usize).toRef();
1527        var condbr: CondBr = .init(l, is_end_val, &loop.block, .{});
1528
1529        condbr.then_block = .init(loop.block.stealRemainingCapacity());
1530        condbr.then_block.addBr(l, uint_block_inst, new_result);
1531
1532        condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
1533        _ = condbr.else_block.addBinOp(l, .store, result_ptr, new_result);
1534        const new_index_val = condbr.else_block.addBinOp(l, .sub, index_val, .one_usize).toRef();
1535        _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val);
1536        _ = condbr.else_block.add(l, .{
1537            .tag = .repeat,
1538            .data = .{ .repeat = .{ .loop_inst = loop.inst } },
1539        });
1540
1541        try condbr.finish(l);
1542        try loop.finish(l);
1543
1544        const inst_data = l.air_instructions.items(.data);
1545        inst_data[@intFromEnum(uint_block_inst)].ty_pl.payload = try l.addBlockBody(uint_block.body());
1546
1547        break :uint_val uint_block_inst.toRef();
1548    };
1549
1550    // Now convert `uint_ty` (`uN`) to `dest_ty`.
1551
1552    if (dest_legal) {
1553        _ = main_block.stealCapacity(17);
1554        const result = main_block.addBitCast(l, dest_ty, uint_val);
1555        main_block.addBr(l, orig_inst, result);
1556    } else {
1557        // %1 = alloc(*usize)
1558        // %2 = alloc(*@Vector(N, Result))
1559        // %3 = store(%1, @zero_usize)
1560        // %4 = loop({
1561        //   %5 = load(%1)
1562        //   %6 = mul(%5, <usize, E>)
1563        //   %7 = intcast(uS, %6)
1564        //   %8 = shr(uint_val, %7)
1565        //   %9 = trunc(uE, %8)
1566        //   %10 = bitcast(Result, %9)
1567        //   %11 = legalize_vec_store_elem(%2, %5, %10)
1568        //   %12 = cmp_eq(%5, <usize, vec_len>)
1569        //   %13 = cond_br(%12, {
1570        //     %14 = load(%2)
1571        //     %15 = br(%0, %14)
1572        //   }, {
1573        //     %16 = add(%5, @one_usize)
1574        //     %17 = store(%1, %16)
1575        //     %18 = repeat(%4)
1576        //   })
1577        // })
1578        //
1579        // The result might be an array, in which case `legalize_vec_store_elem`
1580        // becomes `ptr_elem_ptr` followed by `store`.
1581
1582        const elem_ty = dest_ty.childType(zcu);
1583        const elem_bits = elem_ty.bitSize(zcu);
1584        const elem_uint_ty = try pt.intType(.unsigned, @intCast(elem_bits));
1585
1586        const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef();
1587        const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(dest_ty)).toRef();
1588        _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize);
1589
1590        var loop: Loop = .init(l, &main_block);
1591        loop.block = .init(main_block.stealRemainingCapacity());
1592
1593        const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
1594        const bit_offset = loop.block.addBinOp(l, .mul, index_val, .fromValue(try pt.intValue(.usize, elem_bits))).toRef();
1595        const casted_bit_offset = loop.block.addTyOp(l, .intcast, shift_ty, bit_offset).toRef();
1596        const shifted_uint = loop.block.addBinOp(l, .shr, index_val, casted_bit_offset).toRef();
1597        const elem_uint = loop.block.addTyOp(l, .trunc, elem_uint_ty, shifted_uint).toRef();
1598        const elem_val = loop.block.addBitCast(l, elem_ty, elem_uint);
1599        switch (dest_ty.zigTypeTag(zcu)) {
1600            .array => {
1601                const elem_ptr = loop.block.add(l, .{
1602                    .tag = .ptr_elem_ptr,
1603                    .data = .{ .ty_pl = .{
1604                        .ty = .fromType(try pt.singleMutPtrType(elem_ty)),
1605                        .payload = try l.addExtra(Air.Bin, .{
1606                            .lhs = result_ptr,
1607                            .rhs = index_val,
1608                        }),
1609                    } },
1610                }).toRef();
1611                _ = loop.block.addBinOp(l, .store, elem_ptr, elem_val);
1612            },
1613            .vector => {
1614                _ = loop.block.add(l, .{
1615                    .tag = .legalize_vec_store_elem,
1616                    .data = .{ .pl_op = .{
1617                        .operand = result_ptr,
1618                        .payload = try l.addExtra(Air.Bin, .{
1619                            .lhs = index_val,
1620                            .rhs = elem_val,
1621                        }),
1622                    } },
1623                });
1624                _ = loop.block.stealCapacity(1);
1625            },
1626            else => unreachable,
1627        }
1628
1629        const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, dest_ty.arrayLen(zcu) - 1))).toRef();
1630
1631        var condbr: CondBr = .init(l, is_end_val, &loop.block, .{});
1632
1633        condbr.then_block = .init(loop.block.stealRemainingCapacity());
1634        const result_val = condbr.then_block.addTyOp(l, .load, dest_ty, result_ptr).toRef();
1635        condbr.then_block.addBr(l, orig_inst, result_val);
1636
1637        condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
1638        const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef();
1639        _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val);
1640        _ = condbr.else_block.add(l, .{
1641            .tag = .repeat,
1642            .data = .{ .repeat = .{ .loop_inst = loop.inst } },
1643        });
1644
1645        try condbr.finish(l);
1646        try loop.finish(l);
1647    }
1648
1649    return .{ .ty_pl = .{
1650        .ty = .fromType(dest_ty),
1651        .payload = try l.addBlockBody(main_block.body()),
1652    } };
1653}
1654fn scalarizeOverflowBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
1655    const pt = l.pt;
1656    const zcu = pt.zcu;
1657
1658    const orig = l.air_instructions.get(@intFromEnum(orig_inst));
1659    const orig_operands = l.extraData(Air.Bin, orig.data.ty_pl.payload).data;
1660
1661    const vec_tuple_ty = l.typeOfIndex(orig_inst);
1662    const vec_int_ty = vec_tuple_ty.fieldType(0, zcu);
1663    const vec_overflow_ty = vec_tuple_ty.fieldType(1, zcu);
1664
1665    assert(l.typeOf(orig_operands.lhs).toIntern() == vec_int_ty.toIntern());
1666    if (orig.tag != .shl_with_overflow) {
1667        assert(l.typeOf(orig_operands.rhs).toIntern() == vec_int_ty.toIntern());
1668    }
1669
1670    const scalar_int_ty = vec_int_ty.childType(zcu);
1671    const scalar_tuple_ty = try pt.overflowArithmeticTupleType(scalar_int_ty);
1672
1673    // %1 = block(struct { @Vector(N, Int), @Vector(N, u1) }, {
1674    //   %2 = alloc(*usize)
1675    //   %3 = alloc(*struct { @Vector(N, Int), @Vector(N, u1) })
1676    //   %4 = struct_field_ptr_index_0(*@Vector(N, Int), %3)
1677    //   %5 = struct_field_ptr_index_1(*@Vector(N, u1), %3)
1678    //   %6 = store(%2, @zero_usize)
1679    //   %7 = loop({
1680    //     %8 = load(%2)
1681    //     %9 = legalize_vec_elem_val(orig_lhs, %8)
1682    //     %10 = legalize_vec_elem_val(orig_rhs, %8)
1683    //     %11 = ???_with_overflow(struct { Int, u1 }, %9, %10)
1684    //     %12 = struct_field_val(%11, 0)
1685    //     %13 = struct_field_val(%11, 1)
1686    //     %14 = legalize_vec_store_elem(%4, %8, %12)
1687    //     %15 = legalize_vec_store_elem(%4, %8, %13)
1688    //     %16 = cmp_eq(%8, <usize, N-1>)
1689    //     %17 = cond_br(%16, {
1690    //       %18 = load(%3)
1691    //       %19 = br(%1, %18)
1692    //     }, {
1693    //       %20 = add(%8, @one_usize)
1694    //       %21 = store(%2, %20)
1695    //       %22 = repeat(%7)
1696    //     })
1697    //   })
1698    // })
1699
1700    const elems_len = vec_int_ty.vectorLen(zcu);
1701
1702    var inst_buf: [21]Air.Inst.Index = undefined;
1703    var main_block: Block = .init(&inst_buf);
1704    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
1705
1706    const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef();
1707    const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(vec_tuple_ty)).toRef();
1708    const result_int_ptr = main_block.addTyOp(
1709        l,
1710        .struct_field_ptr_index_0,
1711        try pt.singleMutPtrType(vec_int_ty),
1712        result_ptr,
1713    ).toRef();
1714    const result_overflow_ptr = main_block.addTyOp(
1715        l,
1716        .struct_field_ptr_index_1,
1717        try pt.singleMutPtrType(vec_overflow_ty),
1718        result_ptr,
1719    ).toRef();
1720
1721    _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize);
1722
1723    var loop: Loop = .init(l, &main_block);
1724    loop.block = .init(main_block.stealRemainingCapacity());
1725
1726    const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
1727    const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operands.lhs, index_val).toRef();
1728    const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operands.rhs, index_val).toRef();
1729    const elem_result = loop.block.add(l, .{
1730        .tag = orig.tag,
1731        .data = .{ .ty_pl = .{
1732            .ty = .fromType(scalar_tuple_ty),
1733            .payload = try l.addExtra(Air.Bin, .{ .lhs = lhs, .rhs = rhs }),
1734        } },
1735    }).toRef();
1736    const int_elem = loop.block.add(l, .{
1737        .tag = .struct_field_val,
1738        .data = .{ .ty_pl = .{
1739            .ty = .fromType(scalar_int_ty),
1740            .payload = try l.addExtra(Air.StructField, .{
1741                .struct_operand = elem_result,
1742                .field_index = 0,
1743            }),
1744        } },
1745    }).toRef();
1746    const overflow_elem = loop.block.add(l, .{
1747        .tag = .struct_field_val,
1748        .data = .{ .ty_pl = .{
1749            .ty = .u1_type,
1750            .payload = try l.addExtra(Air.StructField, .{
1751                .struct_operand = elem_result,
1752                .field_index = 1,
1753            }),
1754        } },
1755    }).toRef();
1756    _ = loop.block.add(l, .{
1757        .tag = .legalize_vec_store_elem,
1758        .data = .{ .pl_op = .{
1759            .operand = result_int_ptr,
1760            .payload = try l.addExtra(Air.Bin, .{
1761                .lhs = index_val,
1762                .rhs = int_elem,
1763            }),
1764        } },
1765    });
1766    _ = loop.block.add(l, .{
1767        .tag = .legalize_vec_store_elem,
1768        .data = .{ .pl_op = .{
1769            .operand = result_overflow_ptr,
1770            .payload = try l.addExtra(Air.Bin, .{
1771                .lhs = index_val,
1772                .rhs = overflow_elem,
1773            }),
1774        } },
1775    });
1776
1777    const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, elems_len - 1))).toRef();
1778    var condbr: CondBr = .init(l, is_end_val, &loop.block, .{});
1779
1780    condbr.then_block = .init(loop.block.stealRemainingCapacity());
1781    const result_val = condbr.then_block.addTyOp(l, .load, vec_tuple_ty, result_ptr).toRef();
1782    condbr.then_block.addBr(l, orig_inst, result_val);
1783
1784    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
1785    const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef();
1786    _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val);
1787    _ = condbr.else_block.add(l, .{
1788        .tag = .repeat,
1789        .data = .{ .repeat = .{ .loop_inst = loop.inst } },
1790    });
1791
1792    try condbr.finish(l);
1793    try loop.finish(l);
1794
1795    return .{ .ty_pl = .{
1796        .ty = .fromType(vec_tuple_ty),
1797        .payload = try l.addBlockBody(main_block.body()),
1798    } };
1799}
1800fn scalarizeReduceBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, optimized: bool) Error!Air.Inst.Data {
1801    const pt = l.pt;
1802    const zcu = pt.zcu;
1803
1804    const reduce = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].reduce;
1805
1806    const vector_ty = l.typeOf(reduce.operand);
1807    const scalar_ty = vector_ty.childType(zcu);
1808
1809    const ident_val: Value = switch (reduce.operation) {
1810        // identity for add is 0; identity for OR and XOR is all 0 bits
1811        .Or, .Xor, .Add => switch (scalar_ty.zigTypeTag(zcu)) {
1812            .int => try pt.intValue(scalar_ty, 0),
1813            .float => try pt.floatValue(scalar_ty, 0.0),
1814            else => unreachable,
1815        },
1816        // identity for multiplication is 1
1817        .Mul => switch (scalar_ty.zigTypeTag(zcu)) {
1818            .int => try pt.intValue(scalar_ty, 1),
1819            .float => try pt.floatValue(scalar_ty, 1.0),
1820            else => unreachable,
1821        },
1822        // identity for AND is all 1 bits
1823        .And => switch (scalar_ty.intInfo(zcu).signedness) {
1824            .unsigned => try scalar_ty.maxIntScalar(pt, scalar_ty),
1825            .signed => try pt.intValue(scalar_ty, -1),
1826        },
1827        // identity for @min is maximum value
1828        .Min => switch (scalar_ty.zigTypeTag(zcu)) {
1829            .int => try scalar_ty.maxIntScalar(pt, scalar_ty),
1830            .float => try pt.floatValue(scalar_ty, std.math.inf(f32)),
1831            else => unreachable,
1832        },
1833        // identity for @max is minimum value
1834        .Max => switch (scalar_ty.zigTypeTag(zcu)) {
1835            .int => try scalar_ty.minIntScalar(pt, scalar_ty),
1836            .float => try pt.floatValue(scalar_ty, -std.math.inf(f32)),
1837            else => unreachable,
1838        },
1839    };
1840
1841    const op_tag: Air.Inst.Tag = switch (reduce.operation) {
1842        .Or => .bit_or,
1843        .And => .bit_and,
1844        .Xor => .xor,
1845        .Min => .min,
1846        .Max => .max,
1847        .Add => switch (scalar_ty.zigTypeTag(zcu)) {
1848            .int => .add_wrap,
1849            .float => if (optimized) .add_optimized else .add,
1850            else => unreachable,
1851        },
1852        .Mul => switch (scalar_ty.zigTypeTag(zcu)) {
1853            .int => .mul_wrap,
1854            .float => if (optimized) .mul_optimized else .mul,
1855            else => unreachable,
1856        },
1857    };
1858
1859    // %1 = block(Scalar, {
1860    //   %2 = alloc(*usize)
1861    //   %3 = alloc(*Scalar)
1862    //   %4 = store(%2, @zero_usize)
1863    //   %5 = store(%3, <Scalar, 0>)  // or whatever the identity is for this operator
1864    //   %6 = loop({
1865    //     %7 = load(%2)
1866    //     %8 = legalize_vec_elem_val(orig_operand, %7)
1867    //     %9 = load(%3)
1868    //     %10 = add(%8, %9)  // or whatever the operator is
1869    //     %11 = cmp_eq(%7, <usize, N-1>)
1870    //     %12 = cond_br(%11, {
1871    //       %13 = br(%1, %10)
1872    //     }, {
1873    //       %14 = store(%3, %10)
1874    //       %15 = add(%7, @one_usize)
1875    //       %16 = store(%2, %15)
1876    //       %17 = repeat(%6)
1877    //     })
1878    //   })
1879    // })
1880
1881    var inst_buf: [16]Air.Inst.Index = undefined;
1882    var main_block: Block = .init(&inst_buf);
1883    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
1884
1885    const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef();
1886    const accum_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(scalar_ty)).toRef();
1887    _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize);
1888    _ = main_block.addBinOp(l, .store, accum_ptr, .fromValue(ident_val));
1889
1890    var loop: Loop = .init(l, &main_block);
1891    loop.block = .init(main_block.stealRemainingCapacity());
1892
1893    const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef();
1894    const elem_val = loop.block.addBinOp(l, .legalize_vec_elem_val, reduce.operand, index_val).toRef();
1895    const old_accum = loop.block.addTyOp(l, .load, scalar_ty, accum_ptr).toRef();
1896    const new_accum = loop.block.addBinOp(l, op_tag, old_accum, elem_val).toRef();
1897
1898    const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, vector_ty.vectorLen(zcu) - 1))).toRef();
1899
1900    var condbr: CondBr = .init(l, is_end_val, &loop.block, .{});
1901
1902    condbr.then_block = .init(loop.block.stealRemainingCapacity());
1903    condbr.then_block.addBr(l, orig_inst, new_accum);
1904
1905    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
1906    _ = condbr.else_block.addBinOp(l, .store, accum_ptr, new_accum);
1907    const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef();
1908    _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val);
1909    _ = condbr.else_block.add(l, .{
1910        .tag = .repeat,
1911        .data = .{ .repeat = .{ .loop_inst = loop.inst } },
1912    });
1913
1914    try condbr.finish(l);
1915    try loop.finish(l);
1916
1917    return .{ .ty_pl = .{
1918        .ty = .fromType(scalar_ty),
1919        .payload = try l.addBlockBody(main_block.body()),
1920    } };
1921}
1922
1923fn safeIntcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
1924    const pt = l.pt;
1925    const zcu = pt.zcu;
1926    const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op;
1927
1928    const operand_ref = ty_op.operand;
1929    const operand_ty = l.typeOf(operand_ref);
1930    const dest_ty = ty_op.ty.toType();
1931
1932    const is_vector = operand_ty.zigTypeTag(zcu) == .vector;
1933    const operand_scalar_ty = operand_ty.scalarType(zcu);
1934    const dest_scalar_ty = dest_ty.scalarType(zcu);
1935
1936    assert(operand_scalar_ty.zigTypeTag(zcu) == .int);
1937    const dest_is_enum = switch (dest_scalar_ty.zigTypeTag(zcu)) {
1938        .int => false,
1939        .@"enum" => true,
1940        else => unreachable,
1941    };
1942
1943    const operand_info = operand_scalar_ty.intInfo(zcu);
1944    const dest_info = dest_scalar_ty.intInfo(zcu);
1945
1946    const have_min_check, const have_max_check = c: {
1947        const dest_pos_bits = dest_info.bits - @intFromBool(dest_info.signedness == .signed);
1948        const operand_pos_bits = operand_info.bits - @intFromBool(operand_info.signedness == .signed);
1949        const dest_allows_neg = dest_info.signedness == .signed and dest_info.bits > 0;
1950        const operand_allows_neg = operand_info.signedness == .signed and operand_info.bits > 0;
1951        break :c .{
1952            operand_allows_neg and (!dest_allows_neg or dest_info.bits < operand_info.bits),
1953            dest_pos_bits < operand_pos_bits,
1954        };
1955    };
1956
1957    // The worst-case scenario in terms of total instructions and total condbrs is the case where
1958    // the result type is an exhaustive enum whose tag type is smaller than the operand type:
1959    //
1960    // %x = block({
1961    //   %1 = cmp_lt(%y, @min_allowed_int)
1962    //   %2 = cmp_gt(%y, @max_allowed_int)
1963    //   %3 = bool_or(%1, %2)
1964    //   %4 = cond_br(%3, {
1965    //     %5 = call(@panic.invalidEnumValue, [])
1966    //     %6 = unreach()
1967    //   }, {
1968    //     %7 = intcast(@res_ty, %y)
1969    //     %8 = is_named_enum_value(%7)
1970    //     %9 = cond_br(%8, {
1971    //       %10 = br(%x, %7)
1972    //     }, {
1973    //       %11 = call(@panic.invalidEnumValue, [])
1974    //       %12 = unreach()
1975    //     })
1976    //   })
1977    // })
1978    //
1979    // Note that vectors of enums don't exist -- the worst case for vectors is this:
1980    //
1981    // %x = block({
1982    //   %1 = cmp_lt(%y, @min_allowed_int)
1983    //   %2 = cmp_gt(%y, @max_allowed_int)
1984    //   %3 = bool_or(%1, %2)
1985    //   %4 = reduce(%3, .@"or")
1986    //   %5 = cond_br(%4, {
1987    //     %6 = call(@panic.invalidEnumValue, [])
1988    //     %7 = unreach()
1989    //   }, {
1990    //     %8 = intcast(@res_ty, %y)
1991    //     %9 = br(%x, %8)
1992    //   })
1993    // })
1994
1995    var inst_buf: [12]Air.Inst.Index = undefined;
1996    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
1997    var condbr_buf: [2]CondBr = undefined;
1998    var condbr_idx: usize = 0;
1999
2000    var main_block: Block = .init(&inst_buf);
2001    var cur_block: *Block = &main_block;
2002
2003    const panic_id: Zcu.SimplePanicId = if (dest_is_enum) .invalid_enum_value else .integer_out_of_bounds;
2004
2005    if (have_min_check or have_max_check) {
2006        const dest_int_ty = if (dest_is_enum) dest_ty.intTagType(zcu) else dest_ty;
2007        const condbr = &condbr_buf[condbr_idx];
2008        condbr_idx += 1;
2009        const below_min_inst: Air.Inst.Index = if (have_min_check) inst: {
2010            const min_val_ref = Air.internedToRef((try dest_int_ty.minInt(pt, operand_ty)).toIntern());
2011            break :inst try cur_block.addCmp(l, .lt, operand_ref, min_val_ref, .{ .vector = is_vector });
2012        } else undefined;
2013        const above_max_inst: Air.Inst.Index = if (have_max_check) inst: {
2014            const max_val_ref = Air.internedToRef((try dest_int_ty.maxInt(pt, operand_ty)).toIntern());
2015            break :inst try cur_block.addCmp(l, .gt, operand_ref, max_val_ref, .{ .vector = is_vector });
2016        } else undefined;
2017        const out_of_range_inst: Air.Inst.Index = inst: {
2018            if (have_min_check and have_max_check) break :inst cur_block.add(l, .{
2019                .tag = .bool_or,
2020                .data = .{ .bin_op = .{
2021                    .lhs = below_min_inst.toRef(),
2022                    .rhs = above_max_inst.toRef(),
2023                } },
2024            });
2025            if (have_min_check) break :inst below_min_inst;
2026            if (have_max_check) break :inst above_max_inst;
2027            unreachable;
2028        };
2029        const scalar_out_of_range_inst: Air.Inst.Index = if (is_vector) cur_block.add(l, .{
2030            .tag = .reduce,
2031            .data = .{ .reduce = .{
2032                .operand = out_of_range_inst.toRef(),
2033                .operation = .Or,
2034            } },
2035        }) else out_of_range_inst;
2036        condbr.* = .init(l, scalar_out_of_range_inst.toRef(), cur_block, .{ .true = .cold });
2037        condbr.then_block = .init(cur_block.stealRemainingCapacity());
2038        try condbr.then_block.addPanic(l, panic_id);
2039        condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
2040        cur_block = &condbr.else_block;
2041    }
2042
2043    // Now we know we're in-range, we can intcast:
2044    const cast_inst = cur_block.add(l, .{
2045        .tag = .intcast,
2046        .data = .{ .ty_op = .{
2047            .ty = Air.internedToRef(dest_ty.toIntern()),
2048            .operand = operand_ref,
2049        } },
2050    });
2051    // For ints we're already done, but for exhaustive enums we must check this is a valid tag.
2052    if (dest_is_enum and !dest_ty.isNonexhaustiveEnum(zcu) and zcu.backendSupportsFeature(.is_named_enum_value)) {
2053        assert(!is_vector); // vectors of enums don't exist
2054        // We are building this:
2055        //   %1 = is_named_enum_value(%cast_inst)
2056        //   %2 = cond_br(%1, {
2057        //     <new cursor>
2058        //   }, {
2059        //     <panic>
2060        //   })
2061        const is_named_inst = cur_block.add(l, .{
2062            .tag = .is_named_enum_value,
2063            .data = .{ .un_op = cast_inst.toRef() },
2064        });
2065        const condbr = &condbr_buf[condbr_idx];
2066        condbr_idx += 1;
2067        condbr.* = .init(l, is_named_inst.toRef(), cur_block, .{ .false = .cold });
2068        condbr.else_block = .init(cur_block.stealRemainingCapacity());
2069        try condbr.else_block.addPanic(l, panic_id);
2070        condbr.then_block = .init(condbr.else_block.stealRemainingCapacity());
2071        cur_block = &condbr.then_block;
2072    }
2073    // Finally, just `br` to our outer `block`.
2074    _ = cur_block.add(l, .{
2075        .tag = .br,
2076        .data = .{ .br = .{
2077            .block_inst = orig_inst,
2078            .operand = cast_inst.toRef(),
2079        } },
2080    });
2081    // We might not have used all of the instructions; that's intentional.
2082    _ = cur_block.stealRemainingCapacity();
2083
2084    for (condbr_buf[0..condbr_idx]) |*condbr| try condbr.finish(l);
2085    return .{ .ty_pl = .{
2086        .ty = Air.internedToRef(dest_ty.toIntern()),
2087        .payload = try l.addBlockBody(main_block.body()),
2088    } };
2089}
2090fn safeIntFromFloatBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, optimized: bool) Error!Air.Inst.Data {
2091    const pt = l.pt;
2092    const zcu = pt.zcu;
2093    const gpa = zcu.gpa;
2094    const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op;
2095
2096    const operand_ref = ty_op.operand;
2097    const operand_ty = l.typeOf(operand_ref);
2098    const dest_ty = ty_op.ty.toType();
2099
2100    const is_vector = operand_ty.zigTypeTag(zcu) == .vector;
2101    const dest_scalar_ty = dest_ty.scalarType(zcu);
2102    const int_info = dest_scalar_ty.intInfo(zcu);
2103
2104    // We emit 9 instructions in the worst case.
2105    var inst_buf: [9]Air.Inst.Index = undefined;
2106    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);
2107    var main_block: Block = .init(&inst_buf);
2108
2109    // This check is a bit annoying because of floating-point rounding and the fact that this
2110    // builtin truncates. We'll use a bigint for our calculations, because we need to construct
2111    // integers exceeding the bounds of the result integer type, and we need to convert it to a
2112    // float with a specific rounding mode to avoid errors.
2113    // Our bigint may exceed the twos complement limit by one, so add an extra limb.
2114    const limbs = try gpa.alloc(
2115        std.math.big.Limb,
2116        std.math.big.int.calcTwosCompLimbCount(int_info.bits) + 1,
2117    );
2118    defer gpa.free(limbs);
2119    var big: std.math.big.int.Mutable = .init(limbs, 0);
2120
2121    // Check if the operand is lower than `min_int` when truncated to an integer.
2122    big.setTwosCompIntLimit(.min, int_info.signedness, int_info.bits);
2123    const below_min_inst: Air.Inst.Index = if (!big.positive or big.eqlZero()) bad: {
2124        // `min_int <= 0`, so check for `x <= min_int - 1`.
2125        big.addScalar(big.toConst(), -1);
2126        // For `<=`, we must round the RHS down, so that this value is the first `x` which returns `true`.
2127        const limit_val = try floatFromBigIntVal(pt, is_vector, operand_ty, big.toConst(), .floor);
2128        break :bad try main_block.addCmp(l, .lte, operand_ref, Air.internedToRef(limit_val.toIntern()), .{
2129            .vector = is_vector,
2130            .optimized = optimized,
2131        });
2132    } else {
2133        // `min_int > 0`, which is currently impossible. It would become possible under #3806, in
2134        // which case we must detect `x < min_int`.
2135        unreachable;
2136    };
2137
2138    // Check if the operand is greater than `max_int` when truncated to an integer.
2139    big.setTwosCompIntLimit(.max, int_info.signedness, int_info.bits);
2140    const above_max_inst: Air.Inst.Index = if (big.positive or big.eqlZero()) bad: {
2141        // `max_int >= 0`, so check for `x >= max_int + 1`.
2142        big.addScalar(big.toConst(), 1);
2143        // For `>=`, we must round the RHS up, so that this value is the first `x` which returns `true`.
2144        const limit_val = try floatFromBigIntVal(pt, is_vector, operand_ty, big.toConst(), .ceil);
2145        break :bad try main_block.addCmp(l, .gte, operand_ref, Air.internedToRef(limit_val.toIntern()), .{
2146            .vector = is_vector,
2147            .optimized = optimized,
2148        });
2149    } else {
2150        // `max_int < 0`, which is currently impossible. It would become possible under #3806, in
2151        // which case we must detect `x > max_int`.
2152        unreachable;
2153    };
2154
2155    // Combine the conditions.
2156    const out_of_bounds_inst: Air.Inst.Index = main_block.add(l, .{
2157        .tag = .bool_or,
2158        .data = .{ .bin_op = .{
2159            .lhs = below_min_inst.toRef(),
2160            .rhs = above_max_inst.toRef(),
2161        } },
2162    });
2163    const scalar_out_of_bounds_inst: Air.Inst.Index = if (is_vector) main_block.add(l, .{
2164        .tag = .reduce,
2165        .data = .{ .reduce = .{
2166            .operand = out_of_bounds_inst.toRef(),
2167            .operation = .Or,
2168        } },
2169    }) else out_of_bounds_inst;
2170
2171    // Now emit the actual condbr. "true" will be safety panic. "false" will be "ok", meaning we do
2172    // the `int_from_float` and `br` the result to `orig_inst`.
2173    var condbr: CondBr = .init(l, scalar_out_of_bounds_inst.toRef(), &main_block, .{ .true = .cold });
2174    condbr.then_block = .init(main_block.stealRemainingCapacity());
2175    try condbr.then_block.addPanic(l, .integer_part_out_of_bounds);
2176    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
2177    const cast_inst = condbr.else_block.add(l, .{
2178        .tag = if (optimized) .int_from_float_optimized else .int_from_float,
2179        .data = .{ .ty_op = .{
2180            .ty = Air.internedToRef(dest_ty.toIntern()),
2181            .operand = operand_ref,
2182        } },
2183    });
2184    _ = condbr.else_block.add(l, .{
2185        .tag = .br,
2186        .data = .{ .br = .{
2187            .block_inst = orig_inst,
2188            .operand = cast_inst.toRef(),
2189        } },
2190    });
2191    _ = condbr.else_block.stealRemainingCapacity(); // we might not have used it all
2192    try condbr.finish(l);
2193
2194    return .{ .ty_pl = .{
2195        .ty = Air.internedToRef(dest_ty.toIntern()),
2196        .payload = try l.addBlockBody(main_block.body()),
2197    } };
2198}
2199fn safeArithmeticBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, overflow_op_tag: Air.Inst.Tag) Error!Air.Inst.Data {
2200    const pt = l.pt;
2201    const zcu = pt.zcu;
2202    const bin_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].bin_op;
2203
2204    const operand_ty = l.typeOf(bin_op.lhs);
2205    assert(l.typeOf(bin_op.rhs).toIntern() == operand_ty.toIntern());
2206    const is_vector = operand_ty.zigTypeTag(zcu) == .vector;
2207
2208    const overflow_tuple_ty = try pt.overflowArithmeticTupleType(operand_ty);
2209    const overflow_bits_ty = overflow_tuple_ty.fieldType(1, zcu);
2210
2211    // The worst-case scenario is a vector operand:
2212    //
2213    // %1 = add_with_overflow(%x, %y)
2214    // %2 = struct_field_val(%1, .@"1")
2215    // %3 = reduce(%2, .@"or")
2216    // %4 = bitcast(%3, @bool_type)
2217    // %5 = cond_br(%4, {
2218    //   %6 = call(@panic.integerOverflow, [])
2219    //   %7 = unreach()
2220    // }, {
2221    //   %8 = struct_field_val(%1, .@"0")
2222    //   %9 = br(%z, %8)
2223    // })
2224    var inst_buf: [9]Air.Inst.Index = undefined;
2225    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
2226
2227    var main_block: Block = .init(&inst_buf);
2228
2229    const overflow_op_inst = main_block.add(l, .{
2230        .tag = overflow_op_tag,
2231        .data = .{ .ty_pl = .{
2232            .ty = Air.internedToRef(overflow_tuple_ty.toIntern()),
2233            .payload = try l.addExtra(Air.Bin, .{
2234                .lhs = bin_op.lhs,
2235                .rhs = bin_op.rhs,
2236            }),
2237        } },
2238    });
2239    const overflow_bits_inst = main_block.add(l, .{
2240        .tag = .struct_field_val,
2241        .data = .{ .ty_pl = .{
2242            .ty = Air.internedToRef(overflow_bits_ty.toIntern()),
2243            .payload = try l.addExtra(Air.StructField, .{
2244                .struct_operand = overflow_op_inst.toRef(),
2245                .field_index = 1,
2246            }),
2247        } },
2248    });
2249    const any_overflow_bit_inst = if (is_vector) main_block.add(l, .{
2250        .tag = .reduce,
2251        .data = .{ .reduce = .{
2252            .operand = overflow_bits_inst.toRef(),
2253            .operation = .Or,
2254        } },
2255    }) else overflow_bits_inst;
2256    const any_overflow_inst = try main_block.addCmp(l, .eq, any_overflow_bit_inst.toRef(), .one_u1, .{});
2257
2258    var condbr: CondBr = .init(l, any_overflow_inst.toRef(), &main_block, .{ .true = .cold });
2259    condbr.then_block = .init(main_block.stealRemainingCapacity());
2260    try condbr.then_block.addPanic(l, .integer_overflow);
2261    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
2262
2263    const result_inst = condbr.else_block.add(l, .{
2264        .tag = .struct_field_val,
2265        .data = .{ .ty_pl = .{
2266            .ty = Air.internedToRef(operand_ty.toIntern()),
2267            .payload = try l.addExtra(Air.StructField, .{
2268                .struct_operand = overflow_op_inst.toRef(),
2269                .field_index = 0,
2270            }),
2271        } },
2272    });
2273    _ = condbr.else_block.add(l, .{
2274        .tag = .br,
2275        .data = .{ .br = .{
2276            .block_inst = orig_inst,
2277            .operand = result_inst.toRef(),
2278        } },
2279    });
2280    // We might not have used all of the instructions; that's intentional.
2281    _ = condbr.else_block.stealRemainingCapacity();
2282
2283    try condbr.finish(l);
2284    return .{ .ty_pl = .{
2285        .ty = Air.internedToRef(operand_ty.toIntern()),
2286        .payload = try l.addBlockBody(main_block.body()),
2287    } };
2288}
2289
2290fn packedLoadBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
2291    const pt = l.pt;
2292    const zcu = pt.zcu;
2293
2294    const orig_ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op;
2295    const res_ty = orig_ty_op.ty.toType();
2296    const res_int_ty = try pt.intType(.unsigned, @intCast(res_ty.bitSize(zcu)));
2297    const ptr_ty = l.typeOf(orig_ty_op.operand);
2298    const ptr_info = ptr_ty.ptrInfo(zcu);
2299    // This relies on a heap of possibly invalid assumptions to work around not knowing the actual backing type.
2300    const load_bits = 8 * ptr_info.packed_offset.host_size;
2301    const load_ty = try pt.intType(.unsigned, load_bits);
2302
2303    var inst_buf: [6]Air.Inst.Index = undefined;
2304    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
2305
2306    var res_block: Block = .init(&inst_buf);
2307    _ = res_block.add(l, .{
2308        .tag = .br,
2309        .data = .{ .br = .{
2310            .block_inst = orig_inst,
2311            .operand = res_block.addBitCast(l, res_ty, res_block.add(l, .{
2312                .tag = .trunc,
2313                .data = .{ .ty_op = .{
2314                    .ty = Air.internedToRef(res_int_ty.toIntern()),
2315                    .operand = res_block.add(l, .{
2316                        .tag = .shr,
2317                        .data = .{ .bin_op = .{
2318                            .lhs = res_block.add(l, .{
2319                                .tag = .load,
2320                                .data = .{ .ty_op = .{
2321                                    .ty = Air.internedToRef(load_ty.toIntern()),
2322                                    .operand = res_block.addBitCast(l, load_ptr_ty: {
2323                                        var load_ptr_info = ptr_info;
2324                                        load_ptr_info.child = load_ty.toIntern();
2325                                        load_ptr_info.flags.vector_index = .none;
2326                                        load_ptr_info.packed_offset = .{ .host_size = 0, .bit_offset = 0 };
2327                                        break :load_ptr_ty try pt.ptrType(load_ptr_info);
2328                                    }, orig_ty_op.operand),
2329                                } },
2330                            }).toRef(),
2331                            .rhs = try pt.intRef(
2332                                try pt.intType(.unsigned, std.math.log2_int_ceil(u16, load_bits)),
2333                                ptr_info.packed_offset.bit_offset,
2334                            ),
2335                        } },
2336                    }).toRef(),
2337                } },
2338            }).toRef()),
2339        } },
2340    });
2341    return .{ .ty_pl = .{
2342        .ty = Air.internedToRef(res_ty.toIntern()),
2343        .payload = try l.addBlockBody(res_block.body()),
2344    } };
2345}
2346fn packedStoreBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
2347    const pt = l.pt;
2348    const zcu = pt.zcu;
2349
2350    const orig_bin_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].bin_op;
2351    const ptr_ty = l.typeOf(orig_bin_op.lhs);
2352    const ptr_info = ptr_ty.ptrInfo(zcu);
2353    const operand_ty = l.typeOf(orig_bin_op.rhs);
2354    const operand_bits: u16 = @intCast(operand_ty.bitSize(zcu));
2355    const operand_int_ty = try pt.intType(.unsigned, operand_bits);
2356    // This relies on a heap of possibly invalid assumptions to work around not knowing the actual backing type.
2357    const load_store_bits = 8 * ptr_info.packed_offset.host_size;
2358    const load_store_ty = try pt.intType(.unsigned, load_store_bits);
2359
2360    var inst_buf: [9]Air.Inst.Index = undefined;
2361    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);
2362
2363    var res_block: Block = .init(&inst_buf);
2364    {
2365        const backing_ptr_inst = res_block.add(l, .{
2366            .tag = .bitcast,
            .data = .{ .ty_op = .{
                .ty = Air.internedToRef((load_store_ptr_ty: {
                    var load_ptr_info = ptr_info;
                    load_ptr_info.child = load_store_ty.toIntern();
                    load_ptr_info.flags.vector_index = .none;
                    load_ptr_info.packed_offset = .{ .host_size = 0, .bit_offset = 0 };
                    break :load_store_ptr_ty try pt.ptrType(load_ptr_info);
                }).toIntern()),
                .operand = orig_bin_op.lhs,
            } },
        });
        _ = res_block.add(l, .{
            .tag = .store,
            .data = .{ .bin_op = .{
                .lhs = backing_ptr_inst.toRef(),
                .rhs = res_block.add(l, .{
                    .tag = .bit_or,
                    .data = .{ .bin_op = .{
                        .lhs = res_block.add(l, .{
                            .tag = .bit_and,
                            .data = .{ .bin_op = .{
                                .lhs = res_block.add(l, .{
                                    .tag = .load,
                                    .data = .{ .ty_op = .{
                                        .ty = Air.internedToRef(load_store_ty.toIntern()),
                                        .operand = backing_ptr_inst.toRef(),
                                    } },
                                }).toRef(),
                                .rhs = Air.internedToRef((keep_mask: {
                                    const ExpectedContents = [std.math.big.int.calcTwosCompLimbCount(256)]std.math.big.Limb;
                                    var stack align(@max(@alignOf(ExpectedContents), @alignOf(std.heap.StackFallbackAllocator(0)))) =
                                        std.heap.stackFallback(@sizeOf(ExpectedContents), zcu.gpa);
                                    const gpa = stack.get();

                                    var mask_big_int: std.math.big.int.Mutable = .{
                                        .limbs = try gpa.alloc(
                                            std.math.big.Limb,
                                            std.math.big.int.calcTwosCompLimbCount(load_store_bits),
                                        ),
                                        .len = undefined,
                                        .positive = undefined,
                                    };
                                    defer gpa.free(mask_big_int.limbs);
                                    mask_big_int.setTwosCompIntLimit(.max, .unsigned, operand_bits);
                                    mask_big_int.shiftLeft(mask_big_int.toConst(), ptr_info.packed_offset.bit_offset);
                                    mask_big_int.bitNotWrap(mask_big_int.toConst(), .unsigned, load_store_bits);
                                    break :keep_mask try pt.intValue_big(load_store_ty, mask_big_int.toConst());
                                }).toIntern()),
                            } },
                        }).toRef(),
                        .rhs = res_block.add(l, .{
                            .tag = .shl_exact,
                            .data = .{ .bin_op = .{
                                .lhs = res_block.add(l, .{
                                    .tag = .intcast,
                                    .data = .{ .ty_op = .{
                                        .ty = Air.internedToRef(load_store_ty.toIntern()),
                                        .operand = res_block.addBitCast(l, operand_int_ty, orig_bin_op.rhs),
                                    } },
                                }).toRef(),
                                .rhs = try pt.intRef(
                                    try pt.intType(.unsigned, std.math.log2_int_ceil(u16, load_store_bits)),
                                    ptr_info.packed_offset.bit_offset,
                                ),
                            } },
                        }).toRef(),
                    } },
                }).toRef(),
            } },
        });
        _ = res_block.add(l, .{
            .tag = .br,
            .data = .{ .br = .{
                .block_inst = orig_inst,
                .operand = .void_value,
            } },
        });
    }
    return .{ .ty_pl = .{
        .ty = .void_type,
        .payload = try l.addBlockBody(res_block.body()),
    } };
}
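/// Emits a block which extracts a field from a packed struct value: the aggregate is bitcast to
/// an unsigned integer, shifted right by the field's bit offset, truncated to the field's bit
/// width, and finally bitcast to the field type.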
fn packedStructFieldValBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;

    const orig_ty_pl = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_pl;
    const orig_extra = l.extraData(Air.StructField, orig_ty_pl.payload).data;
    const field_ty = orig_ty_pl.ty.toType();
    const agg_ty = l.typeOf(orig_extra.struct_operand);

    const agg_bits: u16 = @intCast(agg_ty.bitSize(zcu));
    const bit_offset = zcu.structPackedFieldBitOffset(zcu.typeToStruct(agg_ty).?, orig_extra.field_index);

    const agg_int_ty = try pt.intType(.unsigned, agg_bits);
    const field_int_ty = try pt.intType(.unsigned, @intCast(field_ty.bitSize(zcu)));

    const agg_shift_ty = try pt.intType(.unsigned, std.math.log2_int_ceil(u16, agg_bits));
    const bit_offset_ref: Air.Inst.Ref = .fromValue(try pt.intValue(agg_shift_ty, bit_offset));

    var inst_buf: [5]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);

    const agg_int = main_block.addBitCast(l, agg_int_ty, orig_extra.struct_operand);
    const shifted_agg_int = main_block.addBinOp(l, .shr, agg_int, bit_offset_ref).toRef();
    const field_int = main_block.addTyOp(l, .trunc, field_int_ty, shifted_agg_int).toRef();
    const field_val = main_block.addBitCast(l, field_ty, field_int);
    main_block.addBr(l, orig_inst, field_val);

    return .{ .ty_pl = .{
        .ty = .fromType(field_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}
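/// Emits a block which initializes a packed struct from its field values. Working from the last
/// field to the first, the accumulator is shifted left by the field's bit width, and the field
/// (bitcast to an unsigned integer and widened to the aggregate's bit width) is ORed into the
/// low bits; the final accumulator is bitcast to the aggregate type.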
fn packedAggregateInitBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;
    const gpa = zcu.gpa;

    const orig_ty_pl = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_pl;
    const agg_ty = orig_ty_pl.ty.toType();
    const agg_field_count = agg_ty.structFieldCount(zcu);

    var sfba_state = std.heap.stackFallback(@sizeOf([4 * 32 + 2]Air.Inst.Index), gpa);
    const sfba = sfba_state.get();

    const inst_buf = try sfba.alloc(Air.Inst.Index, 4 * agg_field_count + 2);
    defer sfba.free(inst_buf);

    var main_block: Block = .init(inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);

    const num_bits: u16 = @intCast(agg_ty.bitSize(zcu));
    const shift_ty = try pt.intType(.unsigned, std.math.log2_int_ceil(u16, num_bits));
    const uint_ty = try pt.intType(.unsigned, num_bits);
    var cur_uint: Air.Inst.Ref = .fromValue(try pt.intValue(uint_ty, 0));

    var field_idx = agg_field_count;
    while (field_idx > 0) {
        field_idx -= 1;
        const field_ty = agg_ty.fieldType(field_idx, zcu);
        const field_uint_ty = try pt.intType(.unsigned, @intCast(field_ty.bitSize(zcu)));
        const field_bit_size_ref: Air.Inst.Ref = .fromValue(try pt.intValue(shift_ty, field_ty.bitSize(zcu)));
        const field_val: Air.Inst.Ref = @enumFromInt(l.air_extra.items[orig_ty_pl.payload + field_idx]);

        const shifted = main_block.addBinOp(l, .shl_exact, cur_uint, field_bit_size_ref).toRef();
        const field_as_uint = main_block.addBitCast(l, field_uint_ty, field_val);
        const field_extended = main_block.addTyOp(l, .intcast, uint_ty, field_as_uint).toRef();
        cur_uint = main_block.addBinOp(l, .bit_or, shifted, field_extended).toRef();
    }

    const result = main_block.addBitCast(l, agg_ty, cur_uint);
    main_block.addBr(l, orig_inst, result);

    return .{ .ty_pl = .{
        .ty = .fromType(agg_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}

/// Given a `std.math.big.int.Const`, converts it to a `Value` which is a float of type `float_ty`
/// representing the same numeric value. If the integer cannot be exactly represented, `round`
/// decides whether the value should be rounded up or down. If `is_vector`, then `float_ty` is
/// instead a vector of floats, and the result value is a vector containing the converted scalar
/// repeated N times.
fn floatFromBigIntVal(
    pt: Zcu.PerThread,
    is_vector: bool,
    float_ty: Type,
    x: std.math.big.int.Const,
    round: std.math.big.int.Round,
) Error!Value {
    const zcu = pt.zcu;
    const scalar_ty = switch (is_vector) {
        true => float_ty.childType(zcu),
        false => float_ty,
    };
    assert(scalar_ty.zigTypeTag(zcu) == .float);
    const scalar_val: Value = switch (scalar_ty.floatBits(zcu.getTarget())) {
        16 => try pt.floatValue(scalar_ty, x.toFloat(f16, round)[0]),
        32 => try pt.floatValue(scalar_ty, x.toFloat(f32, round)[0]),
        64 => try pt.floatValue(scalar_ty, x.toFloat(f64, round)[0]),
        80 => try pt.floatValue(scalar_ty, x.toFloat(f80, round)[0]),
        128 => try pt.floatValue(scalar_ty, x.toFloat(f128, round)[0]),
        else => unreachable,
    };
    if (is_vector) {
        return pt.aggregateSplatValue(float_ty, scalar_val);
    } else {
        return scalar_val;
    }
}

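/// Helper for building the body of a block one instruction at a time into a caller-provided
/// fixed buffer. The typical pattern in this file looks like this (a sketch; the buffer length
/// must match the number of instructions actually added):
///
///     var inst_buf: [n]Air.Inst.Index = undefined;
///     var main_block: Block = .init(&inst_buf);
///     try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);
///     // ... `main_block.add*(l, ...)` calls ...
///     const payload = try l.addBlockBody(main_block.body());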
const Block = struct {
    instructions: []Air.Inst.Index,
    len: usize,

    /// There are two common usages of the API:
    /// * `buf.len` is exactly the number of instructions which will be in this block
    /// * `buf.len` is no smaller than necessary, and `b.stealRemainingCapacity` will be used
    fn init(buf: []Air.Inst.Index) Block {
        return .{
            .instructions = buf,
            .len = 0,
        };
    }

    /// Like `Legalize.addInstAssumeCapacity`, but also appends the instruction to `b`.
    fn add(b: *Block, l: *Legalize, inst_data: Air.Inst) Air.Inst.Index {
        const inst = l.addInstAssumeCapacity(inst_data);
        b.instructions[b.len] = inst;
        b.len += 1;
        return inst;
    }
    fn addBr(b: *Block, l: *Legalize, target: Air.Inst.Index, operand: Air.Inst.Ref) void {
        _ = b.add(l, .{
            .tag = .br,
            .data = .{ .br = .{ .block_inst = target, .operand = operand } },
        });
    }
    fn addTy(b: *Block, l: *Legalize, tag: Air.Inst.Tag, ty: Type) Air.Inst.Index {
        return b.add(l, .{ .tag = tag, .data = .{ .ty = ty } });
    }
    fn addBinOp(b: *Block, l: *Legalize, tag: Air.Inst.Tag, lhs: Air.Inst.Ref, rhs: Air.Inst.Ref) Air.Inst.Index {
        return b.add(l, .{
            .tag = tag,
            .data = .{ .bin_op = .{ .lhs = lhs, .rhs = rhs } },
        });
    }
    fn addUnOp(b: *Block, l: *Legalize, tag: Air.Inst.Tag, operand: Air.Inst.Ref) Air.Inst.Index {
        return b.add(l, .{
            .tag = tag,
            .data = .{ .un_op = operand },
        });
    }
    fn addTyOp(b: *Block, l: *Legalize, tag: Air.Inst.Tag, ty: Type, operand: Air.Inst.Ref) Air.Inst.Index {
        return b.add(l, .{
            .tag = tag,
            .data = .{ .ty_op = .{
                .ty = .fromType(ty),
                .operand = operand,
            } },
        });
    }

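    /// Adds a `legalize_compiler_rt_call` instruction to `b` which calls `func` with `args`,
    /// appending the `Air.Call` payload and the arguments to `extra`.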
    fn addCompilerRtCall(b: *Block, l: *Legalize, func: Air.CompilerRtFunc, args: []const Air.Inst.Ref) Error!Air.Inst.Index {
        return b.add(l, .{
            .tag = .legalize_compiler_rt_call,
            .data = .{ .legalize_compiler_rt_call = .{
                .func = func,
                .payload = payload: {
                    const extra_len = @typeInfo(Air.Call).@"struct".fields.len + args.len;
                    try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, extra_len);
                    const index = l.addExtra(Air.Call, .{ .args_len = @intCast(args.len) }) catch unreachable;
                    l.air_extra.appendSliceAssumeCapacity(@ptrCast(args));
                    break :payload index;
                },
            } },
        });
    }

    /// Adds the code to call the panic handler `panic_id`. This is usually `.call` then `.unreach`,
    /// but if `Zcu.Feature.panic_fn` is unsupported, we lower to `.trap` instead.
    fn addPanic(b: *Block, l: *Legalize, panic_id: Zcu.SimplePanicId) Error!void {
        const zcu = l.pt.zcu;
        if (!zcu.backendSupportsFeature(.panic_fn)) {
            _ = b.add(l, .{
                .tag = .trap,
                .data = .{ .no_op = {} },
            });
            return;
        }
        const panic_fn_val = zcu.builtin_decl_values.get(panic_id.toBuiltin());
        _ = b.add(l, .{
            .tag = .call,
            .data = .{ .pl_op = .{
                .operand = Air.internedToRef(panic_fn_val),
                .payload = try l.addExtra(Air.Call, .{ .args_len = 0 }),
            } },
        });
        _ = b.add(l, .{
            .tag = .unreach,
            .data = .{ .no_op = {} },
        });
    }

    /// Adds a `cmp_*` instruction (possibly `cmp_vector`) to `b`. This is a fairly thin wrapper
    /// around `add`, although it does compute the result type if `opts.vector` (`@Vector(n, bool)`).
    fn addCmp(
        b: *Block,
        l: *Legalize,
        op: std.math.CompareOperator,
        lhs: Air.Inst.Ref,
        rhs: Air.Inst.Ref,
        opts: struct { optimized: bool = false, vector: bool = false },
    ) Error!Air.Inst.Index {
        const pt = l.pt;
        if (opts.vector) {
            const bool_vec_ty = try pt.vectorType(.{
                .child = .bool_type,
                .len = l.typeOf(lhs).vectorLen(pt.zcu),
            });
            return b.add(l, .{
                .tag = if (opts.optimized) .cmp_vector_optimized else .cmp_vector,
                .data = .{ .ty_pl = .{
                    .ty = Air.internedToRef(bool_vec_ty.toIntern()),
                    .payload = try l.addExtra(Air.VectorCmp, .{
                        .lhs = lhs,
                        .rhs = rhs,
                        .op = Air.VectorCmp.encodeOp(op),
                    }),
                } },
            });
        }
        return addCmpScalar(b, l, op, lhs, rhs, opts.optimized);
    }

    /// Similar to `addCmp`, but for scalars only. Unlike `addCmp`, this function is
    /// infallible, because it doesn't need to add entries to `extra`.
    fn addCmpScalar(
        b: *Block,
        l: *Legalize,
        op: std.math.CompareOperator,
        lhs: Air.Inst.Ref,
        rhs: Air.Inst.Ref,
        optimized: bool,
    ) Air.Inst.Index {
        return b.add(l, .{
            .tag = .fromCmpOp(op, optimized),
            .data = .{ .bin_op = .{
                .lhs = lhs,
                .rhs = rhs,
            } },
        });
    }

    /// Adds a `bitcast` instruction to `b`. This is a thin wrapper that omits the instruction for
    /// no-op casts.
    fn addBitCast(
        b: *Block,
        l: *Legalize,
        ty: Type,
        operand: Air.Inst.Ref,
    ) Air.Inst.Ref {
        if (ty.toIntern() != l.typeOf(operand).toIntern()) return b.add(l, .{
            .tag = .bitcast,
            .data = .{ .ty_op = .{
                .ty = Air.internedToRef(ty.toIntern()),
                .operand = operand,
            } },
        }).toRef();
        _ = b.stealCapacity(1);
        return operand;
    }

    /// This function emits *two* instructions: the compiler-rt call itself, followed by a
    /// comparison of its `i32` result against zero.
    fn addSoftFloatCmp(
        b: *Block,
        l: *Legalize,
        float_ty: Type,
        op: std.math.CompareOperator,
        lhs: Air.Inst.Ref,
        rhs: Air.Inst.Ref,
    ) Error!Air.Inst.Ref {
        const pt = l.pt;
        const target = pt.zcu.getTarget();
        const use_aeabi = target.cpu.arch.isArm() and switch (target.abi) {
            .eabi,
            .eabihf,
            .musleabi,
            .musleabihf,
            .gnueabi,
            .gnueabihf,
            .android,
            .androideabi,
            => true,
            else => false,
        };
        const func: Air.CompilerRtFunc, const ret_cmp_op: std.math.CompareOperator = switch (float_ty.floatBits(target)) {
            // zig fmt: off
            16 => switch (op) {
                .eq  => .{ .__eqhf2, .eq  },
                .neq => .{ .__nehf2, .neq },
                .lt  => .{ .__lthf2, .lt  },
                .lte => .{ .__lehf2, .lte },
                .gt  => .{ .__gthf2, .gt  },
                .gte => .{ .__gehf2, .gte },
            },
            32 => switch (op) {
                .eq  => if (use_aeabi) .{ .__aeabi_fcmpeq, .neq } else .{ .__eqsf2, .eq  },
                .neq => if (use_aeabi) .{ .__aeabi_fcmpeq, .eq  } else .{ .__nesf2, .neq },
                .lt  => if (use_aeabi) .{ .__aeabi_fcmplt, .neq } else .{ .__ltsf2, .lt  },
                .lte => if (use_aeabi) .{ .__aeabi_fcmple, .neq } else .{ .__lesf2, .lte },
                .gt  => if (use_aeabi) .{ .__aeabi_fcmpgt, .neq } else .{ .__gtsf2, .gt  },
                .gte => if (use_aeabi) .{ .__aeabi_fcmpge, .neq } else .{ .__gesf2, .gte },
            },
            64 => switch (op) {
                .eq  => if (use_aeabi) .{ .__aeabi_dcmpeq, .neq } else .{ .__eqdf2, .eq  },
                .neq => if (use_aeabi) .{ .__aeabi_dcmpeq, .eq  } else .{ .__nedf2, .neq },
                .lt  => if (use_aeabi) .{ .__aeabi_dcmplt, .neq } else .{ .__ltdf2, .lt  },
                .lte => if (use_aeabi) .{ .__aeabi_dcmple, .neq } else .{ .__ledf2, .lte },
                .gt  => if (use_aeabi) .{ .__aeabi_dcmpgt, .neq } else .{ .__gtdf2, .gt  },
                .gte => if (use_aeabi) .{ .__aeabi_dcmpge, .neq } else .{ .__gedf2, .gte },
            },
            80 => switch (op) {
                .eq  => .{ .__eqxf2, .eq  },
                .neq => .{ .__nexf2, .neq },
                .lt  => .{ .__ltxf2, .lt  },
                .lte => .{ .__lexf2, .lte },
                .gt  => .{ .__gtxf2, .gt  },
                .gte => .{ .__gexf2, .gte },
            },
            128 => switch (op) {
                .eq  => .{ .__eqtf2, .eq  },
                .neq => .{ .__netf2, .neq },
                .lt  => .{ .__lttf2, .lt  },
                .lte => .{ .__letf2, .lte },
                .gt  => .{ .__gttf2, .gt  },
                .gte => .{ .__getf2, .gte },
            },
            else => unreachable,
            // zig fmt: on
        };
        const call_inst = try b.addCompilerRtCall(l, func, &.{ lhs, rhs });
        const raw_result = call_inst.toRef();
        assert(l.typeOf(raw_result).toIntern() == .i32_type);
        const zero_i32: Air.Inst.Ref = .fromValue(try pt.intValue(.i32, 0));
        const ret_cmp_tag: Air.Inst.Tag = .fromCmpOp(ret_cmp_op, false);
        return b.addBinOp(l, ret_cmp_tag, raw_result, zero_i32).toRef();
    }

    /// Returns the unused capacity of `b.instructions`, and shrinks `b.instructions` down to `b.len`.
    /// This is useful when you've provided a buffer big enough for all your instructions, but you are
    /// now starting a new block and some of them need to live there instead.
    fn stealRemainingCapacity(b: *Block) []Air.Inst.Index {
        return b.stealFrom(b.len);
    }

    /// Returns `len` elements taken from the unused capacity of `b.instructions`, and shrinks
    /// `b.instructions` down to not include them anymore.
    /// This is useful when you've provided a buffer big enough for all your instructions, but you are
    /// now starting a new block and some of them need to live there instead.
    fn stealCapacity(b: *Block, len: usize) []Air.Inst.Index {
        return b.stealFrom(b.instructions.len - len);
    }

    fn stealFrom(b: *Block, start: usize) []Air.Inst.Index {
        assert(start >= b.len);
        defer b.instructions.len = start;
        return b.instructions[start..];
    }

    fn body(b: *const Block) []const Air.Inst.Index {
        assert(b.len == b.instructions.len);
        return b.instructions;
    }
};

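/// Helper for emitting a `loop` instruction whose body is built after the instruction itself has
/// been added to the parent block; `finish` patches the body payload in afterwards.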
const Loop = struct {
    inst: Air.Inst.Index,
    block: Block,

    /// The return value has `block` initialized to `undefined`; it is the caller's responsibility
    /// to initialize it.
    fn init(l: *Legalize, parent_block: *Block) Loop {
        return .{
            .inst = parent_block.add(l, .{
                .tag = .loop,
                .data = .{ .ty_pl = .{
                    .ty = .noreturn_type,
                    .payload = undefined,
                } },
            }),
            .block = undefined,
        };
    }

    fn finish(loop: Loop, l: *Legalize) Error!void {
        const data = &l.air_instructions.items(.data)[@intFromEnum(loop.inst)];
        data.ty_pl.payload = try l.addBlockBody(loop.block.body());
    }
};

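/// Helper for emitting a `cond_br` instruction whose "then" and "else" bodies are built after the
/// instruction itself has been added to the parent block; `finish` patches the payload in
/// afterwards.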
const CondBr = struct {
    inst: Air.Inst.Index,
    hints: Air.CondBr.BranchHints,
    then_block: Block,
    else_block: Block,

    /// The return value has `then_block` and `else_block` initialized to `undefined`; it is the
    /// caller's responsibility to initialize them.
    fn init(l: *Legalize, operand: Air.Inst.Ref, parent_block: *Block, hints: Air.CondBr.BranchHints) CondBr {
        return .{
            .inst = parent_block.add(l, .{
                .tag = .cond_br,
                .data = .{ .pl_op = .{
                    .operand = operand,
                    .payload = undefined,
                } },
            }),
            .hints = hints,
            .then_block = undefined,
            .else_block = undefined,
        };
    }

    fn finish(cond_br: CondBr, l: *Legalize) Error!void {
        const then_body = cond_br.then_block.body();
        const else_body = cond_br.else_block.body();
        try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, 3 + then_body.len + else_body.len);

        const data = &l.air_instructions.items(.data)[@intFromEnum(cond_br.inst)];
        data.pl_op.payload = @intCast(l.air_extra.items.len);
        l.air_extra.appendSliceAssumeCapacity(&.{
            @intCast(then_body.len),
            @intCast(else_body.len),
            @bitCast(cond_br.hints),
        });
        l.air_extra.appendSliceAssumeCapacity(@ptrCast(then_body));
        l.air_extra.appendSliceAssumeCapacity(@ptrCast(else_body));
    }
};

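/// Appends `inst` to `l.air_instructions`, assuming capacity was already ensured, and returns
/// the index of the newly added instruction.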
fn addInstAssumeCapacity(l: *Legalize, inst: Air.Inst) Air.Inst.Index {
    defer l.air_instructions.appendAssumeCapacity(inst);
    return @enumFromInt(l.air_instructions.len);
}

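/// Appends the fields of `extra` to `l.air_extra` and returns the index at which they begin.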
fn addExtra(l: *Legalize, comptime Extra: type, extra: Extra) Error!u32 {
    const extra_fields = @typeInfo(Extra).@"struct".fields;
    try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, extra_fields.len);
    defer inline for (extra_fields) |field| l.air_extra.appendAssumeCapacity(switch (field.type) {
        u32 => @field(extra, field.name),
        Air.Inst.Ref => @intFromEnum(@field(extra, field.name)),
        else => @compileError(@typeName(field.type)),
    });
    return @intCast(l.air_extra.items.len);
}

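/// Appends `body`, preceded by its length, to `l.air_extra`, returning the index of the new
/// block payload.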
fn addBlockBody(l: *Legalize, body: []const Air.Inst.Index) Error!u32 {
    try l.air_extra.ensureUnusedCapacity(l.pt.zcu.gpa, 1 + body.len);
    defer {
        l.air_extra.appendAssumeCapacity(@intCast(body.len));
        l.air_extra.appendSliceAssumeCapacity(@ptrCast(body));
    }
    return @intCast(l.air_extra.items.len);
}

/// Returns `tag` to remind the caller to `continue :inst` the result.
/// `inline` to propagate the comptime-known `tag` result.
inline fn replaceInst(l: *Legalize, inst: Air.Inst.Index, comptime tag: Air.Inst.Tag, data: Air.Inst.Data) Air.Inst.Tag {
    const orig_ty = if (std.debug.runtime_safety) l.typeOfIndex(inst) else {};
    l.air_instructions.set(@intFromEnum(inst), .{ .tag = tag, .data = data });
    if (std.debug.runtime_safety) assert(l.typeOfIndex(inst).toIntern() == orig_ty.toIntern());
    return tag;
}

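/// Replaces `orig_inst` with a call to the compiler-rt function `func`. If the return type of
/// `func` differs from `result_ty` (which must have the same bit size), the call is wrapped in
/// a block which bitcasts the result.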
fn compilerRtCall(
    l: *Legalize,
    orig_inst: Air.Inst.Index,
    func: Air.CompilerRtFunc,
    args: []const Air.Inst.Ref,
    result_ty: Type,
) Error!Air.Inst.Tag {
    const zcu = l.pt.zcu;
    const gpa = zcu.gpa;

    const func_ret_ty = func.returnType();

    if (func_ret_ty.toIntern() == result_ty.toIntern()) {
        try l.air_extra.ensureUnusedCapacity(gpa, @typeInfo(Air.Call).@"struct".fields.len + args.len);
        const payload = l.addExtra(Air.Call, .{ .args_len = @intCast(args.len) }) catch unreachable;
        l.air_extra.appendSliceAssumeCapacity(@ptrCast(args));
        return l.replaceInst(orig_inst, .legalize_compiler_rt_call, .{ .legalize_compiler_rt_call = .{
            .func = func,
            .payload = payload,
        } });
    }

    // We need to bitcast the result to an "alias" type (e.g. c_int/i32, c_longdouble/f128).

    assert(func_ret_ty.bitSize(zcu) == result_ty.bitSize(zcu));

    var inst_buf: [3]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);

    const call_inst = try main_block.addCompilerRtCall(l, func, args);
    const casted_result = main_block.addBitCast(l, result_ty, call_inst.toRef());
    main_block.addBr(l, orig_inst, casted_result);

    return l.replaceInst(orig_inst, .block, .{ .ty_pl = .{
        .ty = .fromType(result_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } });
}

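/// Returns the compiler-rt routine which truncates the float type `src_ty` to the smaller float
/// type `dst_ty`. The `@enumFromInt` arithmetic assumes the related truncation functions are
/// declared at consecutive offsets in `Air.CompilerRtFunc`.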
fn softFptruncFunc(l: *const Legalize, src_ty: Type, dst_ty: Type) Air.CompilerRtFunc {
    const target = l.pt.zcu.getTarget();
    const src_bits = src_ty.floatBits(target);
    const dst_bits = dst_ty.floatBits(target);
    assert(dst_bits < src_bits);
    const to_f16_func: Air.CompilerRtFunc = switch (src_bits) {
        128 => .__trunctfhf2,
        80 => .__truncxfhf2,
        64 => .__truncdfhf2,
        32 => .__truncsfhf2,
        else => unreachable,
    };
    const offset: u8 = switch (dst_bits) {
        16 => 0,
        32 => 1,
        64 => 2,
        80 => 3,
        else => unreachable,
    };
    return @enumFromInt(@intFromEnum(to_f16_func) + offset);
}
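/// Returns the compiler-rt routine which extends the float type `src_ty` to the larger float
/// type `dst_ty`. Like `softFptruncFunc`, this assumes the related extension functions are
/// declared at consecutive offsets in `Air.CompilerRtFunc`.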
fn softFpextFunc(l: *const Legalize, src_ty: Type, dst_ty: Type) Air.CompilerRtFunc {
    const target = l.pt.zcu.getTarget();
    const src_bits = src_ty.floatBits(target);
    const dst_bits = dst_ty.floatBits(target);
    assert(dst_bits > src_bits);
    const to_f128_func: Air.CompilerRtFunc = switch (src_bits) {
        16 => .__extendhftf2,
        32 => .__extendsftf2,
        64 => .__extenddftf2,
        80 => .__extendxftf2,
        else => unreachable,
    };
    const offset: u8 = switch (dst_bits) {
        128 => 0,
        80 => 1,
        64 => 2,
        32 => 3,
        else => unreachable,
    };
    return @enumFromInt(@intFromEnum(to_f128_func) + offset);
}
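/// Lowers an int-to-float conversion to a compiler-rt call. If the source integer width fits one
/// of the fixed-width routines (up to 128 bits), the result is a direct call, or a block which
/// first widens the operand with `intcast`. Otherwise, the result is a block which spills the
/// (possibly extended) integer to an `alloc` and calls the arbitrary-precision routine with a
/// pointer to it and the bit count.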
fn softFloatFromInt(l: *Legalize, orig_inst: Air.Inst.Index) Error!union(enum) {
    call: Air.CompilerRtFunc,
    block_payload: Air.Inst.Data,
} {
    const pt = l.pt;
    const zcu = pt.zcu;
    const target = zcu.getTarget();

    const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op;
    const dest_ty = ty_op.ty.toType();
    const src_ty = l.typeOf(ty_op.operand);

    const src_info = src_ty.intInfo(zcu);
    const float_off: u32 = switch (dest_ty.floatBits(target)) {
        16 => 0,
        32 => 1,
        64 => 2,
        80 => 3,
        128 => 4,
        else => unreachable,
    };
    const base: Air.CompilerRtFunc = switch (src_info.signedness) {
        .signed => .__floatsihf,
        .unsigned => .__floatunsihf,
    };
    fixed: {
        const extended_int_bits: u16, const int_bits_off: u32 = switch (src_info.bits) {
            0...32 => .{ 32, 0 },
            33...64 => .{ 64, 5 },
            65...128 => .{ 128, 10 },
            else => break :fixed,
        };
        // x86_64-windows uses an odd callconv for 128-bit integers, so we use the
        // arbitrary-precision routine in that case for simplicity.
        if (target.cpu.arch == .x86_64 and target.os.tag == .windows and extended_int_bits == 128) {
            break :fixed;
        }

        const func: Air.CompilerRtFunc = @enumFromInt(@intFromEnum(base) + int_bits_off + float_off);
        if (extended_int_bits == src_info.bits) return .{ .call = func };

        // We need to emit a block which first sign/zero-extends to the right type and *then* calls
        // the required routine.
        const extended_ty = try l.pt.intType(src_info.signedness, extended_int_bits);

        var inst_buf: [4]Air.Inst.Index = undefined;
        var main_block: Block = .init(&inst_buf);
        try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);

        const extended_val = main_block.addTyOp(l, .intcast, extended_ty, ty_op.operand).toRef();
        const call_inst = try main_block.addCompilerRtCall(l, func, &.{extended_val});
        const casted_result = main_block.addBitCast(l, dest_ty, call_inst.toRef());
        main_block.addBr(l, orig_inst, casted_result);

        return .{ .block_payload = .{ .ty_pl = .{
            .ty = .fromType(dest_ty),
            .payload = try l.addBlockBody(main_block.body()),
        } } };
    }

    // We need to emit a block which puts the integer into an `alloc` (possibly sign/zero-extended)
    // and calls an arbitrary-width conversion routine.

    const func: Air.CompilerRtFunc = @enumFromInt(@intFromEnum(base) + 15 + float_off);

    // The extended integer routines expect the integer representation where the integer is
    // effectively zero- or sign-extended to its ABI size. We represent that by intcasting to
    // such an integer type and passing a pointer to *that*.
    const extended_ty = try pt.intType(src_info.signedness, @intCast(src_ty.abiSize(zcu) * 8));
    assert(extended_ty.abiSize(zcu) == src_ty.abiSize(zcu));

    var inst_buf: [6]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);

    const extended_val: Air.Inst.Ref = if (extended_ty.toIntern() != src_ty.toIntern()) ext: {
        break :ext main_block.addTyOp(l, .intcast, extended_ty, ty_op.operand).toRef();
    } else ext: {
        _ = main_block.stealCapacity(1);
        break :ext ty_op.operand;
    };
    const extended_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(extended_ty)).toRef();
    _ = main_block.addBinOp(l, .store, extended_ptr, extended_val);
    const bits_val = try pt.intValue(.usize, src_info.bits);
    const call_inst = try main_block.addCompilerRtCall(l, func, &.{ extended_ptr, .fromValue(bits_val) });
    const casted_result = main_block.addBitCast(l, dest_ty, call_inst.toRef());
    main_block.addBr(l, orig_inst, casted_result);

    return .{ .block_payload = .{ .ty_pl = .{
        .ty = .fromType(dest_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } } };
}
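/// Lowers a float-to-int conversion to a compiler-rt call. If the destination integer width fits
/// one of the fixed-width routines (up to 128 bits), the result is a direct call, or a block
/// which narrows the call result with `intcast`. Otherwise, the result is a block which calls
/// the arbitrary-precision routine with a result pointer and bit count, then loads and casts
/// the integer.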
fn softIntFromFloat(l: *Legalize, orig_inst: Air.Inst.Index) Error!union(enum) {
    call: Air.CompilerRtFunc,
    block_payload: Air.Inst.Data,
} {
    const pt = l.pt;
    const zcu = pt.zcu;
    const target = zcu.getTarget();

    const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op;
    const src_ty = l.typeOf(ty_op.operand);
    const dest_ty = ty_op.ty.toType();

    const dest_info = dest_ty.intInfo(zcu);
    const float_off: u32 = switch (src_ty.floatBits(target)) {
        16 => 0,
        32 => 1,
        64 => 2,
        80 => 3,
        128 => 4,
        else => unreachable,
    };
    const base: Air.CompilerRtFunc = switch (dest_info.signedness) {
        .signed => .__fixhfsi,
        .unsigned => .__fixunshfsi,
    };
    fixed: {
        const extended_int_bits: u16, const int_bits_off: u32 = switch (dest_info.bits) {
            0...32 => .{ 32, 0 },
            33...64 => .{ 64, 5 },
            65...128 => .{ 128, 10 },
            else => break :fixed,
        };
        // x86_64-windows uses an odd callconv for 128-bit integers, so we use the
        // arbitrary-precision routine in that case for simplicity.
        if (target.cpu.arch == .x86_64 and target.os.tag == .windows and extended_int_bits == 128) {
            break :fixed;
        }

        const func: Air.CompilerRtFunc = @enumFromInt(@intFromEnum(base) + int_bits_off + float_off);
        if (extended_int_bits == dest_info.bits) return .{ .call = func };

        // We need to emit a block which calls the routine and then casts to the required type.

        var inst_buf: [3]Air.Inst.Index = undefined;
        var main_block: Block = .init(&inst_buf);
        try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);

        const call_inst = try main_block.addCompilerRtCall(l, func, &.{ty_op.operand});
        const casted_val = main_block.addTyOp(l, .intcast, dest_ty, call_inst.toRef()).toRef();
        main_block.addBr(l, orig_inst, casted_val);

        return .{ .block_payload = .{ .ty_pl = .{
            .ty = .fromType(dest_ty),
            .payload = try l.addBlockBody(main_block.body()),
        } } };
    }

    // We need to emit a block which calls an arbitrary-width conversion routine, then loads the
    // integer from an `alloc` and possibly truncates it.
    const func: Air.CompilerRtFunc = @enumFromInt(@intFromEnum(base) + 15 + float_off);

    const extended_ty = try pt.intType(dest_info.signedness, @intCast(dest_ty.abiSize(zcu) * 8));
    assert(extended_ty.abiSize(zcu) == dest_ty.abiSize(zcu));

    var inst_buf: [5]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len);

    const extended_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(extended_ty)).toRef();
    const bits_val = try pt.intValue(.usize, dest_info.bits);
    _ = try main_block.addCompilerRtCall(l, func, &.{ extended_ptr, .fromValue(bits_val), ty_op.operand });
    const extended_val = main_block.addTyOp(l, .load, extended_ty, extended_ptr).toRef();
    const result_val = main_block.addTyOp(l, .intcast, dest_ty, extended_val).toRef();
    main_block.addBr(l, orig_inst, result_val);

    return .{ .block_payload = .{ .ty_pl = .{
        .ty = .fromType(dest_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } } };
}
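/// Returns the compiler-rt routine implementing the float operation `op` for `float_ty`. The
/// `@enumFromInt` arithmetic assumes the variants for each operation are declared consecutively
/// in `Air.CompilerRtFunc`, ordered from f16 up to f128.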
fn softFloatFunc(op: Air.Inst.Tag, float_ty: Type, zcu: *const Zcu) Air.CompilerRtFunc {
    const f16_func: Air.CompilerRtFunc = switch (op) {
        .add, .add_optimized => .__addhf3,
        .sub, .sub_optimized => .__subhf3,
        .mul, .mul_optimized => .__mulhf3,

        .div_float,
        .div_float_optimized,
        .div_exact,
        .div_exact_optimized,
        => .__divhf3,

        .min => .__fminh,
        .max => .__fmaxh,

        .ceil => .__ceilh,
        .floor => .__floorh,
        .trunc_float => .__trunch,
        .round => .__roundh,

        .log => .__logh,
        .log2 => .__log2h,
        .log10 => .__log10h,

        .exp => .__exph,
        .exp2 => .__exp2h,

        .sin => .__sinh,
        .cos => .__cosh,
        .tan => .__tanh,

        .abs => .__fabsh,
        .sqrt => .__sqrth,
        .rem, .rem_optimized => .__fmodh,
        .mul_add => .__fmah,

        else => unreachable,
    };
    const offset: u8 = switch (float_ty.floatBits(zcu.getTarget())) {
        16 => 0,
        32 => 1,
        64 => 2,
        80 => 3,
        128 => 4,
        else => unreachable,
    };
    return @enumFromInt(@intFromEnum(f16_func) + offset);
}

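/// Emits a block which negates a soft float by bitcasting it to an unsigned integer of the same
/// width, XORing in the sign bit, and bitcasting back.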
fn softFloatNegBlockPayload(
    l: *Legalize,
    orig_inst: Air.Inst.Index,
    operand: Air.Inst.Ref,
) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;
    const gpa = zcu.gpa;

    const float_ty = l.typeOfIndex(orig_inst);

    const int_ty: Type, const sign_bit: Value = switch (float_ty.floatBits(zcu.getTarget())) {
        16 => .{ .u16, try pt.intValue(.u16, @as(u16, 1) << 15) },
        32 => .{ .u32, try pt.intValue(.u32, @as(u32, 1) << 31) },
        64 => .{ .u64, try pt.intValue(.u64, @as(u64, 1) << 63) },
        80 => .{ .u80, try pt.intValue(.u80, @as(u80, 1) << 79) },
        128 => .{ .u128, try pt.intValue(.u128, @as(u128, 1) << 127) },
        else => unreachable,
    };

    const sign_bit_ref: Air.Inst.Ref = .fromValue(sign_bit);

    var inst_buf: [4]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);

    const operand_as_int = main_block.addBitCast(l, int_ty, operand);
    const result_as_int = main_block.addBinOp(l, .xor, operand_as_int, sign_bit_ref).toRef();
    const result = main_block.addBitCast(l, float_ty, result_as_int);
    main_block.addBr(l, orig_inst, result);

    return .{ .ty_pl = .{
        .ty = .fromType(float_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}

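/// Emits a block which implements `div_trunc`/`div_floor` on soft floats as a division call
/// followed by a call to the corresponding rounding routine (`trunc` or `floor`).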
fn softFloatDivTruncFloorBlockPayload(
    l: *Legalize,
    orig_inst: Air.Inst.Index,
    lhs: Air.Inst.Ref,
    rhs: Air.Inst.Ref,
    air_tag: Air.Inst.Tag,
) Error!Air.Inst.Data {
    const zcu = l.pt.zcu;
    const gpa = zcu.gpa;

    const float_ty = l.typeOfIndex(orig_inst);

    const floor_tag: Air.Inst.Tag = switch (air_tag) {
        .div_trunc, .div_trunc_optimized => .trunc_float,
        .div_floor, .div_floor_optimized => .floor,
        else => unreachable,
    };

    var inst_buf: [4]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);

    const div_inst = try main_block.addCompilerRtCall(l, softFloatFunc(.div_float, float_ty, zcu), &.{ lhs, rhs });
    const floor_inst = try main_block.addCompilerRtCall(l, softFloatFunc(floor_tag, float_ty, zcu), &.{div_inst.toRef()});
    const casted_result = main_block.addBitCast(l, float_ty, floor_inst.toRef());
    main_block.addBr(l, orig_inst, casted_result);

    return .{ .ty_pl = .{
        .ty = .fromType(float_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}
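/// Emits a block which implements `@mod` on soft floats: `rem = fmod(lhs, rhs)`, and if
/// `lhs < 0`, the result is instead `fmod(rem + rhs, rhs)` so that it takes the sign of `rhs`.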
fn softFloatModBlockPayload(
    l: *Legalize,
    orig_inst: Air.Inst.Index,
    lhs: Air.Inst.Ref,
    rhs: Air.Inst.Ref,
) Error!Air.Inst.Data {
    const pt = l.pt;
    const zcu = pt.zcu;
    const gpa = zcu.gpa;

    const float_ty = l.typeOfIndex(orig_inst);

    var inst_buf: [10]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);

    const rem = try main_block.addCompilerRtCall(l, softFloatFunc(.rem, float_ty, zcu), &.{ lhs, rhs });
    const lhs_lt_zero = try main_block.addSoftFloatCmp(l, float_ty, .lt, lhs, .fromValue(try pt.floatValue(float_ty, 0.0)));

    var condbr: CondBr = .init(l, lhs_lt_zero, &main_block, .{});
    condbr.then_block = .init(main_block.stealRemainingCapacity());
    {
        const add = try condbr.then_block.addCompilerRtCall(l, softFloatFunc(.add, float_ty, zcu), &.{ rem.toRef(), rhs });
        const inner_rem = try condbr.then_block.addCompilerRtCall(l, softFloatFunc(.rem, float_ty, zcu), &.{ add.toRef(), rhs });
        const casted_result = condbr.then_block.addBitCast(l, float_ty, inner_rem.toRef());
        condbr.then_block.addBr(l, orig_inst, casted_result);
    }
    condbr.else_block = .init(condbr.then_block.stealRemainingCapacity());
    {
        const casted_result = condbr.else_block.addBitCast(l, float_ty, rem.toRef());
        condbr.else_block.addBr(l, orig_inst, casted_result);
    }

    try condbr.finish(l);

    return .{ .ty_pl = .{
        .ty = .fromType(float_ty),
        .payload = try l.addBlockBody(main_block.body()),
    } };
}
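/// Emits a block which implements a float comparison via `Block.addSoftFloatCmp`, i.e. a
/// compiler-rt call followed by an integer comparison of its result against zero.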
fn softFloatCmpBlockPayload(
    l: *Legalize,
    orig_inst: Air.Inst.Index,
    float_ty: Type,
    op: std.math.CompareOperator,
    lhs: Air.Inst.Ref,
    rhs: Air.Inst.Ref,
) Error!Air.Inst.Data {
    const pt = l.pt;
    const gpa = pt.zcu.gpa;

    var inst_buf: [3]Air.Inst.Index = undefined;
    var main_block: Block = .init(&inst_buf);
    try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len);

    const result = try main_block.addSoftFloatCmp(l, float_ty, op, lhs, rhs);
    main_block.addBr(l, orig_inst, result);

    return .{ .ty_pl = .{
        .ty = .bool_type,
        .payload = try l.addBlockBody(main_block.body()),
    } };
}

/// `inline` to propagate potentially comptime-known return value.
inline fn wantScalarizeOrSoftFloat(
    l: *const Legalize,
    comptime air_tag: Air.Inst.Tag,
    ty: Type,
) enum {
    none,
    scalarize,
    soft_float,
} {
    const zcu = l.pt.zcu;
    const is_vec, const scalar_ty = switch (ty.zigTypeTag(zcu)) {
        .vector => .{ true, ty.childType(zcu) },
        else => .{ false, ty },
    };

    if (is_vec and l.features.has(.scalarize(air_tag))) return .scalarize;

    if (l.wantSoftFloatScalar(scalar_ty)) {
        return if (is_vec) .scalarize else .soft_float;
    }
    return .none;
}

/// `inline` to propagate potentially comptime-known return value.
inline fn wantSoftFloatScalar(l: *const Legalize, ty: Type) bool {
    const zcu = l.pt.zcu;
    return switch (ty.zigTypeTag(zcu)) {
        .vector => unreachable,
        .float => switch (ty.floatBits(zcu.getTarget())) {
            16 => l.features.has(.soft_f16),
            32 => l.features.has(.soft_f32),
            64 => l.features.has(.soft_f64),
            80 => l.features.has(.soft_f80),
            128 => l.features.has(.soft_f128),
            else => unreachable,
        },
        else => false,
    };
}

const Air = @import("../Air.zig");
const assert = std.debug.assert;
const dev = @import("../dev.zig");
const InternPool = @import("../InternPool.zig");
const Legalize = @This();
const std = @import("std");
const Type = @import("../Type.zig");
const Value = @import("../Value.zig");
const Zcu = @import("../Zcu.zig");