Commit c478c7609e

Jacob Young <jacobly0@users.noreply.github.com>
2023-03-05 06:01:15
CBE: implement vector operations
Also, bigint add and sub which is all I was actually trying to do.
1 parent b2e9c0d
lib/zig.h
@@ -612,12 +612,6 @@ static inline bool zig_addo_u32(uint32_t *res, uint32_t lhs, uint32_t rhs, uint8
 #endif
 }
 
-static inline void zig_vaddo_u32(uint8_t *ov, uint32_t *res, int n,
-    const uint32_t *lhs, const uint32_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_u32(&res[i], lhs[i], rhs[i], bits);
-}
-
 zig_extern int32_t  __addosi4(int32_t lhs, int32_t rhs, int *overflow);
 static inline bool zig_addo_i32(int32_t *res, int32_t lhs, int32_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
@@ -632,12 +626,6 @@ static inline bool zig_addo_i32(int32_t *res, int32_t lhs, int32_t rhs, uint8_t
     return overflow || full_res < zig_minInt_i(32, bits) || full_res > zig_maxInt_i(32, bits);
 }
 
-static inline void zig_vaddo_i32(uint8_t *ov, int32_t *res, int n,
-    const int32_t *lhs, const int32_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_i32(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_addo_u64(uint64_t *res, uint64_t lhs, uint64_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
     uint64_t full_res;
@@ -650,12 +638,6 @@ static inline bool zig_addo_u64(uint64_t *res, uint64_t lhs, uint64_t rhs, uint8
 #endif
 }
 
-static inline void zig_vaddo_u64(uint8_t *ov, uint64_t *res, int n,
-    const uint64_t *lhs, const uint64_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_u64(&res[i], lhs[i], rhs[i], bits);
-}
-
 zig_extern int64_t  __addodi4(int64_t lhs, int64_t rhs, int *overflow);
 static inline bool zig_addo_i64(int64_t *res, int64_t lhs, int64_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
@@ -670,12 +652,6 @@ static inline bool zig_addo_i64(int64_t *res, int64_t lhs, int64_t rhs, uint8_t
     return overflow || full_res < zig_minInt_i(64, bits) || full_res > zig_maxInt_i(64, bits);
 }
 
-static inline void zig_vaddo_i64(uint8_t *ov, int64_t *res, int n,
-    const int64_t *lhs, const int64_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_i64(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_addo_u8(uint8_t *res, uint8_t lhs, uint8_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
     uint8_t full_res;
@@ -690,12 +666,6 @@ static inline bool zig_addo_u8(uint8_t *res, uint8_t lhs, uint8_t rhs, uint8_t b
 #endif
 }
 
-static inline void zig_vaddo_u8(uint8_t *ov, uint8_t *res, int n,
-    const uint8_t *lhs, const uint8_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_u8(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_addo_i8(int8_t *res, int8_t lhs, int8_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
     int8_t full_res;
@@ -710,12 +680,6 @@ static inline bool zig_addo_i8(int8_t *res, int8_t lhs, int8_t rhs, uint8_t bits
 #endif
 }
 
-static inline void zig_vaddo_i8(uint8_t *ov, int8_t *res, int n,
-    const int8_t *lhs, const int8_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_i8(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_addo_u16(uint16_t *res, uint16_t lhs, uint16_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
     uint16_t full_res;
@@ -730,12 +694,6 @@ static inline bool zig_addo_u16(uint16_t *res, uint16_t lhs, uint16_t rhs, uint8
 #endif
 }
 
-static inline void zig_vaddo_u16(uint8_t *ov, uint16_t *res, int n,
-    const uint16_t *lhs, const uint16_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_u16(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_addo_i16(int16_t *res, int16_t lhs, int16_t rhs, uint8_t bits) {
 #if zig_has_builtin(add_overflow) || defined(zig_gnuc)
     int16_t full_res;
@@ -750,12 +708,6 @@ static inline bool zig_addo_i16(int16_t *res, int16_t lhs, int16_t rhs, uint8_t
 #endif
 }
 
-static inline void zig_vaddo_i16(uint8_t *ov, int16_t *res, int n,
-    const int16_t *lhs, const int16_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_addo_i16(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_subo_u32(uint32_t *res, uint32_t lhs, uint32_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
     uint32_t full_res;
@@ -768,12 +720,6 @@ static inline bool zig_subo_u32(uint32_t *res, uint32_t lhs, uint32_t rhs, uint8
 #endif
 }
 
-static inline void zig_vsubo_u32(uint8_t *ov, uint32_t *res, int n,
-    const uint32_t *lhs, const uint32_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_u32(&res[i], lhs[i], rhs[i], bits);
-}
-
 zig_extern int32_t  __subosi4(int32_t lhs, int32_t rhs, int *overflow);
 static inline bool zig_subo_i32(int32_t *res, int32_t lhs, int32_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
@@ -788,12 +734,6 @@ static inline bool zig_subo_i32(int32_t *res, int32_t lhs, int32_t rhs, uint8_t
     return overflow || full_res < zig_minInt_i(32, bits) || full_res > zig_maxInt_i(32, bits);
 }
 
-static inline void zig_vsubo_i32(uint8_t *ov, int32_t *res, int n,
-    const int32_t *lhs, const int32_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_i32(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_subo_u64(uint64_t *res, uint64_t lhs, uint64_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
     uint64_t full_res;
@@ -806,12 +746,6 @@ static inline bool zig_subo_u64(uint64_t *res, uint64_t lhs, uint64_t rhs, uint8
 #endif
 }
 
-static inline void zig_vsubo_u64(uint8_t *ov, uint64_t *res, int n,
-    const uint64_t *lhs, const uint64_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_u64(&res[i], lhs[i], rhs[i], bits);
-}
-
 zig_extern int64_t  __subodi4(int64_t lhs, int64_t rhs, int *overflow);
 static inline bool zig_subo_i64(int64_t *res, int64_t lhs, int64_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
@@ -826,12 +760,6 @@ static inline bool zig_subo_i64(int64_t *res, int64_t lhs, int64_t rhs, uint8_t
     return overflow || full_res < zig_minInt_i(64, bits) || full_res > zig_maxInt_i(64, bits);
 }
 
-static inline void zig_vsubo_i64(uint8_t *ov, int64_t *res, int n,
-    const int64_t *lhs, const int64_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_i64(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_subo_u8(uint8_t *res, uint8_t lhs, uint8_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
     uint8_t full_res;
@@ -846,12 +774,6 @@ static inline bool zig_subo_u8(uint8_t *res, uint8_t lhs, uint8_t rhs, uint8_t b
 #endif
 }
 
-static inline void zig_vsubo_u8(uint8_t *ov, uint8_t *res, int n,
-    const uint8_t *lhs, const uint8_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_u8(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_subo_i8(int8_t *res, int8_t lhs, int8_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
     int8_t full_res;
@@ -866,13 +788,6 @@ static inline bool zig_subo_i8(int8_t *res, int8_t lhs, int8_t rhs, uint8_t bits
 #endif
 }
 
-static inline void zig_vsubo_i8(uint8_t *ov, int8_t *res, int n,
-    const int8_t *lhs, const int8_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_i8(&res[i], lhs[i], rhs[i], bits);
-}
-
-
 static inline bool zig_subo_u16(uint16_t *res, uint16_t lhs, uint16_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
     uint16_t full_res;
@@ -887,13 +802,6 @@ static inline bool zig_subo_u16(uint16_t *res, uint16_t lhs, uint16_t rhs, uint8
 #endif
 }
 
-static inline void zig_vsubo_u16(uint8_t *ov, uint16_t *res, int n,
-    const uint16_t *lhs, const uint16_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_u16(&res[i], lhs[i], rhs[i], bits);
-}
-
-
 static inline bool zig_subo_i16(int16_t *res, int16_t lhs, int16_t rhs, uint8_t bits) {
 #if zig_has_builtin(sub_overflow) || defined(zig_gnuc)
     int16_t full_res;
@@ -908,12 +816,6 @@ static inline bool zig_subo_i16(int16_t *res, int16_t lhs, int16_t rhs, uint8_t
 #endif
 }
 
-static inline void zig_vsubo_i16(uint8_t *ov, int16_t *res, int n,
-    const int16_t *lhs, const int16_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_subo_i16(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_mulo_u32(uint32_t *res, uint32_t lhs, uint32_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
     uint32_t full_res;
@@ -926,12 +828,6 @@ static inline bool zig_mulo_u32(uint32_t *res, uint32_t lhs, uint32_t rhs, uint8
 #endif
 }
 
-static inline void zig_vmulo_u32(uint8_t *ov, uint32_t *res, int n,
-    const uint32_t *lhs, const uint32_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_u32(&res[i], lhs[i], rhs[i], bits);
-}
-
 zig_extern int32_t  __mulosi4(int32_t lhs, int32_t rhs, int *overflow);
 static inline bool zig_mulo_i32(int32_t *res, int32_t lhs, int32_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
@@ -946,12 +842,6 @@ static inline bool zig_mulo_i32(int32_t *res, int32_t lhs, int32_t rhs, uint8_t
     return overflow || full_res < zig_minInt_i(32, bits) || full_res > zig_maxInt_i(32, bits);
 }
 
-static inline void zig_vmulo_i32(uint8_t *ov, int32_t *res, int n,
-    const int32_t *lhs, const int32_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_i32(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_mulo_u64(uint64_t *res, uint64_t lhs, uint64_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
     uint64_t full_res;
@@ -964,12 +854,6 @@ static inline bool zig_mulo_u64(uint64_t *res, uint64_t lhs, uint64_t rhs, uint8
 #endif
 }
 
-static inline void zig_vmulo_u64(uint8_t *ov, uint64_t *res, int n,
-    const uint64_t *lhs, const uint64_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_u64(&res[i], lhs[i], rhs[i], bits);
-}
-
 zig_extern int64_t  __mulodi4(int64_t lhs, int64_t rhs, int *overflow);
 static inline bool zig_mulo_i64(int64_t *res, int64_t lhs, int64_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
@@ -984,12 +868,6 @@ static inline bool zig_mulo_i64(int64_t *res, int64_t lhs, int64_t rhs, uint8_t
     return overflow || full_res < zig_minInt_i(64, bits) || full_res > zig_maxInt_i(64, bits);
 }
 
-static inline void zig_vmulo_i64(uint8_t *ov, int64_t *res, int n,
-    const int64_t *lhs, const int64_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_i64(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_mulo_u8(uint8_t *res, uint8_t lhs, uint8_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
     uint8_t full_res;
@@ -1004,12 +882,6 @@ static inline bool zig_mulo_u8(uint8_t *res, uint8_t lhs, uint8_t rhs, uint8_t b
 #endif
 }
 
-static inline void zig_vmulo_u8(uint8_t *ov, uint8_t *res, int n,
-    const uint8_t *lhs, const uint8_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_u8(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_mulo_i8(int8_t *res, int8_t lhs, int8_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
     int8_t full_res;
@@ -1024,12 +896,6 @@ static inline bool zig_mulo_i8(int8_t *res, int8_t lhs, int8_t rhs, uint8_t bits
 #endif
 }
 
-static inline void zig_vmulo_i8(uint8_t *ov, int8_t *res, int n,
-    const int8_t *lhs, const int8_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_i8(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_mulo_u16(uint16_t *res, uint16_t lhs, uint16_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
     uint16_t full_res;
@@ -1044,12 +910,6 @@ static inline bool zig_mulo_u16(uint16_t *res, uint16_t lhs, uint16_t rhs, uint8
 #endif
 }
 
-static inline void zig_vmulo_u16(uint8_t *ov, uint16_t *res, int n,
-    const uint16_t *lhs, const uint16_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_u16(&res[i], lhs[i], rhs[i], bits);
-}
-
 static inline bool zig_mulo_i16(int16_t *res, int16_t lhs, int16_t rhs, uint8_t bits) {
 #if zig_has_builtin(mul_overflow) || defined(zig_gnuc)
     int16_t full_res;
@@ -1064,12 +924,6 @@ static inline bool zig_mulo_i16(int16_t *res, int16_t lhs, int16_t rhs, uint8_t
 #endif
 }
 
-static inline void zig_vmulo_i16(uint8_t *ov, int16_t *res, int n,
-    const int16_t *lhs, const int16_t *rhs, uint8_t bits)
-{
-    for (int i = 0; i < n; ++i) ov[i] = zig_mulo_i16(&res[i], lhs[i], rhs[i], bits);
-}
-
 #define zig_int_builtins(w) \
     static inline bool zig_shlo_u##w(uint##w##_t *res, uint##w##_t lhs, uint8_t rhs, uint8_t bits) { \
         *res = zig_shlw_u##w(lhs, rhs, bits); \
@@ -2090,6 +1944,446 @@ static inline int32_t zig_cmp_big(const void *lhs, const void *rhs, bool is_sign
     return 0;
 }
 
+/* Arbitrary-precision ("big" integer) addition with overflow detection.
+ * `lhs`, `rhs` and `res` point to byte buffers of zig_int_bytes(bits) bytes
+ * holding a `bits`-bit integer in native endianness (NOTE(review): buffer
+ * sizing and endianness are assumed from usage here -- confirm against the
+ * CBE callers).  Limbs are consumed from least to most significant,
+ * threading the carry through `overflow`; the return value reports whether
+ * the full `bits`-bit (signed or unsigned, per `is_signed`) sum overflowed.
+ * The wrapped result is always stored to `res`. */
+static inline bool zig_addo_big(void *res, const void *lhs, const void *rhs, bool is_signed, uint16_t bits) {
+    uint8_t *res_bytes = res;
+    const uint8_t *lhs_bytes = lhs;
+    const uint8_t *rhs_bytes = rhs;
+    uint16_t byte_offset = 0;
+    uint16_t remaining_bytes = zig_int_bytes(bits);
+    uint16_t top_bits = remaining_bytes * 8 - bits; /* padding bits in the most significant limb */
+    bool overflow = false;
+
+#if zig_big_endian
+    /* Big-endian: the least significant limb sits at the end of the buffer,
+     * so start one-past-the-end and step the offset down before each limb. */
+    byte_offset = remaining_bytes;
+#endif
+
+    /* Fold 128-bit limbs first; the loops below repeat the identical
+     * pattern at 64-, 32-, 16- and 8-bit widths for the remainder. */
+    while (remaining_bytes >= 128 / CHAR_BIT) {
+        uint16_t limb_bits = 128 - (remaining_bytes == 128 / CHAR_BIT ? top_bits : 0); /* top limb may hold fewer value bits */
+
+#if zig_big_endian
+        byte_offset -= 128 / CHAR_BIT;
+#endif
+
+        /* Only the most significant limb of a signed integer is added as
+         * signed; every lower limb is treated as unsigned. */
+        if (remaining_bytes == 128 / CHAR_BIT && is_signed) {
+            zig_i128 res_limb;
+            zig_i128 tmp_limb;
+            zig_i128 lhs_limb;
+            zig_i128 rhs_limb;
+            bool limb_overflow;
+
+            /* memcpy loads avoid unaligned-access and strict-aliasing UB. */
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_i128(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            /* Fold in the carry from the previous limb.  The limb add and
+             * the carry add cannot both overflow (the carry is at most 1),
+             * so `^` behaves as `|` here. */
+            overflow = limb_overflow ^ zig_addo_i128(&res_limb, tmp_limb, zig_make_i128(INT64_C(0), overflow ? UINT64_C(1) : UINT64_C(0)), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            zig_u128 res_limb;
+            zig_u128 tmp_limb;
+            zig_u128 lhs_limb;
+            zig_u128 rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_u128(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_u128(&res_limb, tmp_limb, zig_make_u128(UINT64_C(0), overflow ? UINT64_C(1) : UINT64_C(0)), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 128 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 128 / CHAR_BIT;
+#endif
+    }
+
+    /* Same carry-threading pattern at 64-bit width. */
+    while (remaining_bytes >= 64 / CHAR_BIT) {
+        uint16_t limb_bits = 64 - (remaining_bytes == 64 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 64 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 64 / CHAR_BIT && is_signed) {
+            int64_t res_limb;
+            int64_t tmp_limb;
+            int64_t lhs_limb;
+            int64_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_i64(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_i64(&res_limb, tmp_limb, overflow ? INT64_C(1) : INT64_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint64_t res_limb;
+            uint64_t tmp_limb;
+            uint64_t lhs_limb;
+            uint64_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_u64(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_u64(&res_limb, tmp_limb, overflow ? UINT64_C(1) : UINT64_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 64 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 64 / CHAR_BIT;
+#endif
+    }
+
+    /* Same pattern at 32-bit width. */
+    while (remaining_bytes >= 32 / CHAR_BIT) {
+        uint16_t limb_bits = 32 - (remaining_bytes == 32 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 32 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 32 / CHAR_BIT && is_signed) {
+            int32_t res_limb;
+            int32_t tmp_limb;
+            int32_t lhs_limb;
+            int32_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_i32(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_i32(&res_limb, tmp_limb, overflow ? INT32_C(1) : INT32_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint32_t res_limb;
+            uint32_t tmp_limb;
+            uint32_t lhs_limb;
+            uint32_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_u32(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_u32(&res_limb, tmp_limb, overflow ? UINT32_C(1) : UINT32_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 32 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 32 / CHAR_BIT;
+#endif
+    }
+
+    /* Same pattern at 16-bit width. */
+    while (remaining_bytes >= 16 / CHAR_BIT) {
+        uint16_t limb_bits = 16 - (remaining_bytes == 16 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 16 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 16 / CHAR_BIT && is_signed) {
+            int16_t res_limb;
+            int16_t tmp_limb;
+            int16_t lhs_limb;
+            int16_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_i16(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_i16(&res_limb, tmp_limb, overflow ? INT16_C(1) : INT16_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint16_t res_limb;
+            uint16_t tmp_limb;
+            uint16_t lhs_limb;
+            uint16_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_u16(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_u16(&res_limb, tmp_limb, overflow ? UINT16_C(1) : UINT16_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 16 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 16 / CHAR_BIT;
+#endif
+    }
+
+    /* Final pattern at 8-bit width; at most one iteration of leftover bytes. */
+    while (remaining_bytes >= 8 / CHAR_BIT) {
+        uint16_t limb_bits = 8 - (remaining_bytes == 8 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 8 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 8 / CHAR_BIT && is_signed) {
+            int8_t res_limb;
+            int8_t tmp_limb;
+            int8_t lhs_limb;
+            int8_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_i8(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_i8(&res_limb, tmp_limb, overflow ? INT8_C(1) : INT8_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint8_t res_limb;
+            uint8_t tmp_limb;
+            uint8_t lhs_limb;
+            uint8_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_addo_u8(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_addo_u8(&res_limb, tmp_limb, overflow ? UINT8_C(1) : UINT8_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 8 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 8 / CHAR_BIT;
+#endif
+    }
+
+    return overflow;
+}
+
+/* Arbitrary-precision ("big" integer) subtraction with overflow detection.
+ * Structured identically to zig_addo_big: `lhs`, `rhs` and `res` are
+ * native-endian byte buffers of zig_int_bytes(bits) bytes (NOTE(review):
+ * assumed from usage -- confirm against callers).  Limbs are consumed from
+ * least to most significant, threading the *borrow* through `overflow`;
+ * returns whether the `bits`-bit (signed/unsigned) difference overflowed.
+ * The wrapped result is always stored to `res`. */
+static inline bool zig_subo_big(void *res, const void *lhs, const void *rhs, bool is_signed, uint16_t bits) {
+    uint8_t *res_bytes = res;
+    const uint8_t *lhs_bytes = lhs;
+    const uint8_t *rhs_bytes = rhs;
+    uint16_t byte_offset = 0;
+    uint16_t remaining_bytes = zig_int_bytes(bits);
+    uint16_t top_bits = remaining_bytes * 8 - bits; /* padding bits in the most significant limb */
+    bool overflow = false;
+
+#if zig_big_endian
+    /* Big-endian: least significant limb is at the end; walk downward. */
+    byte_offset = remaining_bytes;
+#endif
+
+    /* 128-bit limbs first; the loops below repeat the pattern at 64-, 32-,
+     * 16- and 8-bit widths for the remainder. */
+    while (remaining_bytes >= 128 / CHAR_BIT) {
+        uint16_t limb_bits = 128 - (remaining_bytes == 128 / CHAR_BIT ? top_bits : 0); /* top limb may hold fewer value bits */
+
+#if zig_big_endian
+        byte_offset -= 128 / CHAR_BIT;
+#endif
+
+        /* Only the most significant limb of a signed integer is subtracted
+         * as signed; every lower limb is treated as unsigned. */
+        if (remaining_bytes == 128 / CHAR_BIT && is_signed) {
+            zig_i128 res_limb;
+            zig_i128 tmp_limb;
+            zig_i128 lhs_limb;
+            zig_i128 rhs_limb;
+            bool limb_overflow;
+
+            /* memcpy loads avoid unaligned-access and strict-aliasing UB. */
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_i128(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            /* Subtract the borrow from the previous limb.  The limb sub and
+             * the borrow sub cannot both overflow (the borrow is at most 1),
+             * so `^` behaves as `|` here. */
+            overflow = limb_overflow ^ zig_subo_i128(&res_limb, tmp_limb, zig_make_i128(INT64_C(0), overflow ? UINT64_C(1) : UINT64_C(0)), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            zig_u128 res_limb;
+            zig_u128 tmp_limb;
+            zig_u128 lhs_limb;
+            zig_u128 rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_u128(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_u128(&res_limb, tmp_limb, zig_make_u128(UINT64_C(0), overflow ? UINT64_C(1) : UINT64_C(0)), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 128 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 128 / CHAR_BIT;
+#endif
+    }
+
+    /* Same borrow-threading pattern at 64-bit width. */
+    while (remaining_bytes >= 64 / CHAR_BIT) {
+        uint16_t limb_bits = 64 - (remaining_bytes == 64 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 64 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 64 / CHAR_BIT && is_signed) {
+            int64_t res_limb;
+            int64_t tmp_limb;
+            int64_t lhs_limb;
+            int64_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_i64(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_i64(&res_limb, tmp_limb, overflow ? INT64_C(1) : INT64_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint64_t res_limb;
+            uint64_t tmp_limb;
+            uint64_t lhs_limb;
+            uint64_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_u64(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_u64(&res_limb, tmp_limb, overflow ? UINT64_C(1) : UINT64_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 64 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 64 / CHAR_BIT;
+#endif
+    }
+
+    /* Same pattern at 32-bit width. */
+    while (remaining_bytes >= 32 / CHAR_BIT) {
+        uint16_t limb_bits = 32 - (remaining_bytes == 32 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 32 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 32 / CHAR_BIT && is_signed) {
+            int32_t res_limb;
+            int32_t tmp_limb;
+            int32_t lhs_limb;
+            int32_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_i32(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_i32(&res_limb, tmp_limb, overflow ? INT32_C(1) : INT32_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint32_t res_limb;
+            uint32_t tmp_limb;
+            uint32_t lhs_limb;
+            uint32_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_u32(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_u32(&res_limb, tmp_limb, overflow ? UINT32_C(1) : UINT32_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 32 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 32 / CHAR_BIT;
+#endif
+    }
+
+    /* Same pattern at 16-bit width. */
+    while (remaining_bytes >= 16 / CHAR_BIT) {
+        uint16_t limb_bits = 16 - (remaining_bytes == 16 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 16 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 16 / CHAR_BIT && is_signed) {
+            int16_t res_limb;
+            int16_t tmp_limb;
+            int16_t lhs_limb;
+            int16_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_i16(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_i16(&res_limb, tmp_limb, overflow ? INT16_C(1) : INT16_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint16_t res_limb;
+            uint16_t tmp_limb;
+            uint16_t lhs_limb;
+            uint16_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_u16(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_u16(&res_limb, tmp_limb, overflow ? UINT16_C(1) : UINT16_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 16 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 16 / CHAR_BIT;
+#endif
+    }
+
+    /* Final pattern at 8-bit width; at most one iteration of leftover bytes. */
+    while (remaining_bytes >= 8 / CHAR_BIT) {
+        uint16_t limb_bits = 8 - (remaining_bytes == 8 / CHAR_BIT ? top_bits : 0);
+
+#if zig_big_endian
+        byte_offset -= 8 / CHAR_BIT;
+#endif
+
+        if (remaining_bytes == 8 / CHAR_BIT && is_signed) {
+            int8_t res_limb;
+            int8_t tmp_limb;
+            int8_t lhs_limb;
+            int8_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_i8(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_i8(&res_limb, tmp_limb, overflow ? INT8_C(1) : INT8_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        } else {
+            uint8_t res_limb;
+            uint8_t tmp_limb;
+            uint8_t lhs_limb;
+            uint8_t rhs_limb;
+            bool limb_overflow;
+
+            memcpy(&lhs_limb, &lhs_bytes[byte_offset], sizeof(lhs_limb));
+            memcpy(&rhs_limb, &rhs_bytes[byte_offset], sizeof(rhs_limb));
+            limb_overflow = zig_subo_u8(&tmp_limb, lhs_limb, rhs_limb, limb_bits);
+            overflow = limb_overflow ^ zig_subo_u8(&res_limb, tmp_limb, overflow ? UINT8_C(1) : UINT8_C(0), limb_bits);
+            memcpy(&res_bytes[byte_offset], &res_limb, sizeof(res_limb));
+        }
+
+        remaining_bytes -= 8 / CHAR_BIT;
+
+#if zig_little_endian
+        byte_offset += 8 / CHAR_BIT;
+#endif
+    }
+
+    return overflow;
+}
+
+/* Wrapping big-integer addition: performs the full overflowing add and
+ * discards the overflow flag, keeping only the stored result (presumably
+ * truncated to `bits` bits by the underlying limb helpers -- confirm). */
+static inline void zig_addw_big(void *res, const void *lhs, const void *rhs, bool is_signed, uint16_t bits) {
+    (void)zig_addo_big(res, lhs, rhs, is_signed, bits);
+}
+
+/* Wrapping big-integer subtraction: performs the full overflowing subtract
+ * and discards the overflow flag, keeping only the stored result (presumably
+ * truncated to `bits` bits by the underlying limb helpers -- confirm). */
+static inline void zig_subw_big(void *res, const void *lhs, const void *rhs, bool is_signed, uint16_t bits) {
+    (void)zig_subo_big(res, lhs, rhs, is_signed, bits);
+}
+
 static inline uint16_t zig_clz_big(const void *val, bool is_signed, uint16_t bits) {
     const uint8_t *val_bytes = val;
     uint16_t byte_offset = 0;
@@ -3092,80 +3386,6 @@ zig_msvc_atomics_128op(u128, max)
 
 #endif /* _MSC_VER && (_M_IX86 || _M_X64) */
 
-/* ============================= Vector Support ============================= */
-
-#define zig_cmp_vec(operation, operator) \
-    static inline void zig_##operation##_vec(bool *result, const void *lhs, const void *rhs, uint32_t len, bool is_signed, uint16_t elem_bits) { \
-        uint32_t index = 0; \
-        const uint8_t *lhs_ptr = lhs; \
-        const uint8_t *rhs_ptr = rhs; \
-        uint16_t elem_bytes = zig_int_bytes(elem_bits); \
- \
-        while (index < len) { \
-            result[index] = zig_cmp_big(lhs_ptr, rhs_ptr, is_signed, elem_bits) operator 0; \
-            lhs_ptr += elem_bytes; \
-            rhs_ptr += elem_bytes; \
-            index += 1; \
-        } \
-    }
-zig_cmp_vec(eq, ==)
-zig_cmp_vec(ne, !=)
-zig_cmp_vec(lt, < )
-zig_cmp_vec(le, <=)
-zig_cmp_vec(gt, > )
-zig_cmp_vec(ge, >=)
-
-static inline void zig_clz_vec(void *result, const void *val, uint32_t len, bool is_signed, uint16_t elem_bits) {
-    uint32_t index = 0;
-    const uint8_t *val_ptr = val;
-    uint16_t elem_bytes = zig_int_bytes(elem_bits);
-
-    while (index < len) {
-        uint16_t lz = zig_clz_big(val_ptr, is_signed, elem_bits);
-        if (elem_bits <= 128) {
-            ((uint8_t *)result)[index] = (uint8_t)lz;
-        } else {
-            ((uint16_t *)result)[index] = lz;
-        }
-        val_ptr += elem_bytes;
-        index += 1;
-    }
-}
-
-static inline void zig_ctz_vec(void *result, const void *val, uint32_t len, bool is_signed, uint16_t elem_bits) {
-    uint32_t index = 0;
-    const uint8_t *val_ptr = val;
-    uint16_t elem_bytes = zig_int_bytes(elem_bits);
-
-    while (index < len) {
-        uint16_t tz = zig_ctz_big(val_ptr, is_signed, elem_bits);
-        if (elem_bits <= 128) {
-            ((uint8_t *)result)[index] = (uint8_t)tz;
-        } else {
-            ((uint16_t *)result)[index] = tz;
-        }
-        val_ptr += elem_bytes;
-        index += 1;
-    }
-}
-
-static inline void zig_popcount_vec(void *result, const void *val, uint32_t len, bool is_signed, uint16_t elem_bits) {
-    uint32_t index = 0;
-    const uint8_t *val_ptr = val;
-    uint16_t elem_bytes = zig_int_bytes(elem_bits);
-
-    while (index < len) {
-        uint16_t pc = zig_popcount_big(val_ptr, is_signed, elem_bits);
-        if (elem_bits <= 128) {
-            ((uint8_t *)result)[index] = (uint8_t)pc;
-        } else {
-            ((uint16_t *)result)[index] = pc;
-        }
-        val_ptr += elem_bytes;
-        index += 1;
-    }
-}
-
 /* ======================== Special Case Intrinsics ========================= */
 
 #if (_MSC_VER && _M_X64) || defined(__x86_64__)
src/codegen/c.zig
@@ -444,8 +444,8 @@ pub const Function = struct {
         return f.object.dg.renderType(w, t);
     }
 
-    fn renderIntCast(f: *Function, w: anytype, dest_ty: Type, src: CValue, src_ty: Type, location: ValueRenderLocation) !void {
-        return f.object.dg.renderIntCast(w, dest_ty, .{ .c_value = .{ .f = f, .value = src } }, src_ty, location);
+    fn renderIntCast(f: *Function, w: anytype, dest_ty: Type, src: CValue, v: Vectorizer, src_ty: Type, location: ValueRenderLocation) !void {
+        return f.object.dg.renderIntCast(w, dest_ty, .{ .c_value = .{ .f = f, .value = src, .v = v } }, src_ty, location);
     }
 
     fn fmtIntLiteral(f: *Function, ty: Type, val: Value) !std.fmt.Formatter(formatIntLiteral) {
@@ -1593,6 +1593,7 @@ pub const DeclGen = struct {
         c_value: struct {
             f: *Function,
             value: CValue,
+            v: Vectorizer,
         },
         value: struct {
             value: Value,
@@ -1602,6 +1603,7 @@ pub const DeclGen = struct {
             switch (self.*) {
                 .c_value => |v| {
                     try v.f.writeCValue(w, v.value, location);
+                    try v.v.elem(v.f, w);
                 },
                 .value => |v| {
                     try dg.renderValue(w, value_ty, v.value, location);
@@ -1887,7 +1889,6 @@ pub const DeclGen = struct {
                 if (cty.isFloat()) cty.floatActiveBits(dg.module.getTarget()) else dg.byteSize(cty) * 8,
             }),
             .array => try writer.writeAll("big"),
-            .vector => try writer.writeAll("vec"),
         }
     }
 
@@ -1895,34 +1896,19 @@ pub const DeclGen = struct {
         switch (info) {
             .none => {},
             .bits => {
-                const cty = try dg.typeToCType(ty, .complete);
-                if (cty.castTag(.vector)) |pl| {
-                    var len_pl = Value.Payload.U64{ .base = .{ .tag = .int_u64 }, .data = pl.data.len };
-                    try writer.print(", {}", .{try dg.fmtIntLiteral(
-                        Type.u32,
-                        Value.initPayload(&len_pl.base),
-                        .FunctionArgument,
-                    )});
-                }
-
                 const target = dg.module.getTarget();
-                const elem_ty = ty.shallowElemType();
-                const elem_info = if (elem_ty.isAbiInt())
-                    elem_ty.intInfo(target)
-                else
-                    std.builtin.Type.Int{
-                        .signedness = .unsigned,
-                        .bits = @intCast(u16, elem_ty.bitSize(target)),
-                    };
-                switch (cty.tag()) {
-                    else => {},
-                    .array, .vector => try writer.print(", {}", .{elem_info.signedness == .signed}),
-                }
+                const int_info = if (ty.isAbiInt()) ty.intInfo(target) else std.builtin.Type.Int{
+                    .signedness = .unsigned,
+                    .bits = @intCast(u16, ty.bitSize(target)),
+                };
+
+                const cty = try dg.typeToCType(ty, .complete);
+                if (cty.tag() == .array) try writer.print(", {}", .{int_info.signedness == .signed});
 
-                var bits_pl = Value.Payload.U64{ .base = .{ .tag = .int_u64 }, .data = elem_info.bits };
+                var bits_pl = Value.Payload.U64{ .base = .{ .tag = .int_u64 }, .data = int_info.bits };
                 try writer.print(", {}", .{try dg.fmtIntLiteral(switch (cty.tag()) {
                     else => Type.u8,
-                    .array, .vector => Type.u16,
+                    .array => Type.u16,
                 }, Value.initPayload(&bits_pl.base), .FunctionArgument)});
             },
         }
@@ -2786,10 +2772,10 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail,
             .div_trunc, .div_exact => try airBinOp(f, inst, "/", "div_trunc", .none),
             .rem => blk: {
                 const bin_op = f.air.instructions.items(.data)[inst].bin_op;
-                const lhs_ty = f.air.typeOf(bin_op.lhs);
+                const lhs_scalar_ty = f.air.typeOf(bin_op.lhs).scalarType();
                 // For binary operations @TypeOf(lhs)==@TypeOf(rhs),
                 // so we only check one.
-                break :blk if (lhs_ty.isInt())
+                break :blk if (lhs_scalar_ty.isInt())
                     try airBinOp(f, inst, "%", "rem", .none)
                 else
                     try airBinFloatOp(f, inst, "fmod");
@@ -2833,10 +2819,10 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail,
 
             .slice => try airSlice(f, inst),
 
-            .cmp_gt  => try airCmpOp(f, inst, .gt),
-            .cmp_gte => try airCmpOp(f, inst, .gte),
-            .cmp_lt  => try airCmpOp(f, inst, .lt),
-            .cmp_lte => try airCmpOp(f, inst, .lte),
+            .cmp_gt  => try airCmpOp(f, inst, f.air.instructions.items(.data)[inst].bin_op, .gt),
+            .cmp_gte => try airCmpOp(f, inst, f.air.instructions.items(.data)[inst].bin_op, .gte),
+            .cmp_lt  => try airCmpOp(f, inst, f.air.instructions.items(.data)[inst].bin_op, .lt),
+            .cmp_lte => try airCmpOp(f, inst, f.air.instructions.items(.data)[inst].bin_op, .lte),
 
             .cmp_eq  => try airEquality(f, inst, .eq),
             .cmp_neq => try airEquality(f, inst, .neq),
@@ -2844,7 +2830,7 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail,
             .cmp_vector => blk: {
                 const ty_pl = f.air.instructions.items(.data)[inst].ty_pl;
                 const extra = f.air.extraData(Air.VectorCmp, ty_pl.payload).data;
-                break :blk try airCmpBuiltinCall(f, inst, extra, extra.compareOperator(), .operator, .bits,);
+                break :blk try airCmpOp(f, inst, extra, extra.compareOperator());
             },
             .cmp_lt_errors_len => try airCmpLtErrorsLen(f, inst),
 
@@ -3294,7 +3280,10 @@ fn airArg(f: *Function, inst: Air.Inst.Index) !CValue {
 
 fn airLoad(f: *Function, inst: Air.Inst.Index) !CValue {
     const ty_op = f.air.instructions.items(.data)[inst].ty_op;
-    const ptr_info = f.air.typeOf(ty_op.operand).ptrInfo().data;
+
+    const ptr_ty = f.air.typeOf(ty_op.operand);
+    const ptr_scalar_ty = ptr_ty.scalarType();
+    const ptr_info = ptr_scalar_ty.ptrInfo().data;
     const src_ty = ptr_info.pointee_type;
 
     if (!src_ty.hasRuntimeBitsIgnoreComptime() or
@@ -3312,16 +3301,19 @@ fn airLoad(f: *Function, inst: Air.Inst.Index) !CValue {
     const is_aligned = ptr_info.@"align" == 0 or ptr_info.@"align" >= src_ty.abiAlignment(target);
     const is_array = lowersToArray(src_ty, target);
     const need_memcpy = !is_aligned or is_array;
-    const writer = f.object.writer();
 
+    const writer = f.object.writer();
     const local = try f.allocLocal(inst, src_ty);
+    const v = try Vectorizer.start(f, inst, writer, ptr_ty);
 
     if (need_memcpy) {
         try writer.writeAll("memcpy(");
         if (!is_array) try writer.writeByte('&');
-        try f.writeCValue(writer, local, .FunctionArgument);
+        try f.writeCValue(writer, local, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(", (const char *)");
         try f.writeCValue(writer, operand, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(", sizeof(");
         try f.renderType(writer, src_ty);
         try writer.writeAll("))");
@@ -3351,6 +3343,7 @@ fn airLoad(f: *Function, inst: Air.Inst.Index) !CValue {
         const field_ty = Type.initPayload(&field_pl.base);
 
         try f.writeCValue(writer, local, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(" = (");
         try f.renderType(writer, src_ty);
         try writer.writeAll(")zig_wrap_");
@@ -3369,16 +3362,21 @@ fn airLoad(f: *Function, inst: Air.Inst.Index) !CValue {
         try f.object.dg.renderTypeForBuiltinFnName(writer, host_ty);
         try writer.writeByte('(');
         try f.writeCValueDeref(writer, operand);
+        try v.elem(f, writer);
         try writer.print(", {})", .{try f.fmtIntLiteral(bit_offset_ty, bit_offset_val)});
         if (cant_cast) try writer.writeByte(')');
         try f.object.dg.renderBuiltinInfo(writer, field_ty, .bits);
         try writer.writeByte(')');
     } else {
         try f.writeCValue(writer, local, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(" = ");
         try f.writeCValueDeref(writer, operand);
+        try v.elem(f, writer);
     }
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -3444,15 +3442,22 @@ fn airIntCast(f: *Function, inst: Air.Inst.Index) !CValue {
 
     const operand = try f.resolveInst(ty_op.operand);
     try reap(f, inst, &.{ty_op.operand});
-    const writer = f.object.writer();
+
     const inst_ty = f.air.typeOfIndex(inst);
-    const local = try f.allocLocal(inst, inst_ty);
+    const inst_scalar_ty = inst_ty.scalarType();
     const operand_ty = f.air.typeOf(ty_op.operand);
+    const scalar_ty = operand_ty.scalarType();
 
+    const writer = f.object.writer();
+    const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = ");
-    try f.renderIntCast(writer, inst_ty, operand, operand_ty, .Other);
+    try f.renderIntCast(writer, inst_scalar_ty, operand, v, scalar_ty, .Other);
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -3578,7 +3583,10 @@ fn storeUndefined(f: *Function, lhs_child_ty: Type, dest_ptr: CValue) !CValue {
 fn airStore(f: *Function, inst: Air.Inst.Index) !CValue {
     // *a = b;
     const bin_op = f.air.instructions.items(.data)[inst].bin_op;
-    const ptr_info = f.air.typeOf(bin_op.lhs).ptrInfo().data;
+
+    const ptr_ty = f.air.typeOf(bin_op.lhs);
+    const ptr_scalar_ty = ptr_ty.scalarType();
+    const ptr_info = ptr_scalar_ty.ptrInfo().data;
     if (!ptr_info.pointee_type.hasRuntimeBitsIgnoreComptime()) {
         try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
         return .none;
@@ -3601,11 +3609,13 @@ fn airStore(f: *Function, inst: Air.Inst.Index) !CValue {
         ptr_info.@"align" >= ptr_info.pointee_type.abiAlignment(target);
     const is_array = lowersToArray(ptr_info.pointee_type, target);
     const need_memcpy = !is_aligned or is_array;
-    const writer = f.object.writer();
 
     const src_val = try f.resolveInst(bin_op.rhs);
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
 
+    const writer = f.object.writer();
+    const v = try Vectorizer.start(f, inst, writer, ptr_ty);
+
     if (need_memcpy) {
         // For this memcpy to safely work we need the rhs to have the same
         // underlying type as the lhs (i.e. they must both be arrays of the same underlying type).
@@ -3626,9 +3636,11 @@ fn airStore(f: *Function, inst: Air.Inst.Index) !CValue {
 
         try writer.writeAll("memcpy((char *)");
         try f.writeCValue(writer, ptr_val, .FunctionArgument);
+        try v.elem(f, writer);
         try writer.writeAll(", ");
         if (!is_array) try writer.writeByte('&');
         try f.writeCValue(writer, array_src, .FunctionArgument);
+        try v.elem(f, writer);
         try writer.writeAll(", sizeof(");
         try f.renderType(writer, src_ty);
         try writer.writeAll("))");
@@ -3672,12 +3684,14 @@ fn airStore(f: *Function, inst: Air.Inst.Index) !CValue {
         const mask_val = Value.initPayload(&mask_pl.base);
 
         try f.writeCValueDeref(writer, ptr_val);
+        try v.elem(f, writer);
         try writer.writeAll(" = zig_or_");
         try f.object.dg.renderTypeForBuiltinFnName(writer, host_ty);
         try writer.writeAll("(zig_and_");
         try f.object.dg.renderTypeForBuiltinFnName(writer, host_ty);
         try writer.writeByte('(');
         try f.writeCValueDeref(writer, ptr_val);
+        try v.elem(f, writer);
         try writer.print(", {x}), zig_shl_", .{try f.fmtIntLiteral(host_ty, mask_val)});
         try f.object.dg.renderTypeForBuiltinFnName(writer, host_ty);
         try writer.writeByte('(');
@@ -3699,14 +3713,19 @@ fn airStore(f: *Function, inst: Air.Inst.Index) !CValue {
             try writer.writeByte(')');
         }
         try f.writeCValue(writer, src_val, .Other);
+        try v.elem(f, writer);
         if (cant_cast) try writer.writeByte(')');
         try writer.print(", {}))", .{try f.fmtIntLiteral(bit_offset_ty, bit_offset_val)});
     } else {
         try f.writeCValueDeref(writer, ptr_val);
+        try v.elem(f, writer);
         try writer.writeAll(" = ");
         try f.writeCValue(writer, src_val, .Other);
+        try v.elem(f, writer);
     }
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
+
     return .none;
 }
 
@@ -3724,51 +3743,39 @@ fn airOverflow(f: *Function, inst: Air.Inst.Index, operation: []const u8, info:
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
 
     const inst_ty = f.air.typeOfIndex(inst);
-    const vector_ty = f.air.typeOf(bin_op.lhs);
-    const scalar_ty = vector_ty.scalarType();
-    const w = f.object.writer();
+    const operand_ty = f.air.typeOf(bin_op.lhs);
+    const scalar_ty = operand_ty.scalarType();
 
+    const w = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
-
-    switch (vector_ty.zigTypeTag()) {
-        .Vector => {
-            try w.writeAll("zig_v");
-            try w.writeAll(operation);
-            try w.writeAll("o_");
-            try f.object.dg.renderTypeForBuiltinFnName(w, scalar_ty);
-            try w.writeAll("(");
-            try f.writeCValueMember(w, local, .{ .field = 1 });
-            try w.writeAll(", ");
-            try f.writeCValueMember(w, local, .{ .field = 0 });
-            try w.print(", {d}, ", .{vector_ty.vectorLen()});
-        },
-        else => {
-            try f.writeCValueMember(w, local, .{ .field = 1 });
-            try w.writeAll(" = zig_");
-            try w.writeAll(operation);
-            try w.writeAll("o_");
-            try f.object.dg.renderTypeForBuiltinFnName(w, scalar_ty);
-            try w.writeAll("(&");
-            try f.writeCValueMember(w, local, .{ .field = 0 });
-            try w.writeAll(", ");
-        },
-    }
-
+    const v = try Vectorizer.start(f, inst, w, operand_ty);
+    try f.writeCValueMember(w, local, .{ .field = 1 });
+    try v.elem(f, w);
+    try w.writeAll(" = zig_");
+    try w.writeAll(operation);
+    try w.writeAll("o_");
+    try f.object.dg.renderTypeForBuiltinFnName(w, scalar_ty);
+    try w.writeAll("(&");
+    try f.writeCValueMember(w, local, .{ .field = 0 });
+    try v.elem(f, w);
+    try w.writeAll(", ");
     try f.writeCValue(w, lhs, .FunctionArgument);
+    try v.elem(f, w);
     try w.writeAll(", ");
     try f.writeCValue(w, rhs, .FunctionArgument);
+    try v.elem(f, w);
     try f.object.dg.renderBuiltinInfo(w, scalar_ty, info);
     try w.writeAll(");\n");
+    try v.end(f, inst, w);
 
     return local;
 }
 
 fn airNot(f: *Function, inst: Air.Inst.Index) !CValue {
-    const inst_ty = f.air.typeOfIndex(inst);
-    if (inst_ty.tag() != .bool)
-        return try airUnBuiltinCall(f, inst, "not", .bits);
-
     const ty_op = f.air.instructions.items(.data)[inst].ty_op;
+    const operand_ty = f.air.typeOf(ty_op.operand);
+    const scalar_ty = operand_ty.scalarType();
+    if (scalar_ty.tag() != .bool) return try airUnBuiltinCall(f, inst, "not", .bits);
 
     if (f.liveness.isUnused(inst)) {
         try reap(f, inst, &.{ty_op.operand});
@@ -3778,14 +3785,20 @@ fn airNot(f: *Function, inst: Air.Inst.Index) !CValue {
     const op = try f.resolveInst(ty_op.operand);
     try reap(f, inst, &.{ty_op.operand});
 
+    const inst_ty = f.air.typeOfIndex(inst);
+
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
-
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = ");
     try writer.writeByte('!');
     try f.writeCValue(writer, op, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -3798,71 +3811,89 @@ fn airBinOp(
 ) !CValue {
     const bin_op = f.air.instructions.items(.data)[inst].bin_op;
     const operand_ty = f.air.typeOf(bin_op.lhs);
+    const scalar_ty = operand_ty.scalarType();
     const target = f.object.dg.module.getTarget();
-    if ((operand_ty.isInt() and operand_ty.bitSize(target) > 64) or operand_ty.isRuntimeFloat())
+    if ((scalar_ty.isInt() and scalar_ty.bitSize(target) > 64) or scalar_ty.isRuntimeFloat())
         return try airBinBuiltinCall(f, inst, operation, info);
 
+    if (f.liveness.isUnused(inst)) {
+        try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
+        return .none;
+    }
+
     const lhs = try f.resolveInst(bin_op.lhs);
     const rhs = try f.resolveInst(bin_op.rhs);
-
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
 
-    if (f.liveness.isUnused(inst)) return .none;
-
     const inst_ty = f.air.typeOfIndex(inst);
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = ");
     try f.writeCValue(writer, lhs, .Other);
+    try v.elem(f, writer);
     try writer.writeByte(' ');
     try writer.writeAll(operator);
     try writer.writeByte(' ');
     try f.writeCValue(writer, rhs, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
 
     return local;
 }
 
-fn airCmpOp(f: *Function, inst: Air.Inst.Index, operator: std.math.CompareOperator) !CValue {
-    const bin_op = f.air.instructions.items(.data)[inst].bin_op;
-
+fn airCmpOp(
+    f: *Function,
+    inst: Air.Inst.Index,
+    data: anytype,
+    operator: std.math.CompareOperator,
+) !CValue {
     if (f.liveness.isUnused(inst)) {
-        try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
+        try reap(f, inst, &.{ data.lhs, data.rhs });
         return .none;
     }
 
-    const operand_ty = f.air.typeOf(bin_op.lhs);
+    const operand_ty = f.air.typeOf(data.lhs);
+    const scalar_ty = operand_ty.scalarType();
+
     const target = f.object.dg.module.getTarget();
-    const operand_bits = operand_ty.bitSize(target);
-    if (operand_ty.isInt() and operand_bits > 64)
+    const scalar_bits = scalar_ty.bitSize(target);
+    if (scalar_ty.isInt() and scalar_bits > 64)
         return airCmpBuiltinCall(
             f,
             inst,
-            bin_op,
+            data,
             operator,
             .cmp,
-            if (operand_bits > 128) .bits else .none,
+            if (scalar_bits > 128) .bits else .none,
         );
-    if (operand_ty.isRuntimeFloat())
-        return airCmpBuiltinCall(f, inst, bin_op, operator, .operator, .none);
+    if (scalar_ty.isRuntimeFloat())
+        return airCmpBuiltinCall(f, inst, data, operator, .operator, .none);
 
     const inst_ty = f.air.typeOfIndex(inst);
-    const lhs = try f.resolveInst(bin_op.lhs);
-    const rhs = try f.resolveInst(bin_op.rhs);
-    try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
+    const lhs = try f.resolveInst(data.lhs);
+    const rhs = try f.resolveInst(data.rhs);
+    try reap(f, inst, &.{ data.lhs, data.rhs });
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = ");
     try f.writeCValue(writer, lhs, .Other);
+    try v.elem(f, writer);
     try writer.writeByte(' ');
     try writer.writeAll(compareOperatorC(operator));
     try writer.writeByte(' ');
     try f.writeCValue(writer, rhs, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
 
     return local;
 }
@@ -3974,11 +4005,14 @@ fn airPtrAddSub(f: *Function, inst: Air.Inst.Index, operator: u8) !CValue {
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
 
     const inst_ty = f.air.typeOfIndex(inst);
-    const elem_ty = inst_ty.elemType2();
+    const inst_scalar_ty = inst_ty.scalarType();
+    const elem_ty = inst_scalar_ty.elemType2();
 
     const local = try f.allocLocal(inst, inst_ty);
     const writer = f.object.writer();
+    const v = try Vectorizer.start(f, inst, writer, inst_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = ");
 
     if (elem_ty.hasRuntimeBitsIgnoreComptime()) {
@@ -3986,19 +4020,26 @@ fn airPtrAddSub(f: *Function, inst: Air.Inst.Index, operator: u8) !CValue {
         // results in a NULL pointer, or if LHS is NULL. The operation is only UB
         // if the result is NULL and then dereferenced.
         try writer.writeByte('(');
-        try f.renderType(writer, inst_ty);
+        try f.renderType(writer, inst_scalar_ty);
         try writer.writeAll(")(((uintptr_t)");
         try f.writeCValue(writer, lhs, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(") ");
         try writer.writeByte(operator);
         try writer.writeAll(" (");
         try f.writeCValue(writer, rhs, .Other);
+        try v.elem(f, writer);
         try writer.writeAll("*sizeof(");
         try f.renderType(writer, elem_ty);
         try writer.writeAll(")))");
-    } else try f.writeCValue(writer, lhs, .Initializer);
+    } else {
+        try f.writeCValue(writer, lhs, .Other);
+        try v.elem(f, writer);
+    }
 
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -4011,10 +4052,12 @@ fn airMinMax(f: *Function, inst: Air.Inst.Index, operator: u8, operation: []cons
     }
 
     const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
+
     const target = f.object.dg.module.getTarget();
-    if (inst_ty.isInt() and inst_ty.bitSize(target) > 64)
+    if (inst_scalar_ty.isInt() and inst_scalar_ty.bitSize(target) > 64)
         return try airBinBuiltinCall(f, inst, operation[1..], .none);
-    if (inst_ty.isRuntimeFloat())
+    if (inst_scalar_ty.isRuntimeFloat())
         return try airBinFloatOp(f, inst, operation);
 
     const lhs = try f.resolveInst(bin_op.lhs);
@@ -4023,19 +4066,26 @@ fn airMinMax(f: *Function, inst: Air.Inst.Index, operator: u8, operation: []cons
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, inst_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     // (lhs <> rhs) ? lhs : rhs
     try writer.writeAll(" = (");
     try f.writeCValue(writer, lhs, .Other);
+    try v.elem(f, writer);
     try writer.writeByte(' ');
     try writer.writeByte(operator);
     try writer.writeByte(' ');
     try f.writeCValue(writer, rhs, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(") ? ");
     try f.writeCValue(writer, lhs, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" : ");
     try f.writeCValue(writer, rhs, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
 
     return local;
 }
@@ -6002,30 +6052,35 @@ fn airUnBuiltinCall(
     const operand = try f.resolveInst(ty_op.operand);
     try reap(f, inst, &.{ty_op.operand});
     const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
     const operand_ty = f.air.typeOf(ty_op.operand);
+    const scalar_ty = operand_ty.scalarType();
 
-    const inst_cty = try f.typeToCType(inst_ty, .complete);
-    const ref_ret = switch (inst_cty.tag()) {
-        else => false,
-        .array, .vector => true,
-    };
+    const inst_scalar_cty = try f.typeToCType(inst_scalar_ty, .complete);
+    const ref_ret = inst_scalar_cty.tag() == .array;
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     if (!ref_ret) {
         try f.writeCValue(writer, local, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(" = ");
     }
     try writer.print("zig_{s}_", .{operation});
-    try f.object.dg.renderTypeForBuiltinFnName(writer, operand_ty);
+    try f.object.dg.renderTypeForBuiltinFnName(writer, scalar_ty);
     try writer.writeByte('(');
     if (ref_ret) {
         try f.writeCValue(writer, local, .FunctionArgument);
+        try v.elem(f, writer);
         try writer.writeAll(", ");
     }
     try f.writeCValue(writer, operand, .FunctionArgument);
-    try f.object.dg.renderBuiltinInfo(writer, operand_ty, info);
+    try v.elem(f, writer);
+    try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info);
     try writer.writeAll(");\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6047,21 +6102,38 @@ fn airBinBuiltinCall(
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
 
     const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
     const operand_ty = f.air.typeOf(bin_op.lhs);
+    const scalar_ty = operand_ty.scalarType();
+
+    const inst_scalar_cty = try f.typeToCType(inst_scalar_ty, .complete);
+    const ref_ret = inst_scalar_cty.tag() == .array;
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
-    try f.writeCValue(writer, local, .Other);
-    try writer.writeAll(" = zig_");
-    try writer.writeAll(operation);
-    try writer.writeByte('_');
-    try f.object.dg.renderTypeForBuiltinFnName(writer, operand_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
+    if (!ref_ret) {
+        try f.writeCValue(writer, local, .Other);
+        try v.elem(f, writer);
+        try writer.writeAll(" = ");
+    }
+    try writer.print("zig_{s}_", .{operation});
+    try f.object.dg.renderTypeForBuiltinFnName(writer, scalar_ty);
     try writer.writeByte('(');
+    if (ref_ret) {
+        try f.writeCValue(writer, local, .FunctionArgument);
+        try v.elem(f, writer);
+        try writer.writeAll(", ");
+    }
     try f.writeCValue(writer, lhs, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(", ");
     try f.writeCValue(writer, rhs, .FunctionArgument);
-    try f.object.dg.renderBuiltinInfo(writer, operand_ty, info);
+    try v.elem(f, writer);
+    try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info);
     try writer.writeAll(");\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6073,45 +6145,56 @@ fn airCmpBuiltinCall(
     operation: enum { cmp, operator },
     info: BuiltinInfo,
 ) !CValue {
-    const inst_ty = f.air.typeOfIndex(inst);
-    const operand_ty = f.air.typeOf(data.lhs);
+    if (f.liveness.isUnused(inst)) {
+        try reap(f, inst, &.{ data.lhs, data.rhs });
+        return .none;
+    }
 
     const lhs = try f.resolveInst(data.lhs);
     const rhs = try f.resolveInst(data.rhs);
     try reap(f, inst, &.{ data.lhs, data.rhs });
 
-    const inst_cty = try f.typeToCType(inst_ty, .complete);
-    const ref_ret = switch (inst_cty.tag()) {
-        else => false,
-        .array, .vector => true,
-    };
+    const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
+    const operand_ty = f.air.typeOf(data.lhs);
+    const scalar_ty = operand_ty.scalarType();
+
+    const inst_scalar_cty = try f.typeToCType(inst_scalar_ty, .complete);
+    const ref_ret = inst_scalar_cty.tag() == .array;
 
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     if (!ref_ret) {
         try f.writeCValue(writer, local, .Other);
+        try v.elem(f, writer);
         try writer.writeAll(" = ");
     }
     try writer.print("zig_{s}_", .{switch (operation) {
         else => @tagName(operation),
         .operator => compareOperatorAbbrev(operator),
     }});
-    try f.object.dg.renderTypeForBuiltinFnName(writer, operand_ty);
+    try f.object.dg.renderTypeForBuiltinFnName(writer, scalar_ty);
     try writer.writeByte('(');
     if (ref_ret) {
         try f.writeCValue(writer, local, .FunctionArgument);
+        try v.elem(f, writer);
         try writer.writeAll(", ");
     }
     try f.writeCValue(writer, lhs, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(", ");
     try f.writeCValue(writer, rhs, .FunctionArgument);
-    try f.object.dg.renderBuiltinInfo(writer, operand_ty, info);
+    try v.elem(f, writer);
+    try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info);
     try writer.writeByte(')');
     if (!ref_ret) try writer.print(" {s} {}", .{
         compareOperatorC(operator),
         try f.fmtIntLiteral(Type.initTag(.i32), Value.zero),
     });
     try writer.writeAll(";\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6498,65 +6581,35 @@ fn airReduce(f: *Function, inst: Air.Inst.Index) !CValue {
     const operand = try f.resolveInst(reduce.operand);
     try reap(f, inst, &.{reduce.operand});
     const operand_ty = f.air.typeOf(reduce.operand);
-    const vector_len = operand_ty.vectorLen();
     const writer = f.object.writer();
 
-    const Op = union(enum) {
-        call_fn: []const u8,
+    const op: union(enum) {
+        float_op: []const u8,
+        builtin: []const u8,
         infix: []const u8,
         ternary: []const u8,
-    };
-    var fn_name_buf: [64]u8 = undefined;
-    const op: Op = switch (reduce.operation) {
+    } = switch (reduce.operation) {
         .And => .{ .infix = " &= " },
         .Or => .{ .infix = " |= " },
         .Xor => .{ .infix = " ^= " },
         .Min => switch (scalar_ty.zigTypeTag()) {
-            .Int => Op{ .ternary = " < " },
-            .Float => op: {
-                const float_bits = scalar_ty.floatBits(target);
-                break :op Op{
-                    .call_fn = std.fmt.bufPrintZ(&fn_name_buf, "{s}fmin{s}", .{
-                        libcFloatPrefix(float_bits), libcFloatSuffix(float_bits),
-                    }) catch unreachable,
-                };
-            },
+            .Int => .{ .ternary = " < " },
+            .Float => .{ .float_op = "fmin" },
             else => unreachable,
         },
         .Max => switch (scalar_ty.zigTypeTag()) {
-            .Int => Op{ .ternary = " > " },
-            .Float => op: {
-                const float_bits = scalar_ty.floatBits(target);
-                break :op Op{
-                    .call_fn = std.fmt.bufPrintZ(&fn_name_buf, "{s}fmax{s}", .{
-                        libcFloatPrefix(float_bits), libcFloatSuffix(float_bits),
-                    }) catch unreachable,
-                };
-            },
+            .Int => .{ .ternary = " > " },
+            .Float => .{ .float_op = "fmax" },
             else => unreachable,
         },
         .Add => switch (scalar_ty.zigTypeTag()) {
-            .Int => Op{ .infix = " += " },
-            .Float => op: {
-                const float_bits = scalar_ty.floatBits(target);
-                break :op Op{
-                    .call_fn = std.fmt.bufPrintZ(&fn_name_buf, "__add{s}f3", .{
-                        compilerRtFloatAbbrev(float_bits),
-                    }) catch unreachable,
-                };
-            },
+            .Int => .{ .infix = " += " },
+            .Float => .{ .builtin = "add" },
             else => unreachable,
         },
         .Mul => switch (scalar_ty.zigTypeTag()) {
-            .Int => Op{ .infix = " *= " },
-            .Float => op: {
-                const float_bits = scalar_ty.floatBits(target);
-                break :op Op{
-                    .call_fn = std.fmt.bufPrintZ(&fn_name_buf, "__mul{s}f3", .{
-                        compilerRtFloatAbbrev(float_bits),
-                    }) catch unreachable,
-                };
-            },
+            .Int => .{ .infix = " *= " },
+            .Float => .{ .builtin = "mul" },
             else => unreachable,
         },
     };
@@ -6572,75 +6625,94 @@ fn airReduce(f: *Function, inst: Air.Inst.Index) !CValue {
     //     }
     //     break :reduce accum;
     //   }
-    const it = try f.allocLocal(inst, Type.usize);
-    try f.writeCValue(writer, it, .Other);
-    try writer.writeAll(" = 0;\n");
 
     const accum = try f.allocLocal(inst, scalar_ty);
     try f.writeCValue(writer, accum, .Other);
     try writer.writeAll(" = ");
 
-    const init_val = switch (reduce.operation) {
-        .And, .Or, .Xor, .Add => "0",
+    var arena = std.heap.ArenaAllocator.init(f.object.dg.gpa);
+    defer arena.deinit();
+
+    const ExpectedContents = union {
+        u: Value.Payload.U64,
+        i: Value.Payload.I64,
+        f16: Value.Payload.Float_16,
+        f32: Value.Payload.Float_32,
+        f64: Value.Payload.Float_64,
+        f80: Value.Payload.Float_80,
+        f128: Value.Payload.Float_128,
+    };
+    var stack align(@alignOf(ExpectedContents)) =
+        std.heap.stackFallback(@sizeOf(ExpectedContents), arena.allocator());
+
+    try f.object.dg.renderValue(writer, scalar_ty, switch (reduce.operation) {
+        .Or, .Xor, .Add => Value.zero,
+        .And => switch (scalar_ty.zigTypeTag()) {
+            .Bool => Value.one,
+            else => switch (scalar_ty.intInfo(target).signedness) {
+                .unsigned => try scalar_ty.maxInt(stack.get(), target),
+                .signed => Value.negative_one,
+            },
+        },
         .Min => switch (scalar_ty.zigTypeTag()) {
-            .Int => "TODO_intmax",
-            .Float => "TODO_nan",
+            .Bool => Value.one,
+            .Int => try scalar_ty.maxInt(stack.get(), target),
+            .Float => try Value.floatToValue(std.math.nan(f128), stack.get(), scalar_ty, target),
             else => unreachable,
         },
         .Max => switch (scalar_ty.zigTypeTag()) {
-            .Int => "TODO_intmin",
-            .Float => "TODO_nan",
+            .Bool => Value.zero,
+            .Int => try scalar_ty.minInt(stack.get(), target),
+            .Float => try Value.floatToValue(std.math.nan(f128), stack.get(), scalar_ty, target),
             else => unreachable,
         },
-        .Mul => "1",
-    };
-    try writer.writeAll(init_val);
-    try writer.writeAll(";");
-    try f.object.indent_writer.insertNewline();
-    try writer.writeAll("for (;");
-    try f.writeCValue(writer, it, .Other);
-    try writer.print("<{d};++", .{vector_len});
-    try f.writeCValue(writer, it, .Other);
-    try writer.writeAll(") ");
-    try f.writeCValue(writer, accum, .Other);
+        .Mul => Value.one,
+    }, .Initializer);
+    try writer.writeAll(";\n");
 
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
+    try f.writeCValue(writer, accum, .Other);
     switch (op) {
-        .call_fn => |fn_name| {
-            try writer.print(" = {s}(", .{fn_name});
+        .float_op => |operation| {
+            try writer.writeAll(" = zig_libc_name_");
+            try f.object.dg.renderTypeForBuiltinFnName(writer, scalar_ty);
+            try writer.print("({s})(", .{operation});
+            try f.writeCValue(writer, accum, .FunctionArgument);
+            try writer.writeAll(", ");
+            try f.writeCValue(writer, operand, .Other);
+            try v.elem(f, writer);
+            try writer.writeByte(')');
+        },
+        .builtin => |operation| {
+            try writer.print(" = zig_{s}_", .{operation});
+            try f.object.dg.renderTypeForBuiltinFnName(writer, scalar_ty);
+            try writer.writeByte('(');
             try f.writeCValue(writer, accum, .FunctionArgument);
             try writer.writeAll(", ");
             try f.writeCValue(writer, operand, .Other);
-            try writer.writeAll("[");
-            try f.writeCValue(writer, it, .Other);
-            try writer.writeAll("])");
+            try v.elem(f, writer);
+            try writer.writeByte(')');
         },
         .infix => |ass| {
             try writer.writeAll(ass);
             try f.writeCValue(writer, operand, .Other);
-            try writer.writeAll("[");
-            try f.writeCValue(writer, it, .Other);
-            try writer.writeAll("]");
+            try v.elem(f, writer);
         },
         .ternary => |cmp| {
             try writer.writeAll(" = ");
             try f.writeCValue(writer, accum, .Other);
             try writer.writeAll(cmp);
             try f.writeCValue(writer, operand, .Other);
-            try writer.writeAll("[");
-            try f.writeCValue(writer, it, .Other);
-            try writer.writeAll("] ? ");
+            try v.elem(f, writer);
+            try writer.writeAll(" ? ");
             try f.writeCValue(writer, accum, .Other);
             try writer.writeAll(" : ");
             try f.writeCValue(writer, operand, .Other);
-            try writer.writeAll("[");
-            try f.writeCValue(writer, it, .Other);
-            try writer.writeAll("]");
+            try v.elem(f, writer);
         },
     }
-
     try writer.writeAll(";\n");
-
-    try freeLocal(f, inst, it.new_local, 0);
+    try v.end(f, inst, writer);
 
     return accum;
 }
@@ -6774,7 +6846,7 @@ fn airAggregateInit(f: *Function, inst: Air.Inst.Index) !CValue {
                     try writer.writeByte('(');
 
                     if (inst_ty.isAbiInt() and (field_ty.isAbiInt() or field_ty.isPtrAtRuntime())) {
-                        try f.renderIntCast(writer, inst_ty, element, field_ty, .FunctionArgument);
+                        try f.renderIntCast(writer, inst_ty, element, .{}, field_ty, .FunctionArgument);
                     } else {
                         try writer.writeByte('(');
                         try f.renderType(writer, inst_ty);
@@ -6916,7 +6988,6 @@ fn airWasmMemoryGrow(f: *Function, inst: Air.Inst.Index) !CValue {
 }
 
 fn airFloatNeg(f: *Function, inst: Air.Inst.Index) !CValue {
-    const inst_ty = f.air.typeOfIndex(inst);
     const un_op = f.air.instructions.items(.data)[inst].un_op;
     if (f.liveness.isUnused(inst)) {
         try reap(f, inst, &.{un_op});
@@ -6925,16 +6996,23 @@ fn airFloatNeg(f: *Function, inst: Air.Inst.Index) !CValue {
 
     const operand = try f.resolveInst(un_op);
     try reap(f, inst, &.{un_op});
+
     const operand_ty = f.air.typeOf(un_op);
+    const scalar_ty = operand_ty.scalarType();
 
     const writer = f.object.writer();
-    const local = try f.allocLocal(inst, inst_ty);
+    const local = try f.allocLocal(inst, operand_ty);
+    const v = try Vectorizer.start(f, inst, writer, operand_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = zig_neg_");
-    try f.object.dg.renderTypeForBuiltinFnName(writer, operand_ty);
+    try f.object.dg.renderTypeForBuiltinFnName(writer, scalar_ty);
     try writer.writeByte('(');
     try f.writeCValue(writer, operand, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(");\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6944,19 +7022,28 @@ fn airUnFloatOp(f: *Function, inst: Air.Inst.Index, operation: []const u8) !CVal
         try reap(f, inst, &.{un_op});
         return .none;
     }
+
     const operand = try f.resolveInst(un_op);
     try reap(f, inst, &.{un_op});
-    const writer = f.object.writer();
+
     const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
+
+    const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, inst_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = zig_libc_name_");
-    try f.object.dg.renderTypeForBuiltinFnName(writer, inst_ty);
+    try f.object.dg.renderTypeForBuiltinFnName(writer, inst_scalar_ty);
     try writer.writeByte('(');
     try writer.writeAll(operation);
     try writer.writeAll(")(");
     try f.writeCValue(writer, operand, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(");\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6966,23 +7053,32 @@ fn airBinFloatOp(f: *Function, inst: Air.Inst.Index, operation: []const u8) !CVa
         try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
         return .none;
     }
+
     const lhs = try f.resolveInst(bin_op.lhs);
     const rhs = try f.resolveInst(bin_op.rhs);
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs });
 
-    const writer = f.object.writer();
     const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
+
+    const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, inst_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = zig_libc_name_");
-    try f.object.dg.renderTypeForBuiltinFnName(writer, inst_ty);
+    try f.object.dg.renderTypeForBuiltinFnName(writer, inst_scalar_ty);
     try writer.writeByte('(');
     try writer.writeAll(operation);
     try writer.writeAll(")(");
     try f.writeCValue(writer, lhs, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(", ");
     try f.writeCValue(writer, rhs, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(");\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -6993,23 +7089,34 @@ fn airMulAdd(f: *Function, inst: Air.Inst.Index) !CValue {
         try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs, pl_op.operand });
         return .none;
     }
-    const inst_ty = f.air.typeOfIndex(inst);
+
     const mulend1 = try f.resolveInst(bin_op.lhs);
     const mulend2 = try f.resolveInst(bin_op.rhs);
     const addend = try f.resolveInst(pl_op.operand);
     try reap(f, inst, &.{ bin_op.lhs, bin_op.rhs, pl_op.operand });
+
+    const inst_ty = f.air.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType();
+
     const writer = f.object.writer();
     const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorizer.start(f, inst, writer, inst_ty);
     try f.writeCValue(writer, local, .Other);
+    try v.elem(f, writer);
     try writer.writeAll(" = zig_libc_name_");
-    try f.object.dg.renderTypeForBuiltinFnName(writer, inst_ty);
+    try f.object.dg.renderTypeForBuiltinFnName(writer, inst_scalar_ty);
     try writer.writeAll("(fma)(");
     try f.writeCValue(writer, mulend1, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(", ");
     try f.writeCValue(writer, mulend2, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(", ");
     try f.writeCValue(writer, addend, .FunctionArgument);
+    try v.elem(f, writer);
     try writer.writeAll(");\n");
+    try v.end(f, inst, writer);
+
     return local;
 }
 
@@ -7510,6 +7617,47 @@ fn formatIntLiteral(
     try data.cty.renderLiteralSuffix(writer);
 }
 
+const Vectorizer = struct {
+    index: CValue = .none,
+
+    pub fn start(f: *Function, inst: Air.Inst.Index, writer: anytype, ty: Type) !Vectorizer {
+        return if (ty.zigTypeTag() == .Vector) index: {
+            var len_pl = Value.Payload.U64{ .base = .{ .tag = .int_u64 }, .data = ty.vectorLen() };
+
+            const local = try f.allocLocal(inst, Type.usize);
+
+            try writer.writeAll("for (");
+            try f.writeCValue(writer, local, .Other);
+            try writer.print(" = {d}; ", .{try f.fmtIntLiteral(Type.usize, Value.zero)});
+            try f.writeCValue(writer, local, .Other);
+            try writer.print(" < {d}; ", .{
+                try f.fmtIntLiteral(Type.usize, Value.initPayload(&len_pl.base)),
+            });
+            try f.writeCValue(writer, local, .Other);
+            try writer.print(" += {d}) {{\n", .{try f.fmtIntLiteral(Type.usize, Value.one)});
+            f.object.indent_writer.pushIndent();
+
+            break :index .{ .index = local };
+        } else .{};
+    }
+
+    pub fn elem(self: Vectorizer, f: *Function, writer: anytype) !void {
+        if (self.index != .none) {
+            try writer.writeByte('[');
+            try f.writeCValue(writer, self.index, .Other);
+            try writer.writeByte(']');
+        }
+    }
+
+    pub fn end(self: Vectorizer, f: *Function, inst: Air.Inst.Index, writer: anytype) !void {
+        if (self.index != .none) {
+            f.object.indent_writer.popIndent();
+            try writer.writeAll("}\n");
+            try freeLocal(f, inst, self.index.new_local, 0);
+        }
+    }
+};
+
 fn isByRef(ty: Type) bool {
     _ = ty;
     return false;
src/type.zig
@@ -4213,7 +4213,7 @@ pub const Type = extern union {
         };
     }
 
-    pub fn shallowElemType(child_ty: Type) Type {
+    fn shallowElemType(child_ty: Type) Type {
         return switch (child_ty.zigTypeTag()) {
             .Array, .Vector => child_ty.childType(),
             else => child_ty,
src/value.zig
@@ -3319,7 +3319,7 @@ pub const Value = extern union {
         }
     }
 
-    fn floatToValue(float: f128, arena: Allocator, dest_ty: Type, target: Target) !Value {
+    pub fn floatToValue(float: f128, arena: Allocator, dest_ty: Type, target: Target) !Value {
         switch (dest_ty.floatBits(target)) {
             16 => return Value.Tag.float_16.create(arena, @floatCast(f16, float)),
             32 => return Value.Tag.float_32.create(arena, @floatCast(f32, float)),
test/behavior/bitreverse.zig
@@ -96,7 +96,6 @@ fn vector8() !void {
 
 test "bitReverse vectors u8" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
@@ -115,7 +114,6 @@ fn vector16() !void {
 
 test "bitReverse vectors u16" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
@@ -134,7 +132,6 @@ fn vector24() !void {
 
 test "bitReverse vectors u24" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
test/behavior/byteswap.zig
@@ -62,7 +62,6 @@ fn vector8() !void {
 
 test "@byteSwap vectors u8" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
@@ -81,7 +80,6 @@ fn vector16() !void {
 
 test "@byteSwap vectors u16" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
@@ -100,7 +98,6 @@ fn vector24() !void {
 
 test "@byteSwap vectors u24" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
test/behavior/cast.zig
@@ -598,7 +598,6 @@ test "cast *[1][*]const u8 to [*]const ?[*]const u8" {
 
 test "vector casts" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
test/behavior/floatop.zig
@@ -141,7 +141,6 @@ fn testSqrt() !void {
 
 test "@sqrt with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -234,7 +233,6 @@ fn testSin() !void {
 
 test "@sin with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -275,7 +273,6 @@ fn testCos() !void {
 
 test "@cos with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -315,7 +312,6 @@ fn testExp() !void {
 
 test "@exp with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -355,7 +351,6 @@ fn testExp2() !void {
 
 test "@exp2" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -409,7 +404,6 @@ test "@log with @vectors" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
 
     {
@@ -447,7 +441,6 @@ test "@log2 with vectors" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     // https://github.com/ziglang/zig/issues/13681
     if (builtin.zig_backend == .stage2_llvm and
         builtin.cpu.arch == .aarch64 and
@@ -491,7 +484,6 @@ test "@log10 with vectors" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
 
     comptime try testLog10WithVectors();
     try testLog10WithVectors();
@@ -537,7 +529,6 @@ fn testFabs() !void {
 
 test "@fabs with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -660,7 +651,6 @@ fn testFloor() !void {
 
 test "@floor with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -754,7 +744,6 @@ fn testCeil() !void {
 
 test "@ceil with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -848,7 +837,6 @@ fn testTrunc() !void {
 
 test "@trunc with vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
test/behavior/maximum_minimum.zig
@@ -25,7 +25,6 @@ test "@max" {
 test "@max on vectors" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -75,7 +74,6 @@ test "@min for vectors" {
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
 
     const S = struct {
test/behavior/muladd.zig
@@ -100,7 +100,6 @@ fn vector16() !void {
 }
 
 test "vector f16" {
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -124,7 +123,6 @@ fn vector32() !void {
 }
 
 test "vector f32" {
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -148,7 +146,6 @@ fn vector64() !void {
 }
 
 test "vector f64" {
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -171,7 +168,6 @@ fn vector80() !void {
 }
 
 test "vector f80" {
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -195,7 +191,6 @@ fn vector128() !void {
 }
 
 test "vector f128" {
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
test/behavior/vector.zig
@@ -25,7 +25,6 @@ test "implicit cast vector to array - bool" {
 
 test "vector wrap operators" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -116,7 +115,6 @@ test "vector float operators" {
 
 test "vector bit operators" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -442,7 +440,6 @@ test "vector comparison operators" {
 
 test "vector division operators" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -525,7 +522,6 @@ test "vector division operators" {
 
 test "vector bitwise not operator" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -557,7 +553,6 @@ test "vector bitwise not operator" {
 
 test "vector shift operators" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -651,7 +646,6 @@ test "vector shift operators" {
 
 test "vector reduce operation" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -707,7 +701,7 @@ test "vector reduce operation" {
 
             // LLVM 11 ERROR: Cannot select type
             // https://github.com/ziglang/zig/issues/7138
-            if (builtin.target.cpu.arch != .aarch64) {
+            if (builtin.zig_backend != .stage2_llvm or builtin.target.cpu.arch != .aarch64) {
                 try testReduce(.Min, [4]i64{ 1234567, -386, 0, 3 }, @as(i64, -386));
                 try testReduce(.Min, [4]u64{ 99, 9999, 9, 99999 }, @as(u64, 9));
             }
@@ -725,7 +719,7 @@ test "vector reduce operation" {
 
             // LLVM 11 ERROR: Cannot select type
             // https://github.com/ziglang/zig/issues/7138
-            if (builtin.target.cpu.arch != .aarch64) {
+            if (builtin.zig_backend != .stage2_llvm or builtin.target.cpu.arch != .aarch64) {
                 try testReduce(.Max, [4]i64{ 1234567, -386, 0, 3 }, @as(i64, 1234567));
                 try testReduce(.Max, [4]u64{ 99, 9999, 9, 99999 }, @as(u64, 99999));
             }
@@ -773,14 +767,14 @@ test "vector reduce operation" {
 
             // LLVM 11 ERROR: Cannot select type
             // https://github.com/ziglang/zig/issues/7138
-            if (false) {
-                try testReduce(.Min, [4]f16{ -1.9, 5.1, f16_nan, 100.0 }, f16_nan);
-                try testReduce(.Min, [4]f32{ -1.9, 5.1, f32_nan, 100.0 }, f32_nan);
-                try testReduce(.Min, [4]f64{ -1.9, 5.1, f64_nan, 100.0 }, f64_nan);
-
-                try testReduce(.Max, [4]f16{ -1.9, 5.1, f16_nan, 100.0 }, f16_nan);
-                try testReduce(.Max, [4]f32{ -1.9, 5.1, f32_nan, 100.0 }, f32_nan);
-                try testReduce(.Max, [4]f64{ -1.9, 5.1, f64_nan, 100.0 }, f64_nan);
+            if (builtin.zig_backend != .stage2_llvm) {
+                try testReduce(.Min, [4]f16{ -1.9, 5.1, f16_nan, 100.0 }, @as(f16, -1.9));
+                try testReduce(.Min, [4]f32{ -1.9, 5.1, f32_nan, 100.0 }, @as(f32, -1.9));
+                try testReduce(.Min, [4]f64{ -1.9, 5.1, f64_nan, 100.0 }, @as(f64, -1.9));
+
+                try testReduce(.Max, [4]f16{ -1.9, 5.1, f16_nan, 100.0 }, @as(f16, 100.0));
+                try testReduce(.Max, [4]f32{ -1.9, 5.1, f32_nan, 100.0 }, @as(f32, 100.0));
+                try testReduce(.Max, [4]f64{ -1.9, 5.1, f64_nan, 100.0 }, @as(f64, 100.0));
             }
 
             try testReduce(.Mul, [4]f16{ -1.9, 5.1, f16_nan, 100.0 }, f16_nan);
@@ -831,7 +825,6 @@ test "mask parameter of @shuffle is comptime scope" {
 
 test "saturating add" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -863,7 +856,6 @@ test "saturating add" {
 
 test "saturating subtraction" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -886,7 +878,6 @@ test "saturating subtraction" {
 
 test "saturating multiplication" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -913,7 +904,6 @@ test "saturating multiplication" {
 
 test "saturating shift-left" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
@@ -1047,7 +1037,6 @@ test "@mulWithOverflow" {
 }
 
 test "@shlWithOverflow" {
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@@ -1202,7 +1191,6 @@ test "zero multiplicand" {
 
 test "@intCast to u0" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO